parse a simplified variant of CSV

rationale: the purpose is to read back our own values,
yet it should be reasonably standard, to allow investigating
and tweaking values with a spreadsheet

 - first line is a header line and used to verify the number of columns
 - one record per line, embedded line breaks prohibited
 - fields separated by comma, semicolon tolerated
 - fields are trimmed and may be empty
 - a field may be double quoted
 - only quoted fields may contain whitespace or comma
 - no escaping of quotes, i.e. no quotes within quotes
This commit is contained in:
Fischlurch 2021-09-16 23:54:11 +02:00
parent a523861428
commit b6a2eec94c
3 changed files with 359 additions and 4 deletions

View file

@ -0,0 +1,186 @@
/*
* csv - parser and encoder
*
* Copyright 2021, Hermann Vosseler <Ichthyostega@web.de>
*
* This file is part of the Yoshimi-Testsuite, which is free software:
* you can redistribute and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* Yoshimi-Testsuite is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with yoshimi. If not, see <http://www.gnu.org/licenses/>.
***************************************************************/
/** @file csv.hpp
** Encoding and decoding of data into CSV format.
** The sequence of values transformed here is part of a data table, with columns
** holding data of various primitive value types; persisted CSV data is human readable,
** can be checked into Git and loaded into various spreadsheet and statistics applications.
**
** # CSV Format
** Even though there is a standard defined in [RFC 4180], a plethora of format variations
** can be found _in the wild._ Since the primary purpose of this implementation is _to read
** back our own data,_ by deliberate choice only one single form of CSV is accepted.
** - first line is a header line and used to verify the number of columns
** - one record per line, embedded line breaks prohibited
** - fields separated by comma, semicolon tolerated
** - fields are trimmed and may be empty
** - a field may be double quoted
** - only quoted fields may contain whitespace or comma
** - no escaping of quotes, i.e. no quotes within quotes
** [RFC 4180]: https://datatracker.ietf.org/doc/html/rfc4180
**
** @todo WIP as of 9/21
** @see util::DataFile used for [Timing statistics](\ref TimingObservation.hpp)
**
*/
#ifndef TESTRUNNER_UTIL_CSV_HPP_
#define TESTRUNNER_UTIL_CSV_HPP_
//#include "util/nocopy.hpp"
#include "util/error.hpp"
#include "util/format.hpp"
#include "util/regex.hpp"
//#include "util/utils.hpp"
//#include <string>
//#include <memory>
//#include <utility>
//#include <vector>
namespace util {
using std::regex;
//using std::vector;
//using util::isnil;
namespace { // Implementation details...

    /* Building blocks for the field matcher.
     * Note: within the combined expression, capture group 1 holds the content
     * of a quoted token (without the quotes), capture group 2 the bare token.
     */
    const string MATCH_SINGLE_TOKEN {R"~(([^,;"\s]*)\s*)~"};
    const string MATCH_QUOTED_TOKEN {R"~("([^"]*)"\s*)~"};
    const string MATCH_DELIMITER    {R"~((?:^|,|;)\s*)~"};

    /** matches one field, including its leading delimiter (or the line start) */
    const regex ACCEPT_FIELD{ MATCH_DELIMITER + "(?:"+ MATCH_QUOTED_TOKEN +"|"+ MATCH_SINGLE_TOKEN +")"
                            , regex::optimize};


    /** generic case: render the value through util::str, without quotes */
    template<typename VAL>
    inline string format4Csv(VAL const& val)
    {
        return util::str(val);
    }

    /** string values are always enclosed into double quotes (no escaping is applied) */
    inline string format4Csv(string const& val)
    {
        string quoted(1, '"');
        quoted += val;
        quoted += '"';
        return quoted;
    }

}//(End)Implementation
/**
 * Splits a single line of CSV data into its fields.
 * @remarks intended as a throw-away object with iterator-like usage
 *          - conversion to `bool` tells if a further field can be extracted
 *          - dereference yields the content of the current field as string
 *          - increment advances to the next field
 * @warning only a reference to the given line is stored (see member `line_`).
 * @throws error::Invalid on CSV format violation
 */
class CsvLine
    : util::NonCopyable
    , MatchSeq
{
    string const& line_;    ///< the line being parsed (referred, not copied)
    size_t        field_;   ///< number of fields passed by increment thus far
    iterator      curr_;    ///< the regex match representing the current field
    size_t        pos_;     ///< offset in the line where the current field must start

public:
    CsvLine(string const& line)
        : MatchSeq(line, ACCEPT_FIELD)
        , line_{line}
        , field_{0}
        , curr_{MatchSeq::begin()}
        , pos_{0}
    { }

    explicit operator bool()
    {
        return isValid();
    }

    /** @return content of the current field; quotes are not part of the result */
    string operator*()
    {
        if (not isValid())
            fail();
        auto& match = *curr_;
        // group 2 is the bare token, group 1 the content of a quoted token
        if (match[2].matched)
            return match[2];
        return match[1];
    }

    /** step to the next field and verify it starts right behind the current one */
    void operator++()
    {
        if (not isValid())
            fail();
        pos_ = curr_->position() + curr_->length();
        ++curr_;
        bool const lineExhausted = pos_ >= line_.length();
        if (not (lineExhausted or isValid()))
            fail();   // some content remains, yet it can not be matched as field
        ++field_;
    }

    size_t getParsedFieldCnt()
    {
        return field_;
    }

    /** a match is available and it starts exactly at the expected position */
    bool isValid()
    {
        if (curr_ == end())
            return false;
        if (curr_->position() != pos_)
            return false;
        return not curr_->empty();
    }

    /** diagnose the current parsing state and raise the corresponding error::Invalid;
     *  every path through this function throws */
    void fail()
    {
        bool const noMoreMatch = (curr_ == end());

        if (noMoreMatch and pos_ >= line_.length())
            throw error::Invalid("Only "+formatVal(field_)+" data fields. Line:"+line_);

        if (noMoreMatch)
            throw error::Invalid("Garbage after last field. Line:"
                                +line_.substr(0,pos_)+"|↯|"+line_.substr(pos_));

        if (pos_ != curr_->position())
            throw error::Invalid("Garbage before field("+formatVal(field_+1)+"):"
                                +line_.substr(0,pos_)+"|↯|"+line_.substr(pos_));

        throw error::Invalid("CSV parse floundered. Line:"+line_);
    }
};
/**
 * Render a data value as CSV field and attach it to the given CSV line.
 * A comma is inserted as separator, unless the line is still empty.
 * @param csv the CSV representation built up thus far; extended in place
 * @param val the value to append, rendered through `format4Csv`
 */
template<typename VAL>
inline void appendCsvField(string& csv, VAL const& val)
{
    string field = format4Csv(val);
    if (0 != csv.length())
        csv += ",";
    csv += field;
}
} // namespace util
#endif /*TESTRUNNER_UTIL_CSV_HPP_*/

View file

@ -0,0 +1,173 @@
/*
* data - read and write a table with CSV data
*
* Copyright 2021, Hermann Vosseler <Ichthyostega@web.de>
*
* This file is part of the Yoshimi-Testsuite, which is free software:
* you can redistribute and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* Yoshimi-Testsuite is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with yoshimi. If not, see <http://www.gnu.org/licenses/>.
***************************************************************/
/** @file data.hpp
** Manage a table with time sequence data, stored persistently as CSV.
** The Yoshimi Testsuite captures timing data, to detect the possible performance
** impact of code reworking. Due to the statistical nature of timing measurements
** and the dependency on the run environment, it is not sufficient just to rely on
** a single measurement to establish the runtime characteristics of a given test;
** rather, the statistical trend of the timings observed over several consecutive
** runs of the Testsuite must be established. Short of using a database, a modest
** amount of numeric data can be maintained in CSV files, which also allows for
** further manual evaluation within a spreadsheet or statistics application.
**
** As a fundamental building block, this header provides a data table template
** with a flexible column configuration to hold arbitrary, explicitly typed values.
** This solution is statically typed and does not carry any runtime type information;
** the actual data table object is then defined and accessed by means of _accessor_
** components for each column of data. A tuple of _current values_ corresponding to
** the most recent row of data can be accessed directly through these sub-components.
**
** @todo WIP as of 9/21
** @see TimingObservation.hpp usage
**
*/
#ifndef TESTRUNNER_UTIL_DATA_HPP_
#define TESTRUNNER_UTIL_DATA_HPP_
#include "util/nocopy.hpp"
#include "util/error.hpp"
#include "util/utils.hpp"
#include "util/csv.hpp"
//#include <string>
//#include <memory>
#include <utility>
#include <vector>
#include <tuple>
namespace util {
using std::tuple;
using std::vector;
using util::isnil;
/**
 * Perform some arbitrary operation on each element of a tuple.
 * The elements are visited in order and handed to the functor as lvalue,
 * which allows the functor to modify them in place.
 * @param tup the tuple to visit; can be an lvalue or a temporary (e.g. the
 *        result of `std::tie`, where the elements are references)
 * @param fun functor to invoke on each element
 * @note the given functor must be generic, since each position of the tuple
 *       may hold a data element of a different type.
 * @remark credits to David Vandevoorde (member of C++ committee) for using
 *       std::apply to unpack the tuple's contents into an argument pack and
 *       then using a fold expression with the comma operator.
 */
template<class FUN, typename...ELMS>
void forEach(std::tuple<ELMS...>& tup, FUN fun)
{
    std::apply([&fun](auto&... elms)
                    {
                        (fun(elms), ...);
                    }
              ,tup);
}

/** variant to accept a temporary tuple; it is a named lvalue within this function */
template<class FUN, typename...ELMS>
void forEach(std::tuple<ELMS...>&& tup, FUN fun)
{
    forEach(tup, std::move(fun));
}
/**
 * A single column of the data table: a header-ID plus the values of all rows.
 * The accessors operate on the value in the last row.
 */
template<typename VAL>
struct Column : util::NonCopyable
{
    string header;
    vector<VAL> data;


    Column(string headerID)
        : header{headerID}
        , data{}
    { }


    /** @return reference to the value in the last row
     *  @throws error::State when no row has been added yet */
    VAL& get()
    {
        if (not isnil(data))
            return data.back();
        throw error::State("No rows in DataTable yet");
    }

    /** allows to use the column in place of the last row's value */
    operator VAL&()
    {
        return get();
    }

    /** assign a new value to the last row */
    template<typename X>
    VAL& operator=(X&& newVal)
    {
        VAL& current = get();
        return current = std::forward<X>(newVal);
    }
};
/**
 * Data table, built on top of a column configuration given as base class.
 * @tparam TAB table definition; must provide a function `allColumns()`, whose
 *         result can be passed to `forEach` and to `std::get<i>`.
 *         NOTE(review): presumably this is a tuple of references to Column
 *         members of TAB (e.g. built by `std::tie`) -- confirm against usage.
 * @remark on construction, a first row is added to each column.
 */
template<class TAB>
class DataFile
    : public TAB
    , util::NonCopyable
{
public:
    DataFile()
    {
        newRow();
    }

    /** extend each column by one further (value-initialised) element */
    void newRow()
    {
        forEach(TAB::allColumns(),
                [](auto& col)
                    {
                        col.data.resize(col.data.size()+1);
                    });
    }

    /** pre-allocate storage for the given number of rows in each column */
    void reserve(size_t expectedCapacity)
    {
        forEach(TAB::allColumns(),
                [=](auto& col)
                    {
                        col.data.reserve(expectedCapacity);
                    });
    }

    /** @return whatever `std::get<i>` yields for the columns of TAB,
     *          with reference category preserved */
    template<size_t i>
    decltype(auto) getCol()
    {
        return std::get<i>(TAB::allColumns());
    }

    /** @return the data vector of column i.
     *  @note the parentheses around the returned expression are essential:
     *        for a plain member access, `decltype(auto)` deduces the declared
     *        type of the member, and thus would return a _copy_ of the vector.
     *        In parenthesised form, the value category is retained, yielding
     *        a reference to the storage when getCol() returns a reference.
     */
    template<size_t i>
    decltype(auto) getStorage()
    {
        return (getCol<i>().data);
    }

    /** @return copy of the header-ID of column i */
    template<size_t i>
    string getHeader()
    {
        return getCol<i>().header;
    }
};
} // namespace util
#endif /*TESTRUNNER_UTIL_DATA_HPP_*/

View file

@ -52,10 +52,6 @@ struct MatchSeq
iterator end() { return iterator(); }
};
/**
*/
MatchSeq allMatches(std::regex regex);
}//(End)namespace util
#endif /*TESTRUNNER_UTIL_PARSE_HPP_*/