lumiera_/src/lib/stat/csv.hpp
Ichthyostega 8e33194882 Scheduler-test: settle definition of specific test setup and data
After a lot of further tinkering, seemingly arriving at a
somewhat satisfactory solution for the layout and arrangement of
test definitions and especially the table for measurement series.

While the complete setup indeed remains fragile, and complexity is more
hidden than reduced, the pragmatic compromise established yesterday
at least allows reducing the amount of boilerplate in the test or
measurement setup, so that the actual specifics stand out clearly.

----

As an aside, the usage of the `DataFile` type imported from Yoshimi-test
recently was re-shaped more towards a generic handling of tabular data with
CSV storage option; thus renaming the type now into `DataTable`.
Persistent storage is now just one option, while another usage pattern
compounds observation data into table rows, which are then directly
rendered into a CSV string, e.g. for visualisation as Gnuplot graph.
2024-04-08 03:58:15 +02:00

340 lines
9.7 KiB
C++

/*
CSV.hpp - Parser and Encoder for CSV data
Copyright (C) Lumiera.org
2022, Hermann Vosseler <Ichthyostega@web.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/** @file csv.hpp
** Encoding and decoding of data into CSV format.
** The sequence of values transformed here is part of a data table, with columns
** holding data of various primitive value types; persisted CSV data is human readable,
** can be checked into Git and loaded into various spreadsheet and statistics applications.
**
** # CSV Format
** Even though a standard is defined in [RFC 4180], a plethora of format variations
** can be found _in the wild._ Since the primary purpose of this implementation is _to read
** back our own data,_ only one single form of CSV is accepted, by deliberate choice.
** - first line is a header line and used to verify the number of columns
** - one record per line, embedded line breaks prohibited
** - fields separated by comma, semicolon tolerated
** - fields are trimmed and may be empty
** - a field may be double quoted
** - only quoted fields may contain whitespace, comma or semicolon
** - no escaping of quotes, i.e. no quotes within quotes
** [RFC 4180]: https://datatracker.ietf.org/doc/html/rfc4180
** @see lib::stat::DataTable
**
*/
#ifndef LIB_STAT_CSV_H
#define LIB_STAT_CSV_H
#include "lib/error.hpp"
#include "lib/null-value.hpp"
#include "lib/meta/tuple-helper.hpp"
#include "lib/format-string.hpp"
#include "lib/regex.hpp"
#include <initializer_list>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace lib {
namespace stat {
namespace error = lumiera::error;
using util::_Fmt;
using util::toString;
using std::string;
using std::regex;
namespace { // Implementation details...
/** pattern for an unquoted field: captures a (possibly empty) run of characters
 *  other than comma, semicolon, double quote or whitespace; trailing whitespace is consumed */
const string MATCH_SINGLE_TOKEN { R"~(([^,;"\s]*)\s*)~"};
/** pattern for a quoted field: captures everything between a pair of double quotes
 *  (the content itself can not contain a double quote); trailing whitespace is consumed */
const string MATCH_QUOTED_TOKEN { R"~("([^"]*)"\s*)~" };
/** pattern for a field separator (non-capturing): either the `^` anchor,
 *  or a comma, or a semicolon -- followed by optional whitespace */
const string MATCH_DELIMITER { R"~((?:^|,|;)\s*)~" };
/** matches a single comma or semicolon anywhere within the text */
const regex FIND_DELIMITER_TOKEN{"[,;]"};
/** one complete field: a separator, followed by either a quoted or an unquoted token.
 *  Sub-match 1 holds the content of a quoted token, sub-match 2 holds an unquoted token. */
const regex ACCEPT_FIELD{ MATCH_DELIMITER + "(?:"+ MATCH_QUOTED_TOKEN +"|"+ MATCH_SINGLE_TOKEN +")"
, regex::optimize};
/**
 * Render a single data value into the textual form used for a CSV field.
 * - floating point values are rendered through util::showDecimal
 * - all other arithmetic values (which includes bool) use the
 *   standard textual rendering of util::toString, without quotes
 * - any other value is rendered through util::toString and then
 *   enclosed into double quotes (the content is not escaped)
 */
template<typename VAL>
inline string
format4Csv (VAL const& val)
{
  if constexpr (std::is_floating_point_v<VAL>)
    return util::showDecimal (val);
  else
  if constexpr (std::is_arithmetic_v<VAL>)
    return util::toString (val);            // plain rendering; includes bool
  else
    return '"' + util::toString (val) + '"';
}
}//(End)Implementation
/**
 * Render a data value as CSV field and attach it at the end of the given
 * CSV string. A comma is inserted as separator, unless the string is still empty.
 */
template<typename VAL>
inline void
appendCsvField (string& csv, VAL const& val)
{
  string field = format4Csv (val);   // rendered up-front, before the line is touched
  if (not csv.empty())
    csv += ",";
  csv += field;
}
/**
 * A std::string, extended by the ability to be built up as one line of CSV:
 * each data value given to the constructor or appended through `+=`
 * is rendered as CSV field and attached, separated by comma.
 */
struct CSVLine
  : std::string
{
  using value_type = string;

  /** build a line of CSV from a sequence of arbitrary data values;
   *  the values are appended as fields, one after another */
  template<typename...ELMS, typename = meta::disable_if_self<CSVLine,ELMS...>>
  CSVLine (ELMS&& ...items)
  {
    auto attachField = [this](auto const& elm){ *this += elm; };
    meta::forEach (std::make_tuple (items...), attachField);
  }
  // Standard copy acceptable

  /** render the given value as CSV field and append it to this line */
  template<typename X>
  CSVLine&
  operator+= (X const& x)
  {
    stat::appendCsvField (*this, x);
    return *this;
  }
};
/**
* Wrapper to simplify notation in tests.
* Accepts data suitable for representation as CSV
* - either as an std::initializer_list<string> for pre-formatted rows
* - or as a sequence of strings (words) to form a single header line
* - or a list of strings for the header, and then a list of data tuples,
* which will be rendered into data rows in CSV format
* Since this wrapper is-a `vector<string>`, the rows can be retrieved
* directly and then rendered, or the \ref operator string() can be used
* to retrieve the complete data set in a single string of data lines.
*/
struct CSVData
: std::vector<CSVLine>
{
using VecCSV = std::vector<CSVLine>;
CSVData (std::initializer_list<string> lines)
: VecCSV(detectHeader(lines))
{ }
CSVData (std::initializer_list<string> header
,std::initializer_list<CSVLine> data)
{
reserve (data.size()+1);
appendHeaderLine(*this, header);
for (CSVLine const& line : data)
emplace_back (line);
}
// standard copy operations acceptable
operator string() const
{
std::ostringstream buffer;
for (string const& line : *this)
buffer << line << '\n';
return buffer.str();
}
private:
static bool
containsCSV (string const& line)
{
return std::regex_search (line, FIND_DELIMITER_TOKEN);
}
static void
appendHeaderLine (VecCSV& data, std::initializer_list<string> const& input)
{
CSVLine header;
for (string const& s : input)
header += s;
data.emplace_back (move(header));
}
static VecCSV
detectHeader (std::initializer_list<string> input)
{
VecCSV csv;
if (input.size() > 0 and containsCSV(*input.begin()))
{// the first line is a header => slurp in all as lines
csv.reserve (input.size());
for (string const& s : input)
csv.emplace_back (s);
}
else // combine all strings into a single header line
appendHeaderLine (csv, input);
return csv;
}
};
/**
 * Parse the textual representation of a value into the target type.
 * The conversion relies on the stream extraction operator `>>` for type `TAR`.
 * @throws error::Invalid when the stream extraction fails
 */
template<typename TAR>
inline TAR
parseAs (string const& encodedVal)
{
  TAR value;
  std::istringstream converter{encodedVal};
  bool const success = bool(converter >> value);
  if (success)
    return value;
  throw error::Invalid{_Fmt{"unable to parse \"%s\""} % encodedVal};
}
/** specialisation for bool: the interpretation of the text is delegated to util::boolVal */
template<>
inline bool
parseAs (string const& encodedBool)
{
  bool const flag = util::boolVal (encodedBool);
  return flag;
}
/** specialisation for text: the field content is used as-is, without any conversion */
template<>
inline string
parseAs (string const& encodedText)
{
  return encodedText;   // handed through unaltered; an empty field yields an empty string
}
/**
* Parser to split one line of CSV data into fields.
* @remarks iterator-like throw-away object
* - the `bool` evaluation indicates more fields to extract
* - dereference to get the field as string
* - increment to move to the next field
* @throws error::Invalid on CSV format violation
*/
class CsvParser
: public util::RegexSearchIter
{
string const& line_{};
size_t field_{0};
size_t pos_{0};
util::RegexSearchIter const& curr() const { return *this; }
util::RegexSearchIter end() const { return util::RegexSearchIter{}; }
public:
CsvParser()
: line_{lib::NullValue<string>::get()}
{ }
CsvParser (string& line) // NOTE: string and reg-exp must exist elsewhere
: RegexSearchIter(line, ACCEPT_FIELD)
, line_{line}
{ }
explicit operator bool() const
{
return isValid();
}
ENABLE_USE_IN_STD_RANGE_FOR_LOOPS (CsvParser);
string operator*() const
{
if (not isValid()) fail();
auto& mat = *curr();
return mat[2].matched? mat[2]
: mat[1];
}
void
operator++()
{
if (not isValid())
fail();
pos_ = curr()->position() + curr()->length();
util::RegexSearchIter::operator ++();
if (pos_ < line_.length() and not isValid())
fail ();
++field_;
}
size_t
getParsedFieldCnt()
{
return field_;
}
bool
isValid() const
{
return curr() != end()
and pos_ == size_t(curr()->position())
and not curr()->empty();
}
bool
isParseFail() const
{
return curr() != end()
and not isValid();
}
void
fail() const
{
if (curr() == end())
if (pos_ >= line_.length())
throw error::Invalid{_Fmt{"Only %d data fields. Line:%s"}
% field_ % line_};
else
throw error::Invalid{_Fmt{"Garbage after last field. Line:%s|↯|%s"}
% line_.substr(0,pos_) % line_.substr(pos_)};
else
if (pos_ != size_t(curr()->position()))
throw error::Invalid{_Fmt{"Garbage before field(%d):%s|↯|%s"}
% (field_+1)
% line_.substr(0,pos_) % line_.substr(pos_)};
throw error::Invalid{_Fmt{"CSV parse floundered. Line:%s"} % line_};
}
};
}} // namespace lib::stat
#endif /*LIB_STAT_CSV_H*/