parse a simplified variant of CSV

Rationale: the purpose is to read back our own values, yet the format should be reasonably standard, to allow investigating and tweaking values with a spreadsheet. Accepted format: the first line is a header line and is used to verify the number of columns; one record per line, embedded line breaks prohibited; fields separated by comma, semicolon tolerated; fields are trimmed and may be empty; a field may be double quoted; only quoted fields may contain whitespace or comma; no escaping of quotes, i.e. no quotes within quotes.
2021-09-16 23:54:11 +02:00 · 2021-09-16 23:54:11 +02:00 · b6a2eec94c
commit b6a2eec94c
parent a523861428
3 changed files with 359 additions and 4 deletions
--- a/yoshimi-testrunner/src/util/csv.hpp
+++ b/yoshimi-testrunner/src/util/csv.hpp
@ -0,0 +1,186 @@
+/*
+ *  csv - parser and encoder
+ *
+ *  Copyright 2021, Hermann Vosseler <Ichthyostega@web.de>
+ *
+ *  This file is part of the Yoshimi-Testsuite, which is free software:
+ *  you can redistribute and/or modify it under the terms of the GNU
+ *  General Public License as published by the Free Software Foundation,
+ *  either version 3 of the License, or (at your option) any later version.
+ *
+ *  Yoshimi-Testsuite is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with yoshimi.  If not, see <http://www.gnu.org/licenses/>.
+ ***************************************************************/
+
+
+/** @file csv.hpp
+ ** Encoding and decoding of data into CSV format.
+ ** The sequence of values transformed here is part of a data table, with columns
+ ** holding data of various primitive value types; persisted CSV data is human readable,
+ ** can be checked into Git and loaded into various spreadsheet and statistics applications.
+ **
+ ** # CSV Format
+ ** Even though there is a standard defined in [RFC 4180], a plethora of format variations
+ ** can be found _in the wild._ Since the primary purpose of this implementation is _to read
+ ** back our own data,_ by deliberate choice only one single form of CSV is accepted.
+ ** - first line is a header line and used to verify the number of columns
+ ** - one record per line, embedded line breaks prohibited
+ ** - fields separated by comma, semicolon tolerated
+ ** - fields are trimmed and may be empty
+ ** - a field may be double quoted
+ ** - only quoted fields may contain whitespace or comma
+ ** - no escaping of quotes, i.e. no quotes within quotes
+ ** [RFC 4180]: https://datatracker.ietf.org/doc/html/rfc4180
+ ** 
+ ** @todo WIP as of 9/21
+ ** @see util::DataFile used for [Timing statistics](\ref TimingObservation.hpp)
+ ** 
+ */
+
+
+
+#ifndef TESTRUNNER_UTIL_CSV_HPP_
+#define TESTRUNNER_UTIL_CSV_HPP_
+
+
+//#include "util/nocopy.hpp"
+#include "util/error.hpp"
+#include "util/format.hpp"
+#include "util/regex.hpp"
+//#include "util/utils.hpp"
+
+//#include <string>
+//#include <memory>
+//#include <utility>
+//#include <vector>
+
+
+namespace util {
+
+using std::regex;
+//using std::vector;
+//using util::isnil;
+
+namespace { // Implementation details...
+    /* Regular expression building blocks used to split one CSV line into fields.
+     *  - MATCH_SINGLE_TOKEN : unquoted field; a possibly empty run of characters excluding
+     *                         comma, semicolon, double quote and whitespace; trailing
+     *                         whitespace is consumed outside of the capture group
+     *  - MATCH_QUOTED_TOKEN : field enclosed in double quotes; the capture group holds the
+     *                         text between the quotes, which can not contain a double quote
+     *  - MATCH_DELIMITER    : beginning of the text, or a comma or semicolon, followed by
+     *                         optional whitespace (non-capturing)
+     * ACCEPT_FIELD matches a delimiter followed by either kind of token. Since the quoted
+     * alternative is listed first, the quoted content is capture group 1, while the content
+     * of an unquoted token is capture group 2.
+     */
+    const string MATCH_SINGLE_TOKEN {R"~(([^,;"\s]*)\s*)~"};
+    const string MATCH_QUOTED_TOKEN {R"~("([^"]*)"\s*)~"};
+    const string MATCH_DELIMITER    {R"~((?:^|,|;)\s*)~"};
+
+    const regex ACCEPT_FIELD{ MATCH_DELIMITER + "(?:"+ MATCH_QUOTED_TOKEN +"|"+ MATCH_SINGLE_TOKEN +")"
+                            , regex::optimize};
+
+    /**
+     * Render a single value as text for use as a CSV field.
+     * The generic variant delegates to util::str(); the overload for `string`
+     * wraps the text in double quotes, without escaping any quote characters
+     * contained in the value.
+     */
+    template<typename VAL>
+    inline string format4Csv(VAL const& val)
+    {
+        return util::str(val);
+    }
+    inline string format4Csv(string const& val)
+    {
+        return '"'+val+'"';
+    }
+
+}//(End)Implementation
+
+
+/**
+ * Parser to split one line of CSV data into fields.
+ * @remarks iterator-like throw-away object
+ *  - the `bool` evaluation indicates more fields to extract
+ *  - dereference to get the field as string
+ *  - increment to move to the next field
+ * @throws error::Invalid on CSV format violation
+ * @warning the line to parse is stored as reference, not copied; the string passed
+ *       to the constructor thus must outlive this parser object.
+ * @note fields are required to follow each other without gap: isValid() only accepts
+ *       a match starting exactly at the position where the previously consumed match
+ *       ended. When this check fails, the `bool` evaluation yields `false`, while
+ *       dereferencing or incrementing raises error::Invalid through fail().
+ */
+class CsvLine
+    : util::NonCopyable
+    , MatchSeq
+{
+    string const& line_;   // the line being parsed (referenced, not owned)
+    size_t   field_;       // number of fields consumed through operator++ thus far
+    iterator curr_;        // position within the sequence of regular expression matches
+    size_t   pos_;         // offset within line_ directly behind the previously consumed match
+
+public:
+    CsvLine(string const& line)
+        : MatchSeq(line, ACCEPT_FIELD)
+        , line_{line}
+        , field_{0}
+        , curr_{MatchSeq::begin()}
+        , pos_{0}
+    { }
+
+    explicit operator bool()
+    {
+        return isValid();
+    }
+
+    string operator*()
+    {
+        if (not isValid()) fail();
+        auto& mat = *curr_;
+        return mat[2].matched? mat[2]   // group 2: content of an unquoted token
+                             : mat[1];  // group 1: text between the quotes of a quoted token
+    }
+
+    void operator++()
+    {
+        if (not isValid())
+            fail();
+        pos_ = curr_->position() + curr_->length();  // offset directly behind the current match
+        ++curr_;
+        if (pos_ < line_.length() and not isValid())  // text remains, yet no field follows seamlessly
+            fail();
+        ++field_;
+    }
+
+    size_t getParsedFieldCnt()
+    {
+        return field_;
+    }
+
+    bool isValid()
+    {
+        return curr_ != end()
+           and curr_->position() == pos_   // no unmatched text in front of this field
+           and not curr_->empty();
+    }
+
+    void fail()   // always throws error::Invalid; the message depends on the parse state
+    {
+        if (curr_ == end())                 // no further match available...
+            if (pos_ >= line_.length())     // ...and the line was consumed completely
+                throw error::Invalid("Only "+formatVal(field_)+" data fields. Line:"+line_);
+            else
+                throw error::Invalid("Garbage after last field. Line:"
+                                    +line_.substr(0,pos_)+"|↯|"+line_.substr(pos_));
+        else
+            if (pos_ != curr_->position())  // next match starts beyond the expected position
+                throw error::Invalid("Garbage before field("+formatVal(field_+1)+"):"
+                                    +line_.substr(0,pos_)+"|↯|"+line_.substr(pos_));
+            else
+                throw error::Invalid("CSV parse floundered. Line:"+line_);
+    }
+};
+
+
+
+/**
+ * Format and append a data value to a CSV string representation
+ * @param csv the line built thus far; extended in place
+ * @param val the value to append, rendered through format4Csv()
+ * @remark a comma is inserted as separator in front of the new field,
+ *         unless `csv` is still empty, which is taken to indicate the first field.
+ */
+template<typename VAL>
+inline void appendCsvField(string& csv, VAL const& val)
+{
+    csv += (0 == csv.length()? "":",")
+         + format4Csv(val);
+}
+
+
+
+} // namespace util
+#endif /*TESTRUNNER_UTIL_CSV_HPP_*/
--- a/yoshimi-testrunner/src/util/data.hpp
+++ b/yoshimi-testrunner/src/util/data.hpp
@ -0,0 +1,173 @@
+/*
+ *  data - read and write a table with CSV data
+ *
+ *  Copyright 2021, Hermann Vosseler <Ichthyostega@web.de>
+ *
+ *  This file is part of the Yoshimi-Testsuite, which is free software:
+ *  you can redistribute and/or modify it under the terms of the GNU
+ *  General Public License as published by the Free Software Foundation,
+ *  either version 3 of the License, or (at your option) any later version.
+ *
+ *  Yoshimi-Testsuite is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with yoshimi.  If not, see <http://www.gnu.org/licenses/>.
+ ***************************************************************/
+
+
+/** @file data.hpp
+ ** Manage a table with time sequence data, stored persistently as CSV.
+ ** The Yoshimi Testsuite captures timing data, to detect the possible performance
+ ** impact of code reworking. Due to the statistical nature of timing measurements
+ ** and the dependency on the run environment, it is not sufficient just to rely on
+ ** a single measurement to establish the runtime characteristics of a given test;
+ ** rather, the statistical trend of the timings observed over several consecutive
+ ** runs of the Testsuite must be established. Short of using a database, a modest
+ ** amount of numeric data can be maintained in CSV files, which also allows for
+ ** further manual evaluation within a spreadsheet or statistics application.
+ ** 
+ ** As a fundamental building block, this header provides a data table template
+ ** with a flexible column configuration to hold arbitrary, explicitly typed values.
+ ** This solution is statically typed and does not carry any runtime type information;
+ ** the actual data table object is then defined and accessed by means of _accessor_
+ ** components for each column of data. A tuple of _current values_ corresponding to
+ ** the most recent row of data can be accessed directly through these sub-components.
+ ** 
+ ** @todo WIP as of 9/21
+ ** @see TimingObservation.hpp usage
+ ** 
+ */
+
+
+
+#ifndef TESTRUNNER_UTIL_DATA_HPP_
+#define TESTRUNNER_UTIL_DATA_HPP_
+
+
+#include "util/nocopy.hpp"
+#include "util/error.hpp"
+#include "util/utils.hpp"
+#include "util/csv.hpp"
+
+//#include <string>
+//#include <memory>
+#include <utility>
+#include <vector>
+#include <tuple>
+
+
+namespace util {
+
+using std::tuple;
+using std::vector;
+using util::isnil;
+
+
+
+/**
+ * perform some arbitrary operation on each element of a tuple.
+ * @param tuple the tuple to visit; declared as rvalue reference, thus call sites need
+ *       to pass a temporary (or use std::move). Within this function the parameter is
+ *       a named lvalue, and the elements are handed to the functor as lvalue references.
+ * @param fun functor invoked once for each element, in the order of the tuple positions
+ * @note the given functor must be generic, since each position of the tuple
+ *       may hold a data element of a different type.
+ * @remark credits to David Vandevoorde (member of C++ committee) for using
+ *       std::apply to unpack the tuple's contents into an argument pack and
+ *       then using a fold expression with the comma operator.
+ */
+template<class FUN, typename...ELMS>
+void forEach(tuple<ELMS...>&& tuple, FUN fun)
+{
+    std::apply([&fun](auto&... elms)
+                    {
+                        (fun(elms), ...);
+                    }
+              ,tuple);
+}
+
+/**
+ * A single column of the data table: a header ID plus the sequence of values.
+ * Reading, converting or assigning through the Column object always addresses
+ * the last element of `data`.
+ * @tparam VAL type of the values stored in this column
+ * @throws error::State from get(), and thus also from the conversion and the
+ *         assignment operator, whenever `isnil(data)` holds (no rows yet)
+ */
+template<typename VAL>
+struct Column : util::NonCopyable
+{
+    string header;
+    vector<VAL> data;
+
+
+    Column(string headerID)
+        : header{headerID}
+        , data{}
+    { }
+
+    VAL& get()
+    {
+        if (isnil(data))
+            throw error::State("No rows in DataTable yet");
+        return data.back();
+    }
+
+    operator VAL&()
+    {
+        return get();
+    }
+
+    template<typename X>
+    VAL& operator=(X&& newVal)
+    {
+        return get() = std::forward<X>(newVal);  // assigns into the last element of data
+    }
+};
+
+
+/**
+ * Data table built on top of a column definition `TAB`.
+ * @tparam TAB base class defining the columns; it must provide `allColumns()`, yielding
+ *         a std::tuple as temporary (since forEach() takes an rvalue), whose elements
+ *         expose the members `data` and `header` -- presumably references to Column
+ *         instances; confirm against the usage in TimingObservation.hpp
+ * @remark the constructor immediately invokes newRow(), so each column holds
+ *         one element right after construction.
+ * @note NOTE(review): getStorage() returns through `decltype(auto)` an unparenthesised
+ *       member access expression; for such an expression the declared type of the member
+ *       is deduced, so as written a copy of the column's vector is returned rather than
+ *       a reference -- confirm whether this is intended.
+ */
+template<class TAB>
+class DataFile
+    : public TAB
+    , util::NonCopyable
+{
+
+public:
+    DataFile()
+    {
+        newRow();
+    }
+
+    void newRow()
+    {
+        forEach(TAB::allColumns(),
+                [](auto& col)
+                {
+                    col.data.resize(col.data.size()+1);  // append one value-initialised element
+                });
+    }
+
+    void reserve(size_t expectedCapacity)
+    {
+        forEach(TAB::allColumns(),
+                [=](auto& col)
+                {
+                    col.data.reserve(expectedCapacity);
+                });
+    }
+
+    template<size_t i>
+    decltype(auto) getCol()
+    {
+        return std::get<i>(TAB::allColumns());
+    }
+
+    template<size_t i>
+    decltype(auto) getStorage()
+    {
+        return getCol<i>().data;
+    }
+    template<size_t i>
+    string getHeader()
+    {
+        return getCol<i>().header;
+    }
+};
+
+
+
+} // namespace util
+#endif /*TESTRUNNER_UTIL_DATA_HPP_*/
--- a/yoshimi-testrunner/src/util/regex.hpp
+++ b/yoshimi-testrunner/src/util/regex.hpp
@ -52,10 +52,6 @@ struct MatchSeq
    iterator end()   { return iterator(); }
 };

-/**
- */
-MatchSeq allMatches(std::regex regex);
-

 }//(End)namespace util
 #endif /*TESTRUNNER_UTIL_PARSE_HPP_*/