complete implementation of CSV backed data table

...mostly routine after solving the tricky design challenge

- for usage, instantiate the template DataFile with a Storage record
- object is created with filename, and immediately slurps in existing data
- data storage is optimised for readability (not speed); newest value at top

Note: some kind of testcase is "hidden" in this changeset only;
next changeset will remove research-experiment.hpp
This commit is contained in:
Fischlurch 2021-09-17 17:57:55 +02:00
parent a42de3ee1b
commit 7639ac4172
4 changed files with 228 additions and 51 deletions

View file

@ -153,7 +153,7 @@ public:
bool isParseFail()
{
return curr_ == end()
return curr_ != end()
and not isValid();
}

View file

@ -19,7 +19,7 @@
/** @file data.hpp
** Manage a table with time sequence data, stored persistently as CSV.
** Manage a table with time series data, stored persistently as CSV.
** The Yoshimi Testsuite captures timing data, to detect the possible performance
** impact of code reworking. Due to the statistical nature of timing measurements
** and the dependency on the run environment, it is not sufficient just to rely on
@ -28,17 +28,42 @@
** runs of the Testsuite must be established. Short of using a database, a modest
** amount of numeric data can be maintained in CSV files, which also allows for
** further manual evaluation within a spreadsheet or statistics application.
**
**
** As a fundamental building block, this header provides a data table template
** with a flexible column configuration to hold arbitrary, explicitly typed values.
** with flexible column configuration to hold arbitrary, explicitly typed values.
** This solution is statically typed and does not carry any runtime type information;
** the actual data table object is then defined and accessed by means of _accessor_
** components for each column of data. A tuple of _current values_ corresponding to
** the most recent row of data can be accessed directly through these sub-components.
**
** @todo WIP as of 9/21
**
** # Usage
** Create an actual instantiation of the DataFile template, passing a structure
** with util::Column descriptors. You may then directly access the values of the
** _current row_ (through the column accessors) or save/load from a persistent CSV file.
** @note mandatory to define a method `allColumns()`
** \code
** struct Storage
** {
** Column<string> name{"theName"};
** Column<int> n{"counter"};
** Column<double> x{"X value"};
** Column<double> y{"Y value"};
**
** auto allColumns(){ return std::tie(name,n,x,y); }
** };
**
** using Dataz = util::DataFile<Storage>;
**
** Dataz daz("filename.csv");
**
** daz.x = 123e-4;
** daz.y = -12345e-6;
**
** std::vector<int>& counters = daz.n.data;
** \endcode
**
** @see TimingObservation.hpp usage
**
**
*/
@ -50,14 +75,16 @@
#include "util/nocopy.hpp"
#include "util/error.hpp"
#include "util/utils.hpp"
#include "util/file.hpp"
#include "util/csv.hpp"
//#include <string>
//#include <memory>
#include <type_traits>
#include <utility>
#include <fstream>
#include <vector>
#include <string>
#include <limits>
#include <deque>
#include <tuple>
@ -65,14 +92,15 @@ namespace util {
using std::tuple;
using std::vector;
using std::string;
using util::isnil;
/**
* perform some arbitrary operation on each element of a tuple.
* Helper: perform some arbitrary operation on each element of a tuple.
* @note the given functor must be generic, since each position of the tuple
* may hold a data element of a different type.
* may hold a data element of different type.
* @remark credits to David Vandevoorde (member of C++ committee) for using
* std::apply to unpack the tuple's contents into an argument pack and
* then using a fold expression with the comma operator.
@ -88,6 +116,12 @@ void forEach(tuple<ELMS...>&& tuple, FUN fun)
}
/**
* Descriptor and Accessor for a data column within a DataFile table.
* @tparam VAL type of values contained within this column;
* this type must be _default constructible_ and _copyable._
*/
template<typename VAL>
struct Column : util::NonCopyable
{
@ -102,6 +136,7 @@ struct Column : util::NonCopyable
, data{}
{ }
VAL& get()
{
if (isnil(data))
@ -123,20 +158,76 @@ struct Column : util::NonCopyable
/**
* Table with data values, stored persistently as CSV file.
* Each row within the table represents a data record, holding a sequence
* of values. Values are statically typed per column, i.e. one column may hold
* strings, while the next column holds doubles. For actual usage it is thus necessary
* to define the column layout, through a sequence of [column Descriptors](\ref util::Column).
*
* # Usage
* Actually those Column objects serve as descriptors, but also as accessors -- and they hold
* the actual data storage for each column, which is a `std::vector<VAL>` of value type `VAL`.
* There is always a _current record_ -- corresponding to the actual data value and the newest
* data row. For persistent storage, the sequence of rows is _reversed,_ so the newest data
* appears at the top of the CSV file.
* @tparam TAB a struct comprised of several Column objects, which hold the data and
* provide access to values of this specific column. Moreover, this type _must define_
* a function `allColumns()` to return a tuple with references to these column fields;
* the order of fields within this tuple also defines the order of columns
* within the table and persistent CSV storage.
* @see suite::step::TimingObservation (relevant usage example)
*/
template<class TAB>
class DataFile
: public TAB
, util::NonCopyable
{
fs::path filename_;
public:
/** Bind this table to a CSV file and immediately load its contents.
 *  @param csvFile location of the backing CSV file; passed through
 *         `consolidated()` before being stored.
 *  @note any exception raised by loadData() propagates to the caller.
 */
DataFile(fs::path csvFile)
    : filename_(consolidated(csvFile))
{
    // slurp in whatever data is already persisted (if any)
    loadData();
}
/* === Data Access === */
static constexpr size_t columnCnt = std::tuple_size_v<decltype(std::declval<TAB>().allColumns())>;
DataFile()
bool empty() const
{
newRow();
return 0 == this->size();
}
/** Number of complete rows in this table.
 *  @return the smallest number of data points found in any column;
 *          zero when the table defines no columns at all.
 */
size_t size() const
{
    if (columnCnt == 0)
        return 0;

    size_t shortest = std::numeric_limits<size_t>::max();
    auto trackShortest = [&shortest](auto& column)
        {
            size_t const len = column.data.size();
            if (len < shortest)
                shortest = len;
        };
    // allColumns() is invoked on a non-const view of this object
    forEach(unConst(this)->allColumns(), trackShortest);
    return shortest;
}
/** Render all data rows as CSV text, without a header line.
 *  Rows are emitted by ascending row index, each terminated by a newline.
 *  @return the accumulated CSV text; an empty string when the table holds no rows.
 */
string dumpCSV() const
{
    // size() scans every column, thus evaluate it only once;
    // the index uses size_t to match the row count type
    size_t const rowCnt = size();
    string csv;
    for (size_t row = 0; row < rowCnt; ++row)
    {
        csv += formatCSVRow(row);
        csv += '\n';
    }
    return csv;
}
/* === Manipulation === */
void newRow()
{
forEach(TAB::allColumns(),
@ -146,6 +237,18 @@ public:
});
}
/** Append a new row holding a copy of the values from the last row.
 *  On an empty table there is nothing to duplicate,
 *  and thus the call is delegated to newRow().
 */
void dupRow()
{
    if (empty())
    {
        newRow();
        return;
    }
    auto cloneLast = [](auto& column)
        {
            auto& cells = column.data;
            cells.emplace_back(cells.back());
        };
    forEach(TAB::allColumns(), cloneLast);
}
void reserve(size_t expectedCapacity)
{
forEach(TAB::allColumns(),
@ -155,6 +258,98 @@ public:
});
}
/** @param lineLimit number of rows to retain, back from the newest */
void save(size_t lineLimit =std::numeric_limits<size_t>::max())
{
fs::path newFilename{filename_};
newFilename += ".tmp";
std::ofstream csvFile{newFilename, std::ios_base::out | std::ios_base::trunc};
if (not csvFile.good())
throw error::State("Unable to create CSV output file "+formatVal(newFilename));
saveData(csvFile, lineLimit);
fs::path oldFile{filename_};
oldFile += ".bak";
if (fs::exists(filename_))
fs::rename(filename_, oldFile);
fs::rename(newFilename, filename_);
}
private: /* === Implementation === */
void loadData()
{
if (not (filename_.parent_path().empty()
or fs::exists(filename_.parent_path())))
throw error::Invalid("DataFile("+formatVal(filename_.filename())
+") shall be placed into nonexistent directory "
+formatVal(filename_.parent_path()));
if (not fs::exists(filename_))
return; // leave the table empty
std::ifstream csvFile(filename_);
if (not csvFile.good())
throw error::Misconfig{"unable to read CSV data file "+formatVal(filename_)};
std::deque<string> rawLines;
for (string line; std::getline(csvFile, line); )
rawLines.emplace_back(move(line));
if (rawLines.size() < 1) return;
verifyHeaderSpec(rawLines[0]);
// we know the number of rows now...
reserve(rawLines.size() - 1);
// storage in file is backwards, with newest data on top
for (size_t row = rawLines.size()-1; 0<row; --row)
if (not isnil(rawLines[row]))
appendRowFromCSV(rawLines[row]);
}
/** Write the header line followed by the data rows into the given stream.
 *  Rows are written newest first; rows older than the `lineLimit` most recent
 *  ones are discarded from the output.
 *  @param csvFile   output stream to receive the CSV text
 *  @param lineLimit maximum number of data rows to write
 */
void saveData(std::ofstream& csvFile, size_t lineLimit)
{
    csvFile << generateHeaderSpec() << "\n";
    if (empty())
        return;

    size_t const rowCnt = size();
    // index of the oldest row still to be written
    size_t const oldestRetained = lineLimit < rowCnt? rowCnt - lineLimit : 0;

    for (size_t row = rowCnt; row > oldestRetained; --row)
        csvFile << formatCSVRow(row-1) << "\n";
}
/** Check the header line of a CSV file against the column definitions.
 *  The fields of the given line are compared, in order, with the `header`
 *  of each column from `TAB::allColumns()`.
 *  @param headerLine first line of the CSV file
 *  @throws error::Invalid on the first field differing from the expected header
 *  @note NOTE(review): a header line with fewer fields than columns is not checked
 *        explicitly here; the outcome depends on CsvLine -- confirm.
 */
void verifyHeaderSpec(string headerLine)
{
    CsvLine parsedHeader(headerLine);
    auto checkColumn = [&parsedHeader](auto& column)
        {
            bool const mismatch = (*parsedHeader != column.header);
            if (mismatch)
                throw error::Invalid("Header mismatch in CSV file. "
                                     "Expecting column("+formatVal(column.header)
                                    +") but found "+formatVal(*parsedHeader));
            ++parsedHeader;
        };
    forEach(TAB::allColumns(), checkColumn);
}
/** Build the CSV header line from the column definitions.
 *  @return the `header` of each column, appended via `appendCsvField`
 *          in the order given by `TAB::allColumns()`
 */
string generateHeaderSpec()
{
    string headerLine;
    auto addHeader = [&headerLine](auto& column)
        {
            appendCsvField(headerLine, column.header);
        };
    forEach(TAB::allColumns(), addHeader);
    return headerLine;
}
void appendRowFromCSV(string line)
{
newRow();
@ -167,16 +362,20 @@ public:
csv.fail();
else
throw error::Invalid("Insufficient data; only "
+formatVal(csv.getParsedFieldCnt())
+" fields. Line="+line);
+str(csv.getParsedFieldCnt())
+" fields, "+str(columnCnt)
+" expected. Line="+line);
using Value = typename std::remove_reference<decltype(col)>::type::ValueType;
col.get() = parseAs<Value>(*csv);
++csv;
});
if (csv)
throw error::Invalid("Excess data fields in CSV. Expect "+str(columnCnt)+" fields. Line="+line);
}
string formatCSVRow(size_t rownum)
string formatCSVRow(size_t rownum) const
{
if (this->empty())
throw error::LogicBroken("Attempt to access data from empty DataTable.");
@ -185,47 +384,13 @@ public:
+" beyond range [0.."+str(size()-1)+"].");
string csvLine;
forEach(TAB::allColumns(),
forEach(unConst(this)->allColumns(),
[&](auto& col)
{
appendCsvField(csvLine, col.data.at(rownum));
});
return csvLine;
}
/** Number of complete rows in this table.
 *  @return the smallest number of data points found in any column;
 *          zero when the table defines no columns at all.
 */
size_t size() const
{
    if (columnCnt == 0)
        return 0;

    size_t shortest = std::numeric_limits<size_t>::max();
    auto trackShortest = [&shortest](auto& column)
        {
            size_t const len = column.data.size();
            if (len < shortest)
                shortest = len;
        };
    forEach(unConst(this)->allColumns(), trackShortest);
    return shortest;
}
/** @return `true` when the table holds no complete row, i.e. size() is zero */
bool empty() const
{
    size_t const rowCnt = this->size();
    return rowCnt == 0;
}
/** Access the column at position `i`, counting in the order of the tuple
 *  returned by `TAB::allColumns()`.
 *  @tparam i zero-based index into the column tuple
 *  @return whatever `std::get<i>` yields for that tuple; the return type
 *          is deduced through `decltype(auto)`.
 *  @note presumably this is a reference to the Column object, provided
 *        allColumns() builds a tuple of references -- TODO confirm
 */
template<size_t i>
decltype(auto) getCol()
{
return std::get<i>(TAB::allColumns());
}
/** Access the data storage of the column at position `i`.
 *  @tparam i zero-based index into the tuple returned by `TAB::allColumns()`
 *  @return reference to the `data` member of that column, allowing the caller
 *          to inspect and manipulate the stored values in place.
 */
template<size_t i>
decltype(auto) getStorage()
{
    // The parentheses are essential: for an unparenthesised member access,
    // decltype(auto) deduces the declared type of the member and would thus
    // return a copy of the whole storage instead of a reference.
    return (getCol<i>().data);
}
/** Retrieve the header of the column at position `i`.
 *  @tparam i zero-based index into the tuple returned by `TAB::allColumns()`
 *  @return copy of the column's `header`
 */
template<size_t i>
string getHeader()
{
    auto&& column = getCol<i>();
    return column.header;
}
};

View file

@ -72,4 +72,13 @@ inline fs::path consolidated(fs::path rawPath)
}
}//(End)namespace fs
namespace util {
/** Render a filesystem path for inclusion into messages.
 *  @param path the path to display
 *  @return the textual form of the path, enclosed in double quotes
 */
inline string formatVal(fs::path path)
{
    string quoted{"\""};
    quoted += path.string();
    quoted += "\"";
    return quoted;
}
}//(End)namespace util
#endif /*TESTRUNNER_UTIL_TEE_HPP_*/

View file

@ -86,9 +86,12 @@ inline TAR parseAs(string const& encodedVal)
std::istringstream converter{encodedVal};
TAR value;
converter >> value;
if (converter.fail())
throw error::Invalid("unable to parse "+formatVal(encodedVal));
return value;
}
template<>
inline bool parseAs(string const& encodedBool)
{
return util::boolVal(encodedBool);