Yet another chainsaw massacre. One of the most obnoxious annoyances with C++ metaprogramming is the need to insert `typename` and `template` qualifiers into most definitions, to help the compiler cope with the syntax, which is not context-free. Recent standards add several clarifications, so that most of these qualifiers are now redundant, at least in places where it is unambiguously clear that only a type can be given. GCC already supports most of these relaxed rules (Clang unfortunately lags way behind in its support of newer language features...)
489 lines
15 KiB
C++
489 lines
15 KiB
C++
/*
|
||
DATA.hpp - read and write a table with CSV data
|
||
|
||
Copyright (C)
|
||
2022, Hermann Vosseler <Ichthyostega@web.de>
|
||
|
||
**Lumiera** is free software; you can redistribute it and/or modify it
|
||
under the terms of the GNU General Public License as published by the
|
||
Free Software Foundation; either version 2 of the License, or (at your
|
||
option) any later version. See the file COPYING for further details.
|
||
|
||
*/
|
||
|
||
|
||
/** @file data.hpp
|
||
** Manage a table with data records, stored persistently as CSV.
|
||
** In the context of observations, configuration, calibration and QA, a series
|
||
** of measurement data taken over time is often evaluated statistically, to distill
|
||
** typical averages, variances and trends. Short of using a database, a modest
|
||
** amount of numeric data can be maintained in CSV files, which also allows for
|
||
** further manual evaluation within a spreadsheet or statistics application.
|
||
** The CSV format as such can be quite elaborate, yet for the purpose of
|
||
** saving and later reading back some values generated by the application
|
||
** itself, supporting a limited format flavour is sufficient:
|
||
** - first line is a header line and used to verify the storage format
|
||
** - one record per line, embedded line breaks prohibited
|
||
** - fields separated by comma, semicolon tolerated
|
||
** - fields are trimmed and may be empty
|
||
** - a field may be double quoted
|
||
** - only quoted fields may contain whitespace or comma
|
||
** - no escaping of quotes, i.e. no quotes within quotes
|
||
**
|
||
** As a fundamental building block, this header provides a data table template
|
||
** with flexible column configuration to hold arbitrary, explicitly typed values.
|
||
** This solution is statically typed and does not carry any runtime type information;
|
||
** the actual data table object is then defined and accessed by means of _accessor_
|
||
** components for each column of data. A tuple of _current values_ corresponding to
|
||
** the most recent row of data can be accessed directly through these sub-components.
|
||
**
|
||
** # Usage
|
||
** Create an actual instantiation of the DataTable template, passing a structure
|
||
** with lib::stat::Column descriptors. You may then directly access the values of the
|
||
** _actual column_ or save/load from a persistent CSV file.
|
||
** @note mandatory to define a method `allColumns()`
|
||
** \code
|
||
** struct Storage
|
||
** {
|
||
** Column<string> name{"theName"};
|
||
** Column<int> n{"counter"};
|
||
** Column<double> x{"X value"};
|
||
** Column<double> y{"Y value"};
|
||
**
|
||
** auto allColumns(){ return std::tie(name,n,x,y); }
|
||
** };
|
||
**
|
||
** using Dataz = lib::stat::DataTable<Storage>;
|
||
**
|
||
** Dataz daz("filename.csv");
|
||
**
|
||
** daz.x = 123e-4;
|
||
** daz.y = -12345e-6;
|
||
**
|
||
** std::vector<int>& counters = daz.n.data;
|
||
** \endcode
|
||
** \par Variations
|
||
** The standard case is to have a table backed by persistent file storage,
|
||
** which can be initially empty. Under some conditions, especially for tests
|
||
** - the DataTable can be created without filename
|
||
** - it can be created from a CSVData, which is a `std::vector` of CSV-strings
|
||
** - it can be [rendered into CSV strings](\ref #renderCSV)
|
||
** - a (new) storage file name can be [given later](\ref saveAs)
|
||
** @see DataCSV_test
|
||
**
|
||
*/
|
||
|
||
|
||
|
||
#ifndef LIB_STAT_DATA_H
|
||
#define LIB_STAT_DATA_H
|
||
|
||
|
||
#include "lib/error.hpp"
|
||
#include "lib/nocopy.hpp"
|
||
#include "lib/stat/csv.hpp"
|
||
#include "lib/file.hpp"
|
||
#include "lib/format-string.hpp"
|
||
#include "lib/util.hpp"
|
||
|
||
#include <type_traits>
|
||
#include <utility>
|
||
#include <fstream>
|
||
#include <vector>
|
||
#include <string>
|
||
#include <limits>
|
||
#include <deque>
|
||
|
||
|
||
namespace lib {
|
||
namespace stat{
|
||
|
||
namespace error = lumiera::error;
|
||
|
||
using std::move;
|
||
using std::tuple;
|
||
using std::vector;
|
||
using std::string;
|
||
using util::isnil;
|
||
using util::unConst;
|
||
using util::_Fmt;
|
||
using util::min;
|
||
|
||
|
||
|
||
|
||
/**
|
||
* Descriptor and Accessor for a data column within a DataTable table.
|
||
* @tparam VAL type of values contained within this column;
|
||
* this type must be _default constructible_ and _copyable._
|
||
*/
|
||
template<typename VAL>
|
||
struct Column
|
||
: util::MoveOnly
|
||
{
|
||
string header;
|
||
vector<VAL> data;
|
||
|
||
using ValueType = VAL;
|
||
|
||
|
||
Column (string headerID)
|
||
: header{headerID}
|
||
, data{}
|
||
{ }
|
||
|
||
|
||
VAL&
|
||
get()
|
||
{
|
||
if (isnil (data))
|
||
throw error::State{"No rows in DataTable yet"};
|
||
return data.back();
|
||
}
|
||
|
||
operator VAL&()
|
||
{
|
||
return get();
|
||
}
|
||
|
||
operator VAL const&() const
|
||
{
|
||
return unConst(this)->get();
|
||
}
|
||
|
||
template<typename X>
|
||
VAL& operator= (X&& newVal)
|
||
{
|
||
return get() = std::forward<X> (newVal);
|
||
}
|
||
};
|
||
|
||
|
||
|
||
|
||
|
||
/******************************************************************************************//**
|
||
* Table with data values, stored persistently as CSV file.
|
||
* Each row within the table represents a data record, holding a sequence
|
||
* of values. Values are statically typed per column, i.e. one column may hold
|
||
* strings, while the next column holds doubles. For actual usage it is thus necessary
|
||
* to define the column layout, through a sequence of [column Descriptors](\ref util::Column).
|
||
*
|
||
* # Usage
|
||
* Actually those Column objects serve as descriptors, but also as accessors — and they hold
|
||
* the actual data storage for each column, which is a `std::vector<VAL>` of value type `VAL`.
|
||
* There is always a _current record_ — corresponding to the actual data value and the newest
|
||
* data row. For persistent storage, the sequence of rows is _reversed,_ so the newest data
|
||
* appears at the top of the CSV file.
|
||
* @tparam TAB a struct comprised of several Column objects, which hold the data and
|
||
* provide access to values of this specific column. Moreover, this type _must define_
|
||
* a function `allColumns()` to return a tuple with references to these column fields;
|
||
* the order of fields within this tuple also defines the order of columns
|
||
* within the table and persistent CSV storage.
|
||
*/
|
||
template<class TAB>
|
||
class DataTable
|
||
: public TAB
|
||
, util::MoveOnly
|
||
{
|
||
fs::path filename_;
|
||
|
||
public:
|
||
DataTable(fs::path csvFile ="")
|
||
: filename_{fs::consolidated (csvFile)}
|
||
{
|
||
loadData();
|
||
}
|
||
|
||
DataTable (CSVData const& csv)
|
||
: filename_{}
|
||
{
|
||
appendFrom (csv);
|
||
}
|
||
|
||
|
||
/* === Data Access === */
|
||
|
||
static constexpr size_t columnCnt = std::tuple_size_v<decltype(std::declval<TAB>().allColumns())>;
|
||
|
||
bool
|
||
empty() const
|
||
{
|
||
return 0 == this->size();
|
||
}
|
||
|
||
size_t
|
||
size() const
|
||
{
|
||
if (0 == columnCnt) return 0;
|
||
size_t rowCnt = std::numeric_limits<size_t>::max();
|
||
forAllColumns(
|
||
[&](auto& col)
|
||
{
|
||
rowCnt = min (rowCnt, col.data.size());
|
||
}); // the smallest number of data points found in any column
|
||
return rowCnt;
|
||
}
|
||
|
||
CSVData
|
||
renderCSV() const
|
||
{
|
||
CSVData csv{{}};
|
||
csv.reserve (size()+1);
|
||
auto header = generateHeaderSpec();
|
||
using std::swap;
|
||
swap (csv[0], header);
|
||
for (uint i=0; i < size(); ++i)
|
||
csv.emplace_back (formatCSVRow(i));
|
||
return csv;
|
||
}
|
||
|
||
|
||
|
||
/* === Manipulation === */
|
||
|
||
void
|
||
newRow()
|
||
{
|
||
forAllColumns(
|
||
[siz = size()+1]
|
||
(auto& col)
|
||
{
|
||
col.data.resize (siz);
|
||
});
|
||
}
|
||
|
||
void
|
||
dupRow()
|
||
{
|
||
if (empty())
|
||
newRow();
|
||
else
|
||
forAllColumns(
|
||
[](auto& col)
|
||
{
|
||
col.data.emplace_back (col.data.back());
|
||
});
|
||
}
|
||
|
||
void
|
||
dropLastRow()
|
||
{
|
||
if (not empty())
|
||
forAllColumns(
|
||
[](auto& col)
|
||
{
|
||
size_t siz = col.data.size();
|
||
col.data.resize (siz>0? siz-1 : 0);
|
||
});
|
||
}
|
||
|
||
void
|
||
reserve (size_t expectedCapacity)
|
||
{
|
||
forAllColumns(
|
||
[=](auto& col)
|
||
{
|
||
col.data.reserve(expectedCapacity);
|
||
});
|
||
}
|
||
|
||
void
|
||
clear()
|
||
{
|
||
forAllColumns(
|
||
[](auto& col)
|
||
{
|
||
col.data.clear();
|
||
});
|
||
}
|
||
|
||
void
|
||
appendFrom (CSVData const& csv)
|
||
{
|
||
if (isnil (csv)) return;
|
||
verifyHeaderSpec (csv[0]);
|
||
for (size_t row=1; row<csv.size(); ++row)
|
||
if (not isnil (csv[row]))
|
||
appendRowFromCSV (csv[row]);
|
||
}
|
||
|
||
|
||
|
||
/** @param lineLimit number of rows to retain, back from the newest */
|
||
void
|
||
save (size_t lineLimit =std::numeric_limits<size_t>::max()
|
||
,bool backupOld =false)
|
||
{
|
||
if (filename_.empty())
|
||
throw error::Logic{"Unable to save DataFile without filename given."};
|
||
|
||
fs::path newFilename{filename_};
|
||
newFilename += ".tmp";
|
||
|
||
std::ofstream csvFile{newFilename, std::ios_base::out | std::ios_base::trunc};
|
||
if (not csvFile.good())
|
||
throw error::State{_Fmt{"Unable to create CSV output file %s"}
|
||
% newFilename};
|
||
saveData (csvFile, lineLimit);
|
||
|
||
if (backupOld)
|
||
{
|
||
fs::path oldFile{filename_};
|
||
oldFile += ".bak";
|
||
if (fs::exists (filename_))
|
||
fs::rename (filename_, oldFile);
|
||
}
|
||
fs::rename (newFilename, filename_);
|
||
filename_ = fs::consolidated(filename_);
|
||
} // lock onto absolute path
|
||
|
||
|
||
void
|
||
saveAs (fs::path newStorage
|
||
,size_t lineLimit =std::numeric_limits<size_t>::max())
|
||
{
|
||
newStorage = fs::consolidated (newStorage);
|
||
if (fs::exists(newStorage))
|
||
throw error::Invalid{_Fmt{"Storing DataFile rejected: target %s exists already"}
|
||
% newStorage};
|
||
if (not (newStorage.parent_path().empty()
|
||
or fs::exists(newStorage.parent_path())))
|
||
throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"}
|
||
% newStorage.filename() % newStorage.parent_path()};
|
||
filename_ = newStorage;
|
||
save (lineLimit);
|
||
}
|
||
|
||
|
||
private: /* === Implementation === */
|
||
|
||
/** apply a generic Lambda to all columns */
|
||
template<class OP>
|
||
void
|
||
forAllColumns (OP&& doIt) const
|
||
{
|
||
lib::meta::forEach (unConst(this)->allColumns()
|
||
,std::forward<OP> (doIt));
|
||
}
|
||
|
||
void
|
||
loadData()
|
||
{
|
||
if (not (filename_.parent_path().empty()
|
||
or fs::exists(filename_.parent_path())))
|
||
throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"}
|
||
% filename_.filename() % filename_.parent_path()};
|
||
if (not fs::exists(filename_))
|
||
return; // leave the table empty
|
||
|
||
std::ifstream csvFile{filename_};
|
||
if (not csvFile.good())
|
||
throw error::Config{_Fmt{"unable to read CSV data file %s"} % filename_};
|
||
|
||
std::deque<string> rawLines;
|
||
for (string line; std::getline(csvFile, line); )
|
||
rawLines.emplace_back (move(line));
|
||
|
||
if (rawLines.size() < 1) return;
|
||
verifyHeaderSpec (rawLines[0]);
|
||
|
||
// we know the number of rows now...
|
||
reserve (rawLines.size() - 1);
|
||
|
||
// storage in file is backwards, with newest data on top
|
||
for (size_t row = rawLines.size()-1; 0<row; --row)
|
||
if (not isnil(rawLines[row]))
|
||
appendRowFromCSV (rawLines[row]);
|
||
}
|
||
|
||
|
||
void
|
||
saveData (std::ofstream& csvFile, size_t lineLimit)
|
||
{
|
||
csvFile << generateHeaderSpec() << "\n";
|
||
if (empty())
|
||
return;
|
||
lineLimit = size() > lineLimit? size()-lineLimit : 0;
|
||
// store newest data first, possibly discard old data
|
||
for (size_t row = size(); lineLimit < row; --row)
|
||
csvFile << formatCSVRow(row-1) << "\n";
|
||
}
|
||
|
||
|
||
void
|
||
verifyHeaderSpec (string headerLine)
|
||
{
|
||
CsvParser header{headerLine};
|
||
forAllColumns(
|
||
[&](auto& col)
|
||
{
|
||
if (*header != col.header)
|
||
throw error::Invalid{_Fmt{"Header mismatch in CSV file %s. "
|
||
"Expecting column(%s) but found \"%s\""}
|
||
% filename_ % col.header % *header};
|
||
++header;
|
||
});
|
||
}
|
||
|
||
CSVLine
|
||
generateHeaderSpec() const
|
||
{
|
||
CSVLine csv;
|
||
forAllColumns(
|
||
[&](auto& col)
|
||
{
|
||
csv += col.header;
|
||
});
|
||
return csv;
|
||
}
|
||
|
||
|
||
void
|
||
appendRowFromCSV (string line)
|
||
{
|
||
newRow();
|
||
CsvParser csv(line);
|
||
forAllColumns(
|
||
[&](auto& col)
|
||
{
|
||
if (not csv)
|
||
{
|
||
if (csv.isParseFail())
|
||
csv.fail();
|
||
else
|
||
throw error::Invalid{_Fmt{"Insufficient data; only %d fields, %d expected. Line:%s"}
|
||
% csv.getParsedFieldCnt() % columnCnt % line};
|
||
}
|
||
|
||
using Value = std::remove_reference<decltype(col)>::type::ValueType;
|
||
col.get() = parseAs<Value>(*csv);
|
||
++csv;
|
||
});
|
||
if (csv)
|
||
throw error::Invalid{_Fmt{"Excess data fields in CSV. Expect %d fields. Line:%s"}
|
||
% columnCnt % line};
|
||
}
|
||
|
||
|
||
CSVLine
|
||
formatCSVRow (size_t rownum) const
|
||
{
|
||
if (this->empty())
|
||
throw error::Logic{"Attempt to access data from empty DataTable."};
|
||
if (rownum >= this->size())
|
||
throw error::Logic{_Fmt{"Attempt to access row #%d beyond range [0..%d]."}
|
||
% rownum % (size()-1)};
|
||
|
||
CSVLine csvLine;
|
||
forAllColumns(
|
||
[&](auto& col)
|
||
{
|
||
csvLine += col.data.at(rownum);
|
||
});
|
||
return csvLine;
|
||
}
|
||
};
|
||
|
||
}} // namespace lib::stat
|
||
#endif /*LIB_STAT_DATA_H*/
|