/* DATA.hpp - read and write a table with CSV data Copyright (C) Lumiera.org 2022, Hermann Vosseler This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /** @file data.hpp ** Manage a table with data records, stored persistently as CSV. ** In the context of observations, configuration, calibration and QA, a series ** of measurement data taken over time is often evaluated statistically, to distill ** typical averages, variances and trends. Short of using a database, a modest ** amount of numeric data can be maintained in CSV files, which also allows for ** further manual evaluation within a spreadsheet or statistics application. ** The CSV format as such can be quite elaborate, yet for the purpose of ** saving and later reading back some values generated by the application ** itself, supporting a limited format flavour is sufficient: ** - first line is a header line and used to verify the storage format ** - one record per line, embedded line breaks prohibited ** - fields separated by comma, semicolon tolerated ** - fields are trimmed and may be empty ** - a field may be double quoted ** - only quoted fields may contain whitespace or comma ** - no escaping of quotes, i.e. no quotes within quotes ** ** As a fundamental building block, this header provides a data table template ** with flexible column configuration to hold arbitrary, explicitly typed values. ** This solution is statically typed and does not carry any runtime type information; ** the actual data table object is then defined and accessed by means of _accessor_ ** components for each column of data. A tuple of _current values_ corresponding to ** the most recent row of data can be accessed directly through these sub-components. ** ** # Usage ** Create an actual instantiation of the DataFile template, passing a structure ** with util::Column descriptors. You may then directly access the values of the ** _actual column_ or save/load from a persistent CSV file. ** @note mandatory to define a method `allColumns()` ** \code ** struct Storage ** { ** Column name{"theName"}; ** Column n{"counter"}; ** Column x{"X value"}; ** Column y{"Y value"}; ** ** auto allColumns(){ return std::tie(name,n,x,y); } ** }; ** ** using Dataz = lib::stat::DataFile; ** ** Dataz daz("filename.csv"); ** ** daz.x = 123e-4; ** daz.y = -12345e-6; ** ** std::vector& counters = daz.n.data; ** \endcode ** \par Variations ** The standard case is to have a table backed by persistent file storage, ** which can be initially empty. Under some conditions, especially for tests ** - the DataFile can be created without filename ** - it can be created from a CSVData, which is a `std::vector` of CSV-strings ** - it can be [rendered into CSV strings](\ref #renderCSV) ** - a (new) storage file name can be [given later](\ref saveAs) ** @see DataCSV_test ** */ #ifndef LIB_STAT_DATA_H #define LIB_STAT_DATA_H #include "lib/error.hpp" #include "lib/nocopy.hpp" #include "lib/stat/csv.hpp" #include "lib/stat/file.hpp" #include "lib/format-string.hpp" #include "lib/util.hpp" #include #include #include #include #include #include #include namespace lib { namespace stat{ namespace error = lumiera::error; using std::move; using std::tuple; using std::vector; using std::string; using util::isnil; using util::unConst; using util::_Fmt; using util::min; /** * Descriptor and Accessor for a data column within a DataFile table. * @tparam VAL type of values contained within this column; * this type must be _default constructible_ and _copyable._ */ template struct Column : util::MoveOnly { string header; vector data; using ValueType = VAL; Column (string headerID) : header{headerID} , data{} { } VAL& get() { if (isnil (data)) throw error::State{"No rows in DataTable yet"}; return data.back(); } operator VAL&() { return get(); } operator VAL const&() const { return unConst(this)->get(); } template VAL& operator= (X&& newVal) { return get() = std::forward (newVal); } }; /******************************************************************************************//** * Table with data values, stored persistently as CSV file. * Each row within the table represents a data record, holding a sequence * of values. Values are statically typed per column, i.e. one column may hold * strings, while the next column holds doubles. For actual usage it is thus necessary * to define the column layout, through a sequence of [column Descriptors](\ref util::Column). * * # Usage * Actually those Column objects serve as descriptors, but also as accessors — and they hold * the actual data storage for each column, which is a `std::vector` of value type `VAL`. * There is always a _current record_ — corresponding to the actual data value and the newest * data row. For persistent storage, the sequence of rows is _reversed,_ so the newest data * appears at the top of the CSV file. * @tparam TAB a struct comprised of several Column objects, which hold the data and * provide access to values of this specific column. Moreover, this type _must define_ * a function `allColumns()` to return a tuple with references to these column fields; * the order of fields within this tuple also defines the order of columns * within the table and persistent CSV storage. */ template class DataFile : public TAB , util::MoveOnly { fs::path filename_; public: DataFile(fs::path csvFile ="") : filename_{fs::consolidated (csvFile)} { loadData(); } DataFile (CSVData const& csv) : filename_{} { appendFrom (csv); } /* === Data Access === */ static constexpr size_t columnCnt = std::tuple_size_v().allColumns())>; bool empty() const { return 0 == this->size(); } size_t size() const { if (0 == columnCnt) return 0; size_t rowCnt = std::numeric_limits::max(); forAllColumns( [&](auto& col) { rowCnt = min (rowCnt, col.data.size()); }); // the smallest number of data points found in any column return rowCnt; } CSVData renderCSV() const { CSVData csv{{}}; csv.reserve (size()+1); auto header = generateHeaderSpec(); std::swap (csv[0], header); for (uint i=0; i < size(); ++i) csv.emplace_back (formatCSVRow(i)); return csv; } /* === Manipulation === */ void newRow() { forAllColumns( [siz = size()+1] (auto& col) { col.data.resize (siz); }); } void dupRow() { if (empty()) newRow(); else forAllColumns( [](auto& col) { col.data.emplace_back (col.data.back()); }); } void dropLastRow() { if (not empty()) forAllColumns( [](auto& col) { size_t siz = col.data.size(); col.data.resize (siz>0? siz-1 : 0); }); } void reserve (size_t expectedCapacity) { forAllColumns( [=](auto& col) { col.data.reserve(expectedCapacity); }); } void clear() { forAllColumns( [](auto& col) { col.data.clear(); }); } void appendFrom (CSVData const& csv) { if (isnil (csv)) return; verifyHeaderSpec (csv[0]); for (size_t row=1; row::max() ,bool backupOld =false) { if (filename_.empty()) throw error::Logic{"Unable to save DataFile without filename given."}; fs::path newFilename{filename_}; newFilename += ".tmp"; std::ofstream csvFile{newFilename, std::ios_base::out | std::ios_base::trunc}; if (not csvFile.good()) throw error::State{_Fmt{"Unable to create CSV output file %s"} % newFilename}; saveData (csvFile, lineLimit); if (backupOld) { fs::path oldFile{filename_}; oldFile += ".bak"; if (fs::exists (filename_)) fs::rename (filename_, oldFile); } fs::rename (newFilename, filename_); filename_ = fs::consolidated(filename_); } // lock onto absolute path void saveAs (fs::path newStorage ,size_t lineLimit =std::numeric_limits::max()) { newStorage = fs::consolidated (newStorage); if (fs::exists(newStorage)) throw error::Invalid{_Fmt{"Storing DataFile rejected: target %s exists already"} % newStorage}; if (not (newStorage.parent_path().empty() or fs::exists(newStorage.parent_path()))) throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"} % newStorage.filename() % newStorage.parent_path()}; filename_ = newStorage; save (lineLimit); } private: /* === Implementation === */ /** apply a generic Lambda to all columns */ template void forAllColumns (OP&& doIt) const { lib::meta::forEach (unConst(this)->allColumns() ,std::forward (doIt)); } void loadData() { if (not (filename_.parent_path().empty() or fs::exists(filename_.parent_path()))) throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"} % filename_.filename() % filename_.parent_path()}; if (not fs::exists(filename_)) return; // leave the table empty std::ifstream csvFile{filename_}; if (not csvFile.good()) throw error::Config{_Fmt{"unable to read CSV data file %s"} % filename_}; std::deque rawLines; for (string line; std::getline(csvFile, line); ) rawLines.emplace_back (move(line)); if (rawLines.size() < 1) return; verifyHeaderSpec (rawLines[0]); // we know the number of rows now... reserve (rawLines.size() - 1); // storage in file is backwards, with newest data on top for (size_t row = rawLines.size()-1; 0 lineLimit? size()-lineLimit : 0; // store newest data first, possibly discard old data for (size_t row = size(); lineLimit < row; --row) csvFile << formatCSVRow(row-1) << "\n"; } void verifyHeaderSpec (string headerLine) { CsvParser header{headerLine}; forAllColumns( [&](auto& col) { if (*header != col.header) throw error::Invalid{_Fmt{"Header mismatch in CSV file %s. " "Expecting column(%s) but found \"%s\""} % filename_ % col.header % *header}; ++header; }); } CSVLine generateHeaderSpec() const { CSVLine csv; forAllColumns( [&](auto& col) { csv += col.header; }); return csv; } void appendRowFromCSV (string line) { newRow(); CsvParser csv(line); forAllColumns( [&](auto& col) { if (not csv) { if (csv.isParseFail()) csv.fail(); else throw error::Invalid{_Fmt{"Insufficient data; only %d fields, %d expected. Line:%s"} % csv.getParsedFieldCnt() % columnCnt % line}; } using Value = typename std::remove_reference::type::ValueType; col.get() = parseAs(*csv); ++csv; }); if (csv) throw error::Invalid{_Fmt{"Excess data fields in CSV. Expect %d fields. Line:%s"} % columnCnt % line}; } CSVLine formatCSVRow (size_t rownum) const { if (this->empty()) throw error::Logic{"Attempt to access data from empty DataTable."}; if (rownum >= this->size()) throw error::Logic{_Fmt{"Attempt to access row #%d beyond range [0..%d]."} % rownum % (size()-1)}; CSVLine csvLine; forAllColumns( [&](auto& col) { csvLine += col.data.at(rownum); }); return csvLine; } }; }} // namespace lib::stat #endif /*LIB_STAT_DATA_H*/