lumiera_/src/lib/stat/data.hpp

/*
  DATA.hpp  -  read and write a table with CSV data

   Copyright (C)
     2022,            Hermann Vosseler <Ichthyostega@web.de>

  **Lumiera** is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2 of the License, or (at your
  option) any later version. See the file COPYING for further details.

*/


/** @file data.hpp
 ** Manage a table with data records, stored persistently as CSV.
 ** In the context of observations, configuration, calibration and QA, a series
 ** of measurement data taken over time is often evaluated statistically, to distill
 ** typical averages, variances and trends. Short of using a database, a modest
 ** amount of numeric data can be maintained in CSV files, which also allows for
 ** further manual evaluation within a spreadsheet or statistics application.
 ** The CSV format as such can be quite elaborate, yet for the purpose of
 ** saving and later reading back some values generated by the application
 ** itself, supporting a limited format flavour is sufficient:
 ** - first line is a header line and used to verify the storage format
 ** - one record per line, embedded line breaks prohibited
 ** - fields separated by comma, semicolon tolerated
 ** - fields are trimmed and may be empty
 ** - a field may be double quoted
 ** - only quoted fields may contain whitespace or comma
 ** - no escaping of quotes, i.e. no quotes within quotes
 **
 ** As a fundamental building block, this header provides a data table template
 ** with flexible column configuration to hold arbitrary, explicitly typed values.
 ** This solution is statically typed and does not carry any runtime type information;
 ** the actual data table object is then defined and accessed by means of _accessor_
 ** components for each column of data. A tuple of _current values_ corresponding to
 ** the most recent row of data can be accessed directly through these sub-components.
 **
 ** # Usage
 ** Create an actual instantiation of the DataTable template, passing a structure
 ** with lib::stat::Column descriptors. You may then directly access the values of the
 ** _current row_ through these column accessors, or save/load from a persistent CSV file.
 ** @note mandatory to define a method `allColumns()`
 ** \code
 ** struct Storage
 **   {
 **     Column<string> name{"theName"};
 **     Column<int>    n{"counter"};
 **     Column<double> x{"X value"};
 **     Column<double> y{"Y value"};
 **
 **     auto allColumns(){ return std::tie(name,n,x,y); }
 **   };
 **
 ** using Dataz = lib::stat::DataTable<Storage>;
 **
 ** Dataz daz("filename.csv");
 **
 ** daz.x = 123e-4;
 ** daz.y = -12345e-6;
 **
 ** std::vector<int>& counters = daz.n.data;
 ** \endcode
 ** \par Variations
 ** The standard case is to have a table backed by persistent file storage,
 ** which can be initially empty. Under some conditions, especially for tests
 ** - the DataTable can be created without filename
 ** - it can be created from a CSVData, which is a `std::vector` of CSV-strings
 ** - it can be [rendered into CSV strings](\ref #renderCSV)
 ** - a (new) storage file name can be [given later](\ref saveAs)
 ** @see DataCSV_test
 **
 */


#ifndef LIB_STAT_DATA_H
#define LIB_STAT_DATA_H


#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/stat/csv.hpp"
#include "lib/file.hpp"
#include "lib/format-string.hpp"
#include "lib/util.hpp"

#include <type_traits>
#include <utility>
#include <fstream>
#include <vector>
#include <string>
#include <limits>
#include <deque>


namespace lib {
namespace stat{

  namespace error = lumiera::error;

  using std::move;
  using std::tuple;
  using std::vector;
  using std::string;
  using util::isnil;
  using util::unConst;
  using util::_Fmt;
  using util::min;


  /**
   * Descriptor and Accessor for a data column within a DataTable table.
   * Holds the column's header ID and the storage for all values of this column;
   * the value access functions always refer to the _last_ element in #data,
   * which corresponds to the newest row.
   * @tparam VAL type of values contained within this column;
   *             this type must be _default constructible_ and _copyable._
   */
  template<typename VAL>
  struct Column
    : util::MoveOnly
    {
      string header;       ///< column ID, as used in the CSV header line
      vector<VAL> data;    ///< value storage; the last element is the _current value_

      using ValueType = VAL;


      /** @param headerID name of this column (sink parameter, moved into place) */
      Column (string headerID)
        : header{move (headerID)}
        , data{}
        { }


      /**
       * Access the current value, i.e. the last element in the data storage.
       * @throw error::State when this column does not hold any data yet
       */
      VAL&
      get()
        {
          if (isnil (data))
              throw error::State{"No rows in DataTable yet"};
          return data.back();
        }

      /** implicit conversion: use the column as stand-in for its current value */
      operator VAL&()
        {
          return get();
        }

      /** read-only access to the current value; delegates to the non-const
       *  get(), which by itself does not modify anything */
      operator VAL const&()  const
        {
          return unConst(this)->get();
        }

      /**
       * Assign a new value to the current (last) data element.
       * @return reference to the value within storage after assignment
       * @throw error::State when this column does not hold any data yet
       */
      template<typename X>
      VAL& operator= (X&& newVal)
        {
          return get() = std::forward<X> (newVal);
        }
    };


  /******************************************************************************************//**
   * Table with data values, stored persistently as CSV file.
   * Each row within the table represents a data record, holding a sequence
   * of values. Values are statically typed per column, i.e. one column may hold
   * strings, while the next column holds doubles. For actual usage it is thus necessary
   * to define the column layout, through a sequence of [column Descriptors](\ref lib::stat::Column).
   *
   * # Usage
   * Actually those Column objects serve as descriptors, but also as accessors — and they hold
   * the actual data storage for each column, which is a `std::vector<VAL>` of value type `VAL`.
   * There is always a _current record_ — corresponding to the actual data value and the newest
   * data row. For persistent storage, the sequence of rows is _reversed,_ so the newest data
   * appears at the top of the CSV file.
   * @tparam TAB a struct comprised of several Column objects, which hold the data and
   *         provide access to values of this specific column. Moreover, this type _must define_
   *         a function `allColumns()` to return a tuple with references to these column fields;
   *         the order of fields within this tuple also defines the order of columns
   *         within the table and persistent CSV storage.
   */
  template<class TAB>
  class DataTable
      : public TAB
      , util::MoveOnly
    {
      fs::path filename_;

    public:
      /**
       * Create a table backed by the given CSV file.
       * The given name is passed through `fs::consolidated()`; if a file exists
       * at this location, its contents are loaded immediately, otherwise
       * the table starts out empty.
       * @throw error::Invalid when the parent directory does not exist, or when
       *        header or data lines do not match the column layout
       * @throw error::Config when an existing file can not be opened for reading
       */
      DataTable(fs::path csvFile ="")
        : filename_{fs::consolidated (csvFile)}
        {
          loadData();
        }

      /**
       * Create a table without storage file, populated from CSV lines.
       * @param csv sequence of CSV lines, starting with the header line;
       *        data lines are appended in the order given
       */
      DataTable (CSVData const& csv)
        : filename_{}
        {
          appendFrom (csv);
        }


      /* === Data Access === */

      /** number of columns, as defined by the tuple returned from `TAB::allColumns()` */
      static constexpr size_t columnCnt = std::tuple_size_v<decltype(std::declval<TAB>().allColumns())>;

      bool
      empty()  const
        {
          return 0 == this->size();
        }

      /** @return number of complete rows, which is the smallest number
       *          of data points found in any column (0 if there are no columns) */
      size_t
      size()  const
        {
          if (0 == columnCnt) return 0;
          size_t rowCnt = std::numeric_limits<size_t>::max();
          forAllColumns(
                    [&](auto& col)
                      {
                        rowCnt = min (rowCnt, col.data.size());
                      }); // the smallest number of data points found in any column
          return rowCnt;
        }

      /**
       * Render the table contents into a sequence of CSV lines.
       * @return the header line, followed by one line per row in storage order
       *         (oldest row first — other than in the persistent file)
       */
      CSVData
      renderCSV()  const
        {
          size_t const rowCnt = size();     // evaluate once: size() scans all columns
          CSVData csv{{}};
          csv.reserve (rowCnt+1);
          auto header = generateHeaderSpec();
          using std::swap;
          swap (csv[0], header);
          for (size_t row=0; row < rowCnt; ++row)
            csv.emplace_back (formatCSVRow(row));
          return csv;
        }


      /* === Manipulation === */

      /**
       * Append a new row, which becomes the _current record._
       * All columns are resized to hold `size()+1` elements, thereby
       * adding value-initialised entries.
       * @note a column holding more elements than the shortest one is cut down.
       */
      void
      newRow()
        {
            forAllColumns(
                    [siz = size()+1]
                    (auto& col)
                      {
                        col.data.resize (siz);
                      });
        }

      /** append a copy of each column's last value as new row;
       *  on an empty table this is equivalent to newRow() */
      void
      dupRow()
        {
          if (empty())
            newRow();
          else
            forAllColumns(
                    [](auto& col)
                      {
                        col.data.emplace_back (col.data.back());
                      });
        }

      /** discard the last element from each column; no-op on an empty table */
      void
      dropLastRow()
        {
          if (not empty())
            forAllColumns(
                    [](auto& col)
                      {
                        size_t siz = col.data.size();
                        col.data.resize (siz>0? siz-1 : 0);
                      });
        }

      /** pre-allocate storage within each column for the given number of rows */
      void
      reserve (size_t expectedCapacity)
        {
            forAllColumns(
                    [=](auto& col)
                      {
                        col.data.reserve(expectedCapacity);
                      });
        }

      /** discard all data rows; the storage file name is retained */
      void
      clear()
        {
            forAllColumns(
                    [](auto& col)
                      {
                        col.data.clear();
                      });
        }

      /**
       * Append data rows parsed from the given CSV lines.
       * @param csv the first line must be a header line matching the column layout;
       *        following lines are appended in the order given, skipping empty lines.
       * @throw error::Invalid on header mismatch or wrong number of fields in a line
       * @note rows appended before a failure are retained, the offending line is not.
       */
      void
      appendFrom (CSVData const& csv)
        {
          if (isnil (csv)) return;
          verifyHeaderSpec (csv[0]);
          for (size_t row=1; row<csv.size(); ++row)
            if (not isnil (csv[row]))
              appendRowFromCSV (csv[row]);
        }


      /**
       * Write the table contents into the storage file.
       * Data is first written into a temporary file (file name with suffix ".tmp"),
       * which is then renamed to replace the actual storage file.
       * @param lineLimit number of rows to retain, back from the newest
       * @param backupOld if `true`, an existing storage file is first renamed
       *        by appending the suffix ".bak"
       * @throw error::Logic when no file name was given
       * @throw error::State when the temporary file can not be created or written
       */
      void
      save (size_t lineLimit =std::numeric_limits<size_t>::max()
           ,bool backupOld =false)
        {
          if (filename_.empty())
            throw error::Logic{"Unable to save DataFile without filename given."};

          fs::path newFilename{filename_};
          newFilename += ".tmp";

          std::ofstream csvFile{newFilename, std::ios_base::out | std::ios_base::trunc};
          if (not csvFile.good())
            throw error::State{_Fmt{"Unable to create CSV output file %s"}
                                   % newFilename};
          saveData (csvFile, lineLimit);

          // flush and release the temporary file _before_ moving it into place:
          // an incomplete write must never replace the existing storage file
          csvFile.close();
          if (csvFile.fail())
            throw error::State{_Fmt{"Failed to write CSV output file %s"}
                                   % newFilename};

          if (backupOld)
            {
              fs::path oldFile{filename_};
              oldFile += ".bak";
              if (fs::exists (filename_))
                  fs::rename (filename_, oldFile);
            }
          fs::rename (newFilename, filename_);
          filename_ = fs::consolidated(filename_);
        }                // lock onto absolute path


      /**
       * Attach this table to a new storage file and save the contents there.
       * @param newStorage location of the new file, which must not exist yet,
       *        while its parent directory (if given) must exist
       * @param lineLimit number of rows to retain, back from the newest
       * @throw error::Invalid when the target exists or its directory is missing
       */
      void
      saveAs (fs::path newStorage
             ,size_t lineLimit =std::numeric_limits<size_t>::max())
        {
          newStorage = fs::consolidated (newStorage);
          if (fs::exists(newStorage))
            throw error::Invalid{_Fmt{"Storing DataFile rejected: target %s exists already"}
                                     % newStorage};
          if (not (newStorage.parent_path().empty()
                   or fs::exists(newStorage.parent_path())))
            throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"}
                                     % newStorage.filename() % newStorage.parent_path()};
          filename_ = newStorage;
          save (lineLimit);
        }


    private: /* === Implementation === */

      /** apply a generic Lambda to all columns
       * @note the columns are passed as _mutable_ references,
       *       even when invoked from a const member function. */
      template<class OP>
      void
      forAllColumns (OP&& doIt)  const
        {
          lib::meta::forEach (unConst(this)->allColumns()
                             ,std::forward<OP> (doIt));
        }

      /**
       * Populate the table from the storage file, if such a file exists.
       * The first line is verified as header, all further non-empty lines
       * are parsed as data rows, starting from the end of the file.
       */
      void
      loadData()
        {
          if (not (filename_.parent_path().empty()
                   or fs::exists(filename_.parent_path())))
            throw error::Invalid{_Fmt{"DataFile(%s) placed into nonexistent directory %s"}
                                     % filename_.filename() % filename_.parent_path()};
          if (not fs::exists(filename_))
            return; // leave the table empty

          std::ifstream csvFile{filename_};
          if (not csvFile.good())
            throw error::Config{_Fmt{"unable to read CSV data file %s"} % filename_};

          std::deque<string> rawLines;
          for (string line; std::getline(csvFile, line); )
            rawLines.emplace_back (move(line));

          if (rawLines.size() < 1) return;
          verifyHeaderSpec (rawLines[0]);

          // we know the number of rows now...
          reserve (rawLines.size() - 1);

          // storage in file is backwards, with newest data on top
          for (size_t row = rawLines.size()-1; 0<row; --row)
            if (not isnil(rawLines[row]))
              appendRowFromCSV (rawLines[row]);
        }


      /**
       * Write header line and data rows into the given stream.
       * @param lineLimit maximum number of rows to write, counting back from the newest
       */
      void
      saveData (std::ofstream& csvFile, size_t lineLimit)
        {
          csvFile << generateHeaderSpec() << "\n";
          if (empty())
            return;
          // translate the limit into the index of the oldest row to be retained
          lineLimit = size() > lineLimit? size()-lineLimit : 0;
          // store newest data first, possibly discard old data
          for (size_t row = size(); lineLimit < row; --row)
            csvFile << formatCSVRow(row-1) << "\n";
        }


      /**
       * Check the given header line against the header IDs of all columns, in order.
       * @throw error::Invalid on the first field not matching the expected column header
       * @note NOTE(review): surplus fields at the end of the header line are not
       *       detected here; behaviour on a header line with too few fields
       *       depends on CsvParser — confirm.
       */
      void
      verifyHeaderSpec (string headerLine)
        {
          CsvParser header{headerLine};
          forAllColumns(
                  [&](auto& col)
                    {
                      if (*header != col.header)
                        throw error::Invalid{_Fmt{"Header mismatch in CSV file %s. "
                                                  "Expecting column(%s) but found \"%s\""}
                                                 % filename_ % col.header % *header};
                      ++header;
                    });
        }

      /** build the CSV header line from the header IDs of all columns */
      CSVLine
      generateHeaderSpec()  const
        {
          CSVLine csv;
          forAllColumns(
                  [&](auto& col)
                    {
                      csv += col.header;
                    });
          return csv;
        }


      /**
       * Parse a single CSV line and append its values as new row.
       * @throw error::Invalid when the line holds too few or too many fields
       * @note on any failure the row just added is removed again,
       *       so that no partially filled row remains in the table.
       */
      void
      appendRowFromCSV (string line)
        {
          newRow();
          try
            {
              CsvParser csv(line);
              forAllColumns(
                      [&](auto& col)
                        {
                          if (not csv)
                            {
                              if (csv.isParseFail())
                                csv.fail();
                              else
                                throw error::Invalid{_Fmt{"Insufficient data; only %d fields, %d expected. Line:%s"}
                                                         % csv.getParsedFieldCnt() % columnCnt % line};
                            }

                          // `typename` is mandatory for this dependent type prior to C++20
                          using Value = typename std::remove_reference_t<decltype(col)>::ValueType;
                          col.get() = parseAs<Value>(*csv);
                          ++csv;
                        });
              if (csv)
                throw error::Invalid{_Fmt{"Excess data fields in CSV. Expect %d fields. Line:%s"}
                                         % columnCnt % line};
            }
          catch(...)
            {
              dropLastRow();  // roll back the incomplete row
              throw;
            }
        }


      /**
       * Render the values of one row into a CSV line.
       * @param rownum index into the column storage (0 is the oldest row)
       * @throw error::Logic when the table is empty or the index is out of range
       */
      CSVLine
      formatCSVRow (size_t rownum)  const
        {
          if (this->empty())
            throw error::Logic{"Attempt to access data from empty DataTable."};
          if (rownum >= this->size())
            throw error::Logic{_Fmt{"Attempt to access row #%d beyond range [0..%d]."}
                                   % rownum % (size()-1)};

          CSVLine csvLine;
          forAllColumns(
                  [&](auto& col)
                    {
                      csvLine += col.data.at(rownum);
                    });
          return csvLine;
        }
    };

}} // namespace lib::stat
#endif /*LIB_STAT_DATA_H*/