LUMIERA.clone/src/lib/stat/csv.hpp
Ichthyostega 0e88dec28a Library: integrate into the Lumiera code base
- reformat in Lumieara-GNU style
- use the Lumiera exceptions
- use Lumiera format-string frontend
- use lib/util

NOTE: I am the original author of the code introduced here,
and thus I can re-license it under GPL 2+
2024-03-11 17:38:30 +01:00

209 lines
6.1 KiB
C++

/*
CSV.hpp - Parser and Encoder for CSV data
Copyright (C) Lumiera.org
2022, Hermann Vosseler <Ichthyostega@web.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/** @file csv.hpp
** Encoding and decoding of data into CSV format.
** The sequence of values transformed here is part of a data table, with columns
** holding data of various primitive value types; persisted CSV data is human readable,
** can be checked into Git and loaded into various spreadsheet and statistics applications.
**
** # CSV Format
** Even while there is a standard defined in [RFC 4180], a plethora of format variations
** can be found _in the wild._ Since the primary purpose of this implementation is _to read
** back our own data,_ by deliberate choice only one single form of CSV is accepted.
** - first line is a header line and used to verify the number of columns
** - one record per line, embedded line breaks prohibited
** - fields separated by comma, semicolon tolerated
** - fields are trimmed and may be empty
** - a field may be double quoted
** - only quoted fields may contain whitespace or comma
** - no escaping of quotes, i.e. no quotes within quotes
** [RFC 4180]: https://datatracker.ietf.org/doc/html/rfc4180
** @see lib::stat::DataFile
**
*/
#ifndef LIB_STAT_CSV_H
#define LIB_STAT_CSV_H
#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/format-string.hpp"
#include "lib/format-obj.hpp"
#include "lib/stat/regex.hpp"
#include <limits>
#include <string>
namespace lib {
namespace stat {
namespace error = lumiera::error;
using util::_Fmt;
using util::toString;
using std::string;
using std::regex;
namespace { // Implementation details...
const string MATCH_SINGLE_TOKEN { R"~(([^,;"\s]*)\s*)~"};
const string MATCH_QUOTED_TOKEN { R"~("([^"]*)"\s*)~" };
const string MATCH_DELIMITER { R"~((?:^|,|;)\s*)~" };
const regex ACCEPT_FIELD{ MATCH_DELIMITER + "(?:"+ MATCH_QUOTED_TOKEN +"|"+ MATCH_SINGLE_TOKEN +")"
, regex::optimize};
template<typename VAL>
inline string
format4Csv (VAL const& val)
{
std::ostringstream oss;
oss.precision (std::numeric_limits<VAL>::digits10); /////////////////////////////OOO herausfinden ob hier lexical_cast genügt ==> dann toString()
oss << val;
return oss.str();
}
inline string
format4Csv (string const& val)
{
return '"'+val+'"';
}
inline string
format4Csv (bool boo)
{
return util::showBool(boo); ///////////////////////OOO würde toSting() das korrekt hinbekommen
}
}//(End)Implementation
/**
* Format and append a data value to a CSV string representation
*/
template<typename VAL>
inline void
appendCsvField (string& csv, VAL const& val)
{
csv += (0 == csv.length()? "":",")
+ format4Csv(val);
}
/**
* Parser to split one line of CSV data into fields.
* @remarks iterator-like throw-away object
* - the `bool` evaluation indicates more fields to extract
* - dereference to get the field as string
* - increment to move to the next field
* @throws error::Invalid on CSV format violation
* @todo 3/24 should be rewritten as Lumiera Forward Iterator
*/
class CsvLine
: util::NonCopyable
, util::MatchSeq
{
string const& line_;
size_t field_;
iterator curr_;
size_t pos_;
public:
CsvLine (string const& line)
: MatchSeq(line, ACCEPT_FIELD)
, line_{line}
, field_{0}
, curr_{MatchSeq::begin()}
, pos_{0}
{ }
explicit operator bool() const
{
return isValid ();
}
string operator*() const
{
if (not isValid ()) fail();
auto& mat = *curr_;
return mat[2].matched? mat[2]
: mat[1];
}
void
operator++()
{
if (not isValid())
fail();
pos_ = curr_->position() + curr_->length();
++curr_;
if (pos_ < line_.length() and not isValid())
fail ();
++field_;
}
size_t
getParsedFieldCnt()
{
return field_;
}
bool
isValid() const
{
return curr_ != end()
and pos_ == size_t(curr_->position())
and not curr_->empty();
}
bool
isParseFail() const
{
return curr_ != end()
and not isValid();
}
void
fail() const
{
if (curr_ == end())
if (pos_ >= line_.length())
throw error::Invalid{_Fmt{"Only %d data fields. Line:%s"}
% field_ % line_};
else
throw error::Invalid{_Fmt{"Garbage after last field. Line:%s|↯|%s"}
% line_.substr(0,pos_) % line_.substr(pos_)};
else
if (pos_ != curr_->position())
throw error::Invalid{_Fmt{"Garbage before field(%d):%s|↯|%s"}
% (field_+1)
% line_.substr(0,pos_) % line_.substr(pos_)};
else
throw error::Invalid{"CSV parse floundered. Line:"+toString(line_)};
}
};
}} // namespace lib::stat
#endif /*LIB_STAT_CSV_H*/