/* PARSE.hpp - helpers for parsing textual specifications Copyright (C) 2024, Hermann Vosseler   **Lumiera** is free software; you can redistribute it and/or modify it   under the terms of the GNU General Public License as published by the   Free Software Foundation; either version 2 of the License, or (at your   option) any later version. See the file COPYING for further details. */ /** @file parse.hpp ** Convenience wrappers and definitions for parsing structured definitions. ** Whenever a specification syntax entails nested structures, extracting contents ** with regular expressions alone becomes tricky. Without much sophistication, a ** directly implemented simple recursive descent parser is often less brittle and ** easier to understand and maintain. With some helper abbreviations, notably ** a combinator scheme to work from building blocks, a hand-written solution ** can benefit from taking short-cuts, especially related to result bindings. ** ** So what is provided here is _not a parser library_ — yet aims at »making ** simple things simple« and let you implement the complicated ones yourselves. ** Several decisions were taken accordingly, like only supporting std::string_view ** and automatically consuming any leading whitespace. And notably the focus was ** _not placed_ on the challenging aspects of parsing — while still allowing a ** pathway towards definition of arbitrarily recursive grammars, if so desired. */ #ifndef LIB_PARSE_H #define LIB_PARSE_H #include "lib/error.hpp" #include "lib/branch-case.hpp" #include "lib/format-string.hpp" #include "lib/meta/variadic-rebind.hpp" #include "lib/meta/function.hpp" #include "lib/meta/trait.hpp" #include "lib/regex.hpp" #include #include #include #include #include namespace util { namespace parse { namespace err = lumiera::error; using std::move; using std::forward; using std::optional; using lib::meta::_Fun; using lib::meta::has_Sig; using lib::meta::NullType; using lib::meta::_Vari; using std::decay_t; using std::tuple; using std::array; using util::_Fmt; using StrView = std::string_view; template class Syntax; /** * Parse evaluation result * @tparam RES model type to bind */ template struct Eval { using Result = RES; optional result; size_t consumed{0}; }; /** * Building block: parser function * definition and connection element. */ template struct Connex : util::NonAssign { using PFun = FUN; PFun parse; using Result = typename _Fun::Ret::Result; Connex (FUN&& pFun) : parse{move(pFun)} { } }; /** »Null-Connex« which always successfully accepts the empty sequence */ auto buildConnex(NullType) { return Connex{[](StrView) -> Eval { return {NullType{}}; }}; } using NulP = decltype(buildConnex (NullType())); /** * Foundation: build a \ref Connex to accept a _terminal symbol._ * the actual parsing is delegated to a Regular Expression, * which must match against the _beginning_ of the input sequence, * possibly after skipping some whitespace. The defined parser * returns an \ref Eval context, to hold a _Result Model_ and * the number of characters matched by this terminal symbol. */ auto buildConnex (regex rex) { return Connex{[regEx = move(rex)] (StrView toParse) -> Eval { // skip leading whitespace... size_t pre = leadingWhitespace (toParse); toParse = toParse.substr(pre); auto result{matchAtStart (toParse,regEx)}; size_t consumed = result? pre+result->length() : 0; return {move(result), consumed}; }}; } using Term = decltype(buildConnex (std::declval())); /** build from a string with Regular-Epression spec */ Term buildConnex (string const& rexDef) { return buildConnex (regex{rexDef}); } /** copy-builder from an existing parser function */ template auto buildConnex (Connex const& anchor) { return Connex{anchor}; } template auto buildConnex (Connex && anchor) { return Connex{move(anchor)}; } template auto buildConnex (Syntax const& anchor) { using Con = typename Syntax::Connex; return Con{anchor}; } namespace { /** helper to detect return type of a possibly generic λ */ template struct _ProbeFunReturn { static_assert (!sizeof(ARG), "Model binding must accept preceding model result."); }; template struct _ProbeFunReturn() (std::declval()))>> { // probe the λ with ARG to force template instantiation using Ret = decltype(std::declval() (std::declval())); }; } /** * Adapt by applying a result-transforming function after a successful parse. * @remark the purpose is to extract a custom data model immediately from the * result; binding functors can be applied at any level of a Syntax, * and thus the parse can be configured to produce custom result data. */ template auto adaptConnex (CON&& connex, BIND&& modelBinding) { using RX = typename CON::Result; using Arg = std::add_rvalue_reference_t; using AdaptedRes = typename _ProbeFunReturn::Ret; return Connex{[origConnex = forward(connex) ,binding = forward(modelBinding) ] (StrView toParse) -> Eval { auto eval = origConnex.parse (toParse); if (eval.result) return {binding (move (*eval.result))}; else return {std::nullopt}; }}; } /* ===== building structured models ===== */ /** * Product Model : results from a conjunction of parsing clauses, * which are to be accepted in sequence, one after the other. */ template struct SeqModel : tuple { static constexpr size_t N = sizeof...(RESULTS); using Seq = lib::meta::TySeq; using Tup = std::tuple; SeqModel() = default; template SeqModel (SeqModel&& seq, XX&& extraElm) : Tup{std::tuple_cat (seq.extractTuple() ,make_tuple (forward (extraElm)) )} { } template SeqModel (X1&& res1, X2&& res2) : Tup{move(res1), move(res2)} { } Tup&& extractTuple() { return move(*this); } template auto get() { return std::get (*this); } }; /** * Sum Model : results from a disjunction of parsing clauses, * which are are tested and accepted as alternatives, one at least. */ template struct AltModel : lib::BranchCase { using Alt = lib::BranchCase; static constexpr size_t N = Alt::TOP; template using Additionally = AltModel; template Additionally addBranch() ///< mark-up existing model to add a further branch-case { Additionally& upFaked = reinterpret_cast&> (*this); return {move (upFaked)}; } // this trick works due to similar storage layout /* === Builder functions to mark which side of the combinator to pick === */ using SubSeq = typename _Vari::Prefix; ///< a nested sub-model to extend using Penult = typename _Vari::Penult; ///< plain value expected for left-branch using Ultima = typename _Vari::Ultima; ///< plain value expected for right-branch static AltModel mark_left (SubSeq&& leftCases) { return {leftCases.template addBranch()}; } static AltModel mark_left (Penult&& leftCase) { return {Alt::TOP-1, move(leftCase)}; } static AltModel mark_right (Ultima&& rightCase) { return {Alt::TOP, move(rightCase)}; } private: template AltModel (size_t branchID, INIT&& init) : Alt{branchID, forward (init)} { } }; /** Special case Product Model to represent iterative sequence */ template struct IterModel : std::vector { RES& get (size_t i) { return this->at(i); } }; /** Marker-Tag for the result from a sub-expression, not to be joined */ template struct SubModel { RES model; }; /** Standard case : combinator of two model branches */ template class TAG, class R1, class R2 =void> struct _Join { using Result = TAG; }; /** Generic case : extend a structured model by further branch */ template class TAG, class...RS, class R2> struct _Join,R2> { using Result = TAG; }; /** Special Case : absorb sub-expression without flattening */ template class TAG, class R1, class R2> struct _Join,R2> { using Result = TAG; }; template class TAG, class R1, class R2> struct _Join> { using Result = TAG; }; template class TAG, class R1, class R2> struct _Join,SubModel> { using Result = TAG; }; /** accept sequence of two parse functions */ template auto sequenceConnex (C1&& connex1, C2&& connex2) { using R1 = typename decay_t::Result; using R2 = typename decay_t::Result; using ProductResult = typename _Join::Result; using ProductEval = Eval; return Connex{[conL = forward(connex1) ,conR = forward(connex2) ] (StrView toParse) -> ProductEval { auto eval1 = conL.parse (toParse); if (eval1.result) { size_t posAfter1 = eval1.consumed; StrView restInput = toParse.substr(posAfter1); auto eval2 = conR.parse (restInput); if (eval2.result) { uint consumedOverall = posAfter1 + eval2.consumed; return ProductEval{ProductResult{move(*eval1.result) ,move(*eval2.result)} ,consumedOverall }; } } return ProductEval{std::nullopt}; }}; } /** accept either one of two alternative parse functions */ template auto branchedConnex (C1&& connex1, C2&& connex2) { using R1 = typename decay_t::Result; using R2 = typename decay_t::Result; using SumResult = typename _Join::Result; using SumEval = Eval; return Connex{[conL = forward(connex1) ,conR = forward(connex2) ] (StrView toParse) -> SumEval { auto eval1 = conL.parse (toParse); if (eval1.result) { uint endBranch1 = eval1.consumed; return SumEval{SumResult::mark_left (move(*eval1.result)) ,endBranch1 }; } auto eval2 = conR.parse (toParse); if (eval2.result) { uint endBranch2 = eval2.consumed; return SumEval{SumResult::mark_right (move(*eval2.result)) ,endBranch2 }; } return SumEval{std::nullopt}; }}; } /** repeatedly accept parse-function, optionally delimited. */ template auto repeatedConnex (uint min, uint max ,C1&& delimConnex ,C2&& bodyConnex) { using Res = typename decay_t::Result; using IterResult = IterModel; using IterEval = Eval; return Connex{[sep = forward(delimConnex) ,body = forward(bodyConnex) ,min,max ] (StrView toParse) -> IterEval { size_t consumed{0}; IterResult results; do { uint offset{0}; if (not results.empty()) { // look for delimiter within sequence auto delim = sep.parse (toParse); if (not delim.result) break; offset += delim.consumed; } auto eval = body.parse (toParse.substr(offset)); if (not eval.result) break; offset += eval.consumed; results.emplace_back (move(*eval.result)); toParse = toParse.substr(offset); consumed += offset; } while (results.size() < max); return results.size() >= min? IterEval{move(results), consumed} : IterEval{std::nullopt}; }}; } /** try to accept parse-function, backtracking if not successful. */ template auto optionalConnex (CNX&& connex) { using Res = typename decay_t::Result; using OptResult = optional; using OptEval = Eval; return Connex{[body = forward(connex) ] (StrView toParse) -> OptEval { auto eval = body.parse (toParse); size_t consumed{eval.result? eval.consumed : 0}; return OptEval{OptResult{eval.result? move(eval.result) : std::nullopt} ,consumed }; }}; } /** accept some structure enclosed into a bracketing construct. * \param isOptional if the bracketing can be omitted */ template auto bracketedConnex (C1&& openingConnex ,C2&& closingConnex ,C3&& bodyConnex ,bool isOptional) { using Res = typename decay_t::Result; return Connex{[opening = forward(openingConnex) ,closing = forward(closingConnex) ,body = forward(bodyConnex) ,isOptional ] (StrView toParse) -> Eval { auto bracket = opening.parse (toParse); if (bracket.result or isOptional) { size_t consumed = bracket.consumed; bool expectClose{bracket.result}; auto eval = body.parse (toParse.substr(consumed)); if (eval.result) { consumed += eval.consumed; if (expectClose) bracket = closing.parse (toParse.substr(consumed)); if (bracket.result or not expectClose) { consumed += bracket.consumed; return Eval{move (*eval.result) ,consumed }; } } } return Eval{std::nullopt}; }}; } /** * A Parser function to match and accept some syntax. * This is a typing- and interface-adapter, wrapping a Connex. */ template class Parser : public CON { using PFun = typename CON::PFun; static_assert (_Fun(), "Connex must define a parse-function"); public: using Connex = CON; using Result = typename CON::Result; static_assert (has_Sig(StrView)>() ,"Signature of the parse-function not suitable"); /** * Parse-Function operator: test input and yield Eval record */ Eval operator() (StrView toParse) { return CON::parse (toParse); } template Parser (SPEC&& spec) : CON{buildConnex (forward (spec))} { } }; /* === Deduction guide : how to construct a Parser === */ Parser(NullType) -> Parser; Parser(regex &&) -> Parser; Parser(regex const&) -> Parser; Parser(string const&) -> Parser; template Parser(Connex const&) -> Parser>; template Parser(Syntax const&) -> Parser; /** @internal meta-helper : detect if parser can be built from a given type */ template struct is_usableSpec : std::false_type{ }; template struct is_usableSpec()})>> : std::true_type { }; template using if_acceptableSpec = lib::meta::enable_if>; template using if_acceptableSpecs = lib::meta::enable_if ,lib::meta::enable_if>>; /***********************************************************************//** * A Syntax clause with a parser and result state. * An instance of this class embodies a (possibly complex) * _expected syntactical structure;_ the [parse function](\ref parse) * analyses a given input text for compliance with this expected structure. * After the parse, result state has been set * - indicating if the parse was successful * - possibly with an failure message (TODO 1/25) * - the number of characters covered by this match * - a _Result Model,_ as a structured term holding * result components from each part / sub-clause */ template class Syntax : public Eval { PAR parse_; public: using Connex = typename PAR::Connex; using Result = typename PAR::Result; Syntax() : parse_{NullType()} { } explicit Syntax (PAR&& parser) : parse_{move (parser)} { } explicit operator bool() const { return success();} operator Connex&() { return parse_; } operator Connex const&() const { return parse_; } bool success() const { return bool(Syntax::result); } bool hasResult() const { return bool(Syntax::result); } size_t consumed() const { return Eval::consumed;} Result& getResult() { return * Syntax::result; } Result&& extractResult() { return move(getResult()); } /********************************************//** * Core API : parse against this syntax clause */ Syntax&& parse (StrView toParse) { eval() = parse_(toParse); return move(*this); } /** ===== Syntax clause builder DSL ===== */ template auto seq (SPEC&& clauseDef); template auto alt (SPEC&& clauseDef); template auto opt (SPEC&& clauseDef); template auto repeat (uint min, uint max, SPEC1&& delimDef, SPEC2&& clauseDef); template auto repeat (uint cnt, SPEC1&& delimDef, SPEC2&& clauseDef); template auto repeat (SPEC1&& delimDef, SPEC2&& clauseDef); template auto repeat (SPEC&& clauseDef); template auto bracket (SPEC1&& openDef, SPEC2&& closeDef, SPEC3&& bodyDef); template auto bracket (string bracketSpec, SPEC&& bodyDef); template auto bracket (SPEC&& bodyDef); template auto bracketOpt (string bracketSpec, SPEC&& bodyDef); template auto bracketOpt (SPEC&& bodyDef); template auto bind (FUN&& modelAdapt); auto bindMatch (uint group =0); private: Eval& eval() { return *this;} }; /** ===== Syntax clause builder DSL ===== */ /** build a Syntax clause from anything usable as parser-spec. */ template auto accept (SPEC&& clauseDef) { return Syntax{Parser{forward (clauseDef)}}; } /** empty Syntax clause to start further definition */ auto accept() { return Syntax>{}; } /** start Syntax clause with an optional syntax part */ template auto accept_opt (SPEC&& clauseDef) { return accept( optionalConnex (Parser{forward (clauseDef)})); } /** * Start Syntax clause with a repeated sub-clause, * with separator and repetition limit; repetitions ∊ [min..max] * The separator will be expected _between_ instances of the repeated sub-clause * and will by itself produce no model. The result model is an instance of \ref IterModel, * which implies it is a vector (uses heap storage); if min ≡ 0, the model can be empty. */ template auto accept_repeated (uint min, uint max, SPEC1&& delimDef, SPEC2&& clauseDef) { if (max max:%d"} % min % max }; if (max == 0) throw err::Invalid{"Invalid repeat with max ≡ 0 repetitions"}; return accept( repeatedConnex (min,max ,Parser{forward (delimDef)} ,Parser{forward (clauseDef)})); } /** \param cnt exact number of repetitions expected */ template> auto accept_repeated (uint cnt, SPEC1&& delimDef, SPEC2&& clauseDef) { return accept_repeated (cnt,cnt, forward(delimDef), forward(clauseDef)); } /** start Syntax with an arbitrarily repeated sub-clause, with separator */ template> auto accept_repeated (SPEC1&& delimDef, SPEC2&& clauseDef) { return accept_repeated (1,uint(-1), forward(delimDef), forward(clauseDef)); } template auto accept_repeated (uint min, uint max, SPEC&& clauseDef) { return accept_repeated (min, max, NullType{}, forward(clauseDef)); } template auto accept_repeated (uint cnt, SPEC&& clauseDef) { return accept_repeated (cnt, NullType{}, forward(clauseDef)); } template auto accept_repeated (SPEC&& clauseDef) { return accept_repeated (NullType{}, forward(clauseDef)); } /** * Start Syntax with a sub-clause enclosed into a _bracketing construct._ * The »bracket« is defined as syntax for the _open marker_ and _close marker._ * These are consumed without generating model elements. The parse fails unless * the full sequence `open body close` can be matched. */ template auto accept_bracket (SPEC1&& openDef, SPEC2&& closeDef, SPEC3&& bodyDef) { return accept( bracketedConnex (Parser{forward(openDef) } ,Parser{forward(closeDef)} ,Parser{forward(bodyDef) } ,false // bracket mandatory, not optional )); } /** * Start Syntax with a bracketed sub-clause, with given single-char delimiters. * \param bracketSpec a 2-char string, e.g. "{}" to expect curly braces. */ template auto accept_bracket (string bracketSpec, SPEC&& bodyDef) { if (bracketSpec.size() != 2) throw err::Invalid{"Bracket spec with opening and closing character expected"}; return accept( bracketedConnex (Parser{"\\"+bracketSpec.substr(0,1)} ,Parser{"\\"+bracketSpec.substr(1,1)} ,Parser{forward(bodyDef) } ,false // bracket mandatory, not optional )); } /** Start Syntax with a sub-clause enclosed in parentheses */ template auto accept_bracket (SPEC&& bodyDef) { return accept_bracket ("()", forward(bodyDef)); } /** Start Syntax with a sub-clause, _optionally_ enclosed into brackets. */ template auto accept_bracketOpt (string bracketSpec, SPEC&& bodyDef) { if (bracketSpec.size() != 2) throw err::Invalid{"Bracket spec with opening and closing character expected"}; return accept( bracketedConnex (Parser{"\\"+bracketSpec.substr(0,1)} ,Parser{"\\"+bracketSpec.substr(1,1)} ,Parser{forward(bodyDef) } ,true // bracket optional, can be omitted )); } template auto accept_bracketOpt (SPEC&& bodyDef) { return accept_bracketOpt ("()", forward(bodyDef)); } /** * Combinator: extend this Syntax clause by expecting a further sub-clause * behind the part of the input matched by the already defined part of this Syntax. * The result model will be a \SeqModel, which essentially is a tuple of the * result models of all sequenced parts. * @return Syntax clause instance accepting the extended structure. * @warning the old syntax is invalidated by moving the parse-function out. */ template template auto Syntax::seq (SPEC&& clauseDef) { return accept( sequenceConnex (move(parse_) ,Parser{forward (clauseDef)})); } /** * Combinator: extend this Syntax by adding an _alternative branch_. * So either the already defined part of this Syntax matches the input, * or the alternative clause is probed from the start of the input. At least * one branch must match for the parse to be successful; however, further * branches are not tested after finding a matching branch (short-circuit). * The result model is a _Sum Type,_ implemented as a custom variant record * of type \ref SubModel. It provides a branch selector field to detect which * branch of the syntax did match. And it allows to retrieve the result model * of this successful branch — which however requires that the invoking code * precisely knows the model type to expect. */ template template auto Syntax::alt (SPEC&& clauseDef) { return accept( branchedConnex (move(parse_) ,Parser{forward (clauseDef)})); } /** * Combinator: extend this Syntax with a further sequenced sub-clause, * which however is _only optional_ and the match succeed without it. * The result model is (as always for \ref seq ) a tuple; the result * from the optional part is packaged into a std::optional. */ template template auto Syntax::opt (SPEC&& clauseDef) { return seq (accept_opt (forward (clauseDef))); } /** * Combinator: extend this Syntax with a further sequenced sub-clause, * which in this case accepts a repeated sequence, with delimiter. * @see accept_sequenced() */ template template auto Syntax::repeat (uint min, uint max, SPEC1&& delimDef, SPEC2&& clauseDef) { return seq (accept_repeated (min,max ,forward(clauseDef) ,forward(clauseDef))); } template template auto Syntax::repeat (uint cnt, SPEC1&& delimDef, SPEC2&& clauseDef) { return seq (accept_repeated (cnt ,forward(clauseDef) ,forward(clauseDef))); } template template auto Syntax::repeat (SPEC1&& delimDef, SPEC2&& clauseDef) { return seq (accept_repeated (forward(clauseDef) ,forward(clauseDef))); } template template auto Syntax::repeat (SPEC&& clauseDef) { return seq (accept_repeated (forward(clauseDef))); } /** * Combinator: extend this Syntax with a further sequenced sub-clause in brackets. * @see accept_bracket() */ template template auto Syntax::bracket (SPEC1&& openDef, SPEC2&& closeDef, SPEC3&& bodyDef) { return seq (accept_bracket (forward(openDef) ,forward(closeDef) ,forward(bodyDef))); } template template auto Syntax::bracket (string bracketSpec, SPEC&& bodyDef) { return seq (accept_bracket (move (bracketSpec) ,forward(bodyDef))); } template template auto Syntax::bracket (SPEC&& bodyDef) { return seq (accept_bracket (forward(bodyDef))); } template template auto Syntax::bracketOpt (string bracketSpec, SPEC&& bodyDef) { return seq (accept_bracketOpt (move (bracketSpec) ,forward(bodyDef))); } template template auto Syntax::bracketOpt (SPEC&& bodyDef) { return seq (accept_bracketOpt (forward(bodyDef))); } template template auto Syntax::bind (FUN&& modelAdapt) { return accept( adaptConnex (move(parse_) ,forward(modelAdapt))); } template auto Syntax::bindMatch (uint group) { return bind ([group](smatch const& mat) { return mat.str(group); }); } }// namespace parse using parse::accept; using parse::accept_opt; using parse::accept_repeated; }// namespace util #endif/*LIB_PARSE_H*/