- remove obsolete configuration settings - walk through all settings according to the documentation https://www.doxygen.nl/manual/config.html - now try to use the new feature to rely on Clang for C++ parsing - walk through the doxygen-warnings.txt and fix some obvious misspellings and structural problems in the documentation comments. With Debian-Trixie, we are now using Doxygen 1.9.8 — which produces massively better results in various fine points. However, there are still problems with automatic cross links, especially from implementation to the corresponding test classes.
373 lines
11 KiB
C++
373 lines
11 KiB
C++
/*
|
||
STATISTIC.hpp - helpers for generic statistics calculations
|
||
|
||
Copyright (C)
|
||
2022, Hermann Vosseler <Ichthyostega@web.de>
|
||
|
||
**Lumiera** is free software; you can redistribute it and/or modify it
|
||
under the terms of the GNU General Public License as published by the
|
||
Free Software Foundation; either version 2 of the License, or (at your
|
||
option) any later version. See the file COPYING for further details.
|
||
|
||
*/
|
||
|
||
|
||
/** @file statistic.hpp
|
||
** Support for generic statistics calculations.
|
||
** - average over the N last elements in a data sequence
|
||
** - simple linear regression with weights (single predictor variable)
|
||
** - also over a time series with zero-based indices
|
||
**
|
||
*/
|
||
|
||
|
||
|
||
#ifndef LIB_STAT_STATISTIC_H
|
||
#define LIB_STAT_STATISTIC_H
|
||
|
||
|
||
#include "lib/error.hpp"
|
||
#include "lib/nocopy.hpp"
|
||
#include "lib/iter-adapter.hpp"
|
||
#include "lib/format-string.hpp"
|
||
#include "lib/util.hpp"
|
||
|
||
#include <utility>
|
||
#include <vector>
|
||
#include <array>
|
||
#include <tuple>
|
||
#include <cmath>
|
||
|
||
namespace lib {
|
||
namespace stat{
|
||
|
||
namespace error = lumiera::error;
|
||
|
||
using std::fabs;
|
||
using std::array;
|
||
using std::tuple;
|
||
using std::make_tuple;
|
||
using std::forward;
|
||
using std::move;
|
||
using util::min;
|
||
using util::max;
|
||
using util::isnil;
|
||
using util::_Fmt;
|
||
|
||
using VecD = std::vector<double>;
|
||
|
||
|
||
|
||
/** helper to unpack a std::tuple into a homogeneous std::array
 * @remark relies on class template argument deduction for std::array,
 *         so all tuple elements must share one common value type
 */
template<typename TUP>
constexpr auto
array_from_tuple (TUP&& tup)
{
  auto buildArray = [](auto&& ...elms)
                      {
                        return std::array{std::forward<decltype(elms)> (elms)...};
                      };
  return std::apply (buildArray, std::forward<TUP> (tup));
}
|
||
|
||
/** round a double to a fixed number of decimal places
 * @tparam places number of decimal digits to retain
 * @remark the scale factor 10^places is computed at compile time;
 *         the previous `constexpr double shift{pow(10.0, places)}` relied on
 *         the GCC extension of treating std::pow as constexpr and is not
 *         portable C++17 — replaced by a constexpr multiplication loop.
 */
template<size_t places>
inline double
round (double val)
{
  constexpr double shift = []{
                              double s{1.0};
                              for (size_t i=0; i<places; ++i)
                                s *= 10.0;
                              return s;
                            }();
  return std::round (val*shift) / shift;
}
|
||
|
||
|
||
|
||
|
||
/**
|
||
* Read-only view into a segment within a sequence of data
|
||
* @tparam D value type of the data series
|
||
* @remark simplistic workaround since we don't support C++20 yet
|
||
* @todo replace by const std::span
|
||
*/
|
||
template<typename D>
|
||
class DataSpan
|
||
: util::Cloneable
|
||
{
|
||
const D* const b_{nullptr};
|
||
const D* const e_{nullptr};
|
||
|
||
public:
|
||
DataSpan() = default;
|
||
DataSpan (D const& begin, D const& end)
|
||
: b_{&begin}
|
||
, e_{&end}
|
||
{
|
||
if (e_ < b_)
|
||
throw error::Invalid{"End point before begin."};
|
||
}
|
||
|
||
template<class CON>
|
||
DataSpan (CON const& container)
|
||
: DataSpan{*std::begin(container), *std::end(container)}
|
||
{ }
|
||
|
||
|
||
using iterator = const D*;
|
||
using const_iterator = iterator;
|
||
|
||
size_t size() const { return e_ - b_; }
|
||
bool empty() const { return b_ == e_;}
|
||
|
||
iterator begin() const { return b_; }
|
||
iterator end() const { return e_; }
|
||
friend const_iterator begin (DataSpan const& span){ return span.begin();}
|
||
friend const_iterator end (DataSpan const& span){ return span.end(); }
|
||
|
||
D const& operator[](size_t i) const { return *(b_ + i); }
|
||
D const& at(size_t i) const
|
||
{
|
||
if (i >= size())
|
||
throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
|
||
% i % size()};
|
||
return this->operator[](i);
|
||
}
|
||
};
|
||
|
||
/** deduction guide: derive content from container. */
|
||
template<class CON>
|
||
DataSpan (CON const& container) -> DataSpan<typename lib::meta::ValueTypeBinding<CON>::value_type>;
|
||
|
||
|
||
|
||
|
||
|
||
/** summation of variances, for error propagation: √Σe² */
template<typename... NUMS>
inline double
errorSum (NUMS ...vals)
{
  double sumOfSquares = ((vals*vals) + ... + 0.0);
  return std::sqrt (sumOfSquares);
}
|
||
|
||
|
||
|
||
template<typename D>
|
||
inline double
|
||
average (DataSpan<D> const& data)
|
||
{
|
||
if (isnil(data)) return 0.0;
|
||
double sum = 0.0;
|
||
for (auto val : data)
|
||
sum += val;
|
||
return sum / data.size();
|
||
}
|
||
|
||
template<typename D>
|
||
inline double
|
||
sdev (DataSpan<D> const& data, D mean)
|
||
{
|
||
if (isnil(data)) return 0.0;
|
||
double sdev = 0.0;
|
||
for (auto val : data)
|
||
{
|
||
D offset = val - mean;
|
||
sdev += offset*offset;
|
||
}
|
||
size_t n = data.size();
|
||
sdev /= n<2? 1: n-1;
|
||
return sqrt (sdev);
|
||
}
|
||
|
||
inline double
|
||
sdev (VecD const& data, double mean)
|
||
{
|
||
return sdev(DataSpan<double>{data}, mean);
|
||
}
|
||
|
||
|
||
|
||
/** view covering the last n elements of the given data sequence
 * @remark n is capped to the number of elements actually available;
 *         a request on empty data (or n==0) yields an empty default span,
 *         where previously `data[oldest]` indexed one-past-the-end of an
 *         empty vector — undefined behaviour.
 * @note the end reference `*data.end()` relies on forming (not reading) an
 *       lvalue at the past-the-end position of a contiguous vector; this is
 *       tolerated by mainstream compilers — a pointer-based DataSpan ctor
 *       would be the clean solution (see the std::span @todo there).
 */
inline DataSpan<double>
lastN (VecD const& data, size_t n)
{
  n = min (n, data.size());
  if (n == 0)
    return DataSpan<double>{};
  size_t oldest = data.size() - n;
  return DataSpan<double>{data[oldest], *data.end()};
}
|
||
|
||
inline double
|
||
averageLastN (VecD const& data, size_t n)
|
||
{
|
||
return average (lastN (data,n));
|
||
}
|
||
|
||
inline double
|
||
sdevLastN (VecD const& data, size_t n, double mean)
|
||
{
|
||
return sdev (lastN (data,n), mean);
|
||
}
|
||
|
||
|
||
/** "building blocks" for mean, variance and covariance of time series data */
|
||
template<typename D>
|
||
inline auto
|
||
computeStatSums (DataSpan<D> const& series)
|
||
{
|
||
double ysum = 0.0;
|
||
double yysum = 0.0;
|
||
double xysum = 0.0;
|
||
size_t x = 0;
|
||
for (auto& y : series)
|
||
{
|
||
ysum += y;
|
||
yysum += y*y;
|
||
xysum += x*y;
|
||
++x;
|
||
}
|
||
return make_tuple (ysum,yysum, xysum);
|
||
}
|
||
|
||
|
||
/**
 * Single data point used for linear regression.
 * Simple case: single predictor variable (x).
 * @remark including a weight factor
 */
struct RegressionPoint
  {
    double x;   // predictor (independent) variable
    double y;   // observed response value
    double w;   // weight of this measurement point
    
    RegressionPoint (double vx, double vy, double vw =1.0)
      : x{vx}, y{vy}, w{vw}
      { }
  };
|
||
|
||
using RegressionData = std::vector<RegressionPoint>;
|
||
|
||
|
||
/** "building blocks" for weighted mean, weighted variance and covariance */
|
||
inline auto
|
||
computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
|
||
{
|
||
std::array<double,6> sums;
|
||
sums.fill(0.0);
|
||
auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
|
||
for (auto& p : points)
|
||
{
|
||
wsum += p.w;
|
||
wxsum += p.w * p.x;
|
||
wysum += p.w * p.y;
|
||
wxxsum += p.w * p.x*p.x;
|
||
wyysum += p.w * p.y*p.y;
|
||
wxysum += p.w * p.x*p.y;
|
||
}
|
||
return sums;
|
||
}
|
||
|
||
/**
|
||
* Compute simple linear regression with a single predictor variable (x).
|
||
* @param points 2D data to fit the linear model with, including weights.
|
||
* @return the computed linear model `b + a·x`, and the resulting fit
|
||
* - socket (constant offset `b`)
|
||
* - gradient (linear factor `a`)
|
||
* - a vector with a predicted `y` value for each `x` value
|
||
* - a vector with the error, i.e `Δ = y - y_predicted`
|
||
* - correlation between x and y values
|
||
* - maximum absolute delta
|
||
* - delta standard deviation
|
||
*/
|
||
inline auto
|
||
computeLinearRegression (DataSpan<RegressionPoint> const& points)
|
||
{
|
||
auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
|
||
|
||
double xm = wxsum / wsum; // weighted mean x = 1/Σw · Σwx
|
||
double ym = wysum / wsum;
|
||
double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum; // Σw · x-Variance = Σw(x-xm)²
|
||
double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
|
||
double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
|
||
|
||
// Linear Regression minimising σ²
|
||
double gradient = cova / varx; // gradient = correlation · σy / σx ; σ = √Variance
|
||
double socket = ym - gradient * xm; // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
|
||
|
||
// Correlation (Pearson's r)
|
||
double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
|
||
|
||
// calculate error Δ for all measurement points
|
||
size_t n = points.size();
|
||
VecD predicted; predicted.reserve(n);
|
||
VecD deltas; deltas.reserve(n);
|
||
double maxDelta = 0.0;
|
||
double variance = 0.0;
|
||
for (auto& p : points)
|
||
{
|
||
double y_pred = socket + gradient * p.x;
|
||
double delta = p.y - y_pred;
|
||
predicted.push_back (y_pred);
|
||
deltas.push_back (delta);
|
||
maxDelta = max (maxDelta, fabs(delta));
|
||
variance += p.w * delta*delta;
|
||
}
|
||
variance /= wsum * (n<=2? 1 : (n-2)/double(n)); // N-2 because it's an estimation,
|
||
// based on 2 other estimated values (socket,gradient)
|
||
return make_tuple (socket,gradient
|
||
,move(predicted)
|
||
,move(deltas)
|
||
,correlation
|
||
,maxDelta
|
||
,sqrt(variance)
|
||
);
|
||
}
|
||
|
||
inline auto
|
||
computeLinearRegression (RegressionData const& points)
|
||
{
|
||
return computeLinearRegression (DataSpan<RegressionPoint>{points});
|
||
}
|
||
|
||
|
||
|
||
/**
|
||
* Compute linear regression over a time series with zero-based indices.
|
||
* @remark using the indices as x-values, the calculations for a regression line
|
||
* can be simplified, using the known closed formula for a sum of integers,
|
||
* shifting the indices to 0…n-1 (leaving out the 0 and 0² term)
|
||
* - `1+…+n = n·(n+1)/2`
|
||
* - `1+…+n² = n·(n+1)·(2n+1)/6`
|
||
* @return `(socket,gradient)` to describe the regression line y = socket + gradient · i
|
||
*/
|
||
template<typename D>
|
||
inline auto
|
||
computeTimeSeriesLinearRegression (DataSpan<D> const& series)
|
||
{
|
||
if (series.size() < 2) return make_tuple(0.0,0.0,0.0);
|
||
|
||
auto [ysum,yysum, xysum] = computeStatSums(series);
|
||
|
||
size_t n = series.size();
|
||
double im = (n-1)/2.0; // mean of zero-based indices i ∈ {0 … n-1}
|
||
double ym = ysum / n; // mean y
|
||
double varx = (n-1)*(n+1)/12.0; // variance of zero-based indices Σ(i-im)² / n
|
||
double vary = yysum/n - ym*ym; // variance of data values Σ(y-ym)² / n
|
||
double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy
|
||
|
||
// Linear Regression minimising σ²
|
||
double gradient = cova / (n*varx); // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
|
||
double socket = ym - gradient * im; // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
|
||
|
||
// Correlation (Pearson's r)
|
||
double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
|
||
return make_tuple (socket,gradient,correlation);
|
||
}
|
||
|
||
inline auto
|
||
computeTimeSeriesLinearRegression (VecD const& series)
|
||
{
|
||
return computeTimeSeriesLinearRegression (DataSpan<double>{series});
|
||
}
|
||
|
||
}} // namespace lib::stat
|
||
#endif /*LIB_STAT_STATISTIC_H*/
|