LUMIERA.clone/src/lib/stat/statistic.hpp

377 lines
11 KiB
C++
Raw Normal View History

/*
STATISTIC.hpp - helpers for generic statistics calculations
Copyright (C) Lumiera.org
2022, Hermann Vosseler <Ichthyostega@web.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/** @file statistic.hpp
** Support for generic statistics calculations.
** - average over the N last elements in a data sequence
** - simple linear regression with weights (single predictor variable)
** - also over a time series with zero-based indices
**
*/
#ifndef LIB_STAT_STATISTIC_H
#define LIB_STAT_STATISTIC_H
#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/iter-adapter.hpp"
#include "lib/format-string.hpp"
#include "lib/util.hpp"
#include <utility>
#include <vector>
#include <array>
#include <tuple>
#include <cmath>
namespace lib {
namespace stat{
namespace error = lumiera::error;
using std::fabs;
using std::array;
using std::tuple;
using std::make_tuple;
using std::forward;
using std::move;
using util::min;
using util::max;
using util::isnil;
using util::_Fmt;
using VecD = std::vector<double>;
/** helper to unpack a std::tuple into a homogeneous std::array
 * @tparam TUP a std::tuple where all elements share one common type
 * @return std::array holding the tuple's elements in order
 */
template<typename TUP>
constexpr auto
array_from_tuple (TUP&& tuple)
{
  constexpr auto buildArray = [](auto&& ...elm)
                                {
                                  return std::array{std::forward<decltype(elm)> (elm)...};
                                };
  return std::apply (buildArray, std::forward<TUP> (tuple));
}
/** round a double to a fixed number of decimal places.
 * @tparam places number of decimal digits to retain
 * @remark the scale factor 10^places is computed at compile time by a
 *         constexpr loop; the previous use of `pow` in a constexpr
 *         initialiser relied on a GCC builtin extension, since
 *         std::pow is not constexpr in standard C++(17).
 */
template<size_t places>
inline double
round (double val)
{
  constexpr double shift = []{
                              double s{1.0};
                              for (size_t i=0; i<places; ++i)
                                s *= 10.0;
                              return s;
                            }();
  return std::round (val*shift) / shift;
}
/**
 * Read-only view into a segment within a sequence of data
 * @tparam D value type of the data series
 * @remark simplistic workaround since we don't support C++20 yet
 * @todo replace by const std::span
 */
template<typename D>
class DataSpan
  : util::Cloneable
  {
    const D* const b_{nullptr};
    const D* const e_{nullptr};
    
  public:
    DataSpan() = default;
    
    /** span delimited by references to first element and one-past-end element */
    DataSpan (D const& begin, D const& end)
      : b_{&begin}
      , e_{&end}
      {
        if (e_ < b_)
          throw error::Invalid{"End point before begin."};
      }
    
    /** view the full contents of a container with contiguous storage.
     * @note an empty container yields an empty span; the previous
     *       implementation unconditionally dereferenced begin() and end(),
     *       which is undefined behaviour (fatally so for empty containers).
     */
    template<class CON>
    DataSpan (CON const& container)
      : b_{std::size(container)? &*std::begin(container) : nullptr}
      , e_{b_ + std::size(container)}
      { }
    
    using iterator = const D*;
    using const_iterator = iterator;
    
    size_t size()  const { return e_ - b_; }
    bool   empty() const { return b_ == e_;}
    
    iterator begin() const { return b_; }
    iterator end()   const { return e_; }
    friend const_iterator begin (DataSpan const& span){ return span.begin();}
    friend const_iterator end   (DataSpan const& span){ return span.end(); }
    
    /** unchecked subscript access */
    D const& operator[](size_t i) const { return *(b_ + i); }
    
    /** checked element access
     * @throws error::Invalid for an index beyond the end */
    D const& at(size_t i) const
      {
        if (i >= size())
          throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
                                   % i % size()};
        return this->operator[](i);
      }
  };
/** deduction guide: derive the span's element type from the container.
 * @remark delegates to the project's `ValueTypeBinding` trait, which
 *         presumably yields the plain value type of the container's
 *         elements — confirm against lib/meta/value-type-binding.hpp */
template<class CON>
DataSpan (CON const& container) -> DataSpan<typename lib::meta::ValueTypeBinding<CON>::value_type>;
/** summation of variances, for error propagation: √Σe² */
template<typename... NUMS>
inline double
errorSum (NUMS ...vals)
{
  double sumOfSquares = ((vals*vals) + ... + 0.0);
  return std::sqrt (sumOfSquares);
}
/** arithmetic mean of the data sequence.
 * @return the average value, or 0.0 for an empty sequence */
template<typename D>
inline double
average (DataSpan<D> const& data)
{
  if (isnil(data)) return 0.0;
  double total{0.0};
  for (size_t i=0; i < data.size(); ++i)
    total += data[i];
  return total / data.size();
}
/** standard deviation of the data relative to a given mean.
 * @param mean the mean value to compute offsets against
 * @return √ of the (Bessel corrected, N-1) averaged squared offsets,
 *         or 0.0 for an empty sequence */
template<typename D>
inline double
sdev (DataSpan<D> const& data, D mean)
{
  if (isnil(data)) return 0.0;
  double sumSquares{0.0};
  for (auto& val : data)
    {
      D dist = val - mean;
      sumSquares += dist*dist;
    }
  size_t n = data.size();
  if (n > 1)
    sumSquares /= n-1;
  return sqrt (sumSquares);
}
/** standard deviation of a double sequence relative to the given mean */
inline double
sdev (VecD const& data, double mean)
{
  DataSpan<double> view{data};
  return sdev (view, mean);
}
/** view of the last `n` elements within the given vector.
 * @remark if the vector holds fewer than `n` values, the whole vector is viewed.
 * @note returns an empty span for empty data or n == 0; previously both
 *       cases indexed out of bounds (`data[0]` on an empty vector resp.
 *       `data[size]`), which is undefined behaviour.
 * @todo binding a reference to `*data.end()` to mark the end point is the
 *       file-wide DataSpan idiom, yet formally UB — resolved by std::span.
 */
inline DataSpan<double>
lastN (VecD const& data, size_t n)
{
  n = min (n, data.size());
  if (n == 0)
    return DataSpan<double>{};
  size_t oldest = data.size() - n;
  return DataSpan<double>{data[oldest], *data.end()};
}
/** average over the last `n` values of the given vector */
inline double
averageLastN (VecD const& data, size_t n)
{
  DataSpan<double> tail = lastN (data, n);
  return average (tail);
}
/** standard deviation over the last `n` values, relative to the given mean */
inline double
sdevLastN (VecD const& data, size_t n, double mean)
{
  DataSpan<double> tail = lastN (data, n);
  return sdev (tail, mean);
}
/** "building blocks" for mean, variance and covariance of time series data:
 *  Σy, Σy² and Σx·y, where x is the zero-based index into the series. */
template<typename D>
inline auto
computeStatSums (DataSpan<D> const& series)
{
  double ysum {0.0};
  double yysum{0.0};
  double xysum{0.0};
  size_t idx{0};
  for (auto& val : series)
    {
      ysum  += val;
      yysum += val*val;
      xysum += idx*val;
      ++idx;
    }
  return make_tuple (ysum,yysum, xysum);
}
/**
 * Single data point used for linear regression.
 * Simple case: single predictor variable (x).
 * @remark including a weight factor
 */
struct RegressionPoint
{
  double x;   ///< predictor (independent) variable
  double y;   ///< response (dependent) variable
  double w;   ///< weight of this point within the regression
};

/** a sequence of (weighted) 2D data points to fit a regression model with */
using RegressionData = std::vector<RegressionPoint>;
/** "building blocks" for weighted mean, weighted variance and covariance:
 *  the six sums Σw, Σwx, Σwy, Σwx², Σwy², Σwxy — in that array order. */
inline auto
computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
{
  std::array<double,6> sums{};
  for (auto& p : points)
    {
      double wx = p.w * p.x;          // note: (w·x)·x preserves the original
      double wy = p.w * p.y;          //       left-to-right FP evaluation order
      sums[0] += p.w;
      sums[1] += wx;
      sums[2] += wy;
      sums[3] += wx * p.x;
      sums[4] += wy * p.y;
      sums[5] += wx * p.y;
    }
  return sums;
}
/**
 * Compute simple linear regression with a single predictor variable (x).
 * @param points 2D data to fit the linear model with, including weights.
 * @return the computed linear model `b + a·x`, and the resulting fit
 * - socket (constant offset `b`)
 * - gradient (linear factor `a`)
 * - a vector with a predicted `y` value for each `x` value
 * - a vector with the error, i.e `Δ = y - y_predicted`
 * - correlation between x and y values
 * - maximum absolute delta
 * - delta standard deviation
 * @note NOTE(review): no guard against degenerate input — an empty span
 *       (wsum==0) or all-identical x values (varx==0) yields division by
 *       zero and thus NaN/Inf results; callers presumably ensure
 *       non-degenerate data — confirm at call sites.
 */
inline auto
computeLinearRegression (DataSpan<RegressionPoint> const& points)
{
  auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
  
  double xm = wxsum / wsum;                                 // weighted mean x = 1/Σw · Σwx
  double ym = wysum / wsum;                                 // weighted mean y
  double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum;       // Σw · x-Variance = Σw(x-xm)²  (expanded binomial)
  double vary = wyysum + ym*ym * wsum - 2*ym * wysum;       // Σw · y-Variance = Σw(y-ym)²
  double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
  // Linear Regression minimising σ²
  double gradient = cova / varx;                            // gradient = correlation · σy / σx ; σ = √Variance
  double socket = ym - gradient * xm;                       // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
  // Correlation (Pearson's r)
  // NOTE(review): the guard catches the all-zero-y case (wyysum==0);
  //               a constant but non-zero y still gives vary==0 → Inf — confirm intended.
  double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
  
  // calculate error Δ for all measurement points
  size_t n = points.size();
  VecD predicted; predicted.reserve(n);
  VecD deltas;    deltas.reserve(n);
  double maxDelta = 0.0;
  double variance = 0.0;
  for (auto& p : points)
    {
      double y_pred = socket + gradient * p.x;              // model prediction for this x
      double delta = p.y - y_pred;                          // residual of the fit
      predicted.push_back (y_pred);
      deltas.push_back (delta);
      maxDelta = max (maxDelta, fabs(delta));
      variance += p.w * delta*delta;                        // weighted sum of squared residuals
    }
  variance /= wsum * (n<=2? 1 : (n-2)/double(n));           // N-2 because it's an estimation,
                                                            // based on 2 other estimated values (socket,gradient)
  return make_tuple (socket,gradient
                    ,move(predicted)
                    ,move(deltas)
                    ,correlation
                    ,maxDelta
                    ,sqrt(variance)                         // standard deviation of the residuals
                    );
}
/** convenience shortcut: regression over data given as a plain vector */
inline auto
computeLinearRegression (RegressionData const& points)
{
  DataSpan<RegressionPoint> view{points};
  return computeLinearRegression (view);
}
/**
 * Compute linear regression over a time series with zero-based indices.
 * @remark using the indices as x-values, the calculations for a regression line
 *         can be simplified, using the known closed formula for a sum of integers,
 *         shifting the indices to 0 … n-1 (leaving out the 0 and 0² term)
 *         - `1++n = n·(n+1)/2`
 *         - `1++n² = n·(n+1)·(2n+1)/6`
 * @return `(socket,gradient,correlation)` describing the regression line
 *         y = socket + gradient · i, plus Pearson's r;
 *         all zero when the series holds fewer than 2 points.
 */
template<typename D>
inline auto
computeTimeSeriesLinearRegression (DataSpan<D> const& series)
{
  if (series.size() < 2) return make_tuple(0.0,0.0,0.0);     // regression undefined below 2 points
  
  auto [ysum,yysum, xysum] = computeStatSums(series);
  
  size_t n = series.size();
  double im = (n-1)/2.0;               // mean of zero-based indices i ∈ {0 … n-1}
  double ym = ysum / n;                // mean y
  double varx = (n-1)*(n+1)/12.0;      // variance of zero-based indices Σ(i-im)² / n  ≡ (n²-1)/12
  double vary = yysum/n - ym*ym;       // variance of data values Σ(y-ym)² / n
  double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy
                                       // (ysum*(n-1) is evaluated in double, so /2 is a floating-point division)
  // Linear Regression minimising σ²
  double gradient = cova / (n*varx);   // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
  double socket = ym - gradient * im;  // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
  // Correlation (Pearson's r)
  // NOTE(review): guard covers the all-zero series; a constant non-zero
  //               series still yields vary==0 → division by zero — confirm intended.
  double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
  return make_tuple (socket,gradient,correlation);
}
/** convenience shortcut: time-series regression over a plain double vector */
inline auto
computeTimeSeriesLinearRegression (VecD const& series)
{
  DataSpan<double> view{series};
  return computeTimeSeriesLinearRegression (view);
}
}} // namespace lib::stat
#endif /*LIB_STAT_STATISTIC_H*/