- remove obsolete configuration settings - walk through all settings according to the documentation https://www.doxygen.nl/manual/config.html - now try to use the new feature to rely on Clang for C++ parsing - walk through the doxygen-warnings.txt and fix some obvious misspellings and structural problems in the documentation comments. With Debian-Trixie, we are now using Doxygen 1.9.8 — which produces massively better results in various fine points. However, there are still problems with automatic cross links, especially from implementation to the corresponding test classes.
373 lines
11 KiB
C++
373 lines
11 KiB
C++
/*
|
||
STATISTIC.hpp - helpers for generic statistics calculations
|
||
|
||
Copyright (C)
|
||
2022, Hermann Vosseler <Ichthyostega@web.de>
|
||
|
||
**Lumiera** is free software; you can redistribute it and/or modify it
|
||
under the terms of the GNU General Public License as published by the
|
||
Free Software Foundation; either version 2 of the License, or (at your
|
||
option) any later version. See the file COPYING for further details.
|
||
|
||
*/
|
||
|
||
|
||
/** @file statistic.hpp
|
||
** Support for generic statistics calculations.
|
||
** - average over the N last elements in a data sequence
|
||
** - simple linear regression with weights (single predictor variable)
|
||
** - also over a time series with zero-based indices
|
||
**
|
||
*/
|
||
|
||
|
||
|
||
#ifndef LIB_STAT_STATISTIC_H
|
||
#define LIB_STAT_STATISTIC_H
|
||
|
||
|
||
#include "lib/error.hpp"
|
||
#include "lib/nocopy.hpp"
|
||
#include "lib/iter-adapter.hpp"
|
||
#include "lib/format-string.hpp"
|
||
#include "lib/util.hpp"
|
||
|
||
#include <utility>
|
||
#include <vector>
|
||
#include <array>
|
||
#include <tuple>
|
||
#include <cmath>
|
||
|
||
namespace lib {
|
||
namespace stat{
|
||
|
||
namespace error = lumiera::error;
|
||
|
||
using std::fabs;
|
||
using std::array;
|
||
using std::tuple;
|
||
using std::make_tuple;
|
||
using std::forward;
|
||
using std::move;
|
||
using util::min;
|
||
using util::max;
|
||
using util::isnil;
|
||
using util::_Fmt;
|
||
|
||
using VecD = std::vector<double>;
|
||
|
||
|
||
|
||
/** helper to unpack a std::tuple into a homogeneous std::array
 * @remark relies on class template argument deduction for std::array,
 *         so all tuple elements must share one common value type
 */
template<typename TUP>
constexpr auto
array_from_tuple (TUP&& tup)
{
  auto buildArray = [](auto&& ...elms)
                      {
                        return std::array{std::forward<decltype(elms)> (elms)...};
                      };
  return std::apply (buildArray, std::forward<TUP> (tup));
}
|
||
|
||
/** round a double to a fixed number of decimal places
 * @tparam places number of decimal digits to retain
 * @remark the scale factor 10^places is computed at compile time;
 *         the previous `constexpr double shift{pow(10.0, places)}` relied on
 *         the GCC extension of treating std::pow as constexpr and is not
 *         portable C++17 — replaced by a constexpr multiplication loop.
 */
template<size_t places>
inline double
round (double val)
{
  constexpr double shift = []{
                              double s{1.0};
                              for (size_t i=0; i<places; ++i)
                                s *= 10.0;
                              return s;
                            }();
  return std::round (val*shift) / shift;
}
|
||
|
||
|
||
|
||
|
||
/**
|
||
* Read-only view into a segment within a sequence of data
|
||
* @tparam D value type of the data series
|
||
* @remark simplistic workaround since we don't support C++20 yet
|
||
* @todo replace by const std::span
|
||
*/
|
||
template<typename D>
|
||
class DataSpan
|
||
: util::Cloneable
|
||
{
|
||
const D* const b_{nullptr};
|
||
const D* const e_{nullptr};
|
||
|
||
public:
|
||
DataSpan() = default;
|
||
DataSpan (D const& begin, D const& end)
|
||
: b_{&begin}
|
||
, e_{&end}
|
||
{
|
||
if (e_ < b_)
|
||
throw error::Invalid{"End point before begin."};
|
||
}
|
||
|
||
template<class CON>
|
||
DataSpan (CON const& container)
|
||
: DataSpan{*std::begin(container), *std::end(container)}
|
||
{ }
|
||
|
||
|
||
using iterator = const D*;
|
||
using const_iterator = iterator;
|
||
|
||
size_t size() const { return e_ - b_; }
|
||
bool empty() const { return b_ == e_;}
|
||
|
||
iterator begin() const { return b_; }
|
||
iterator end() const { return e_; }
|
||
friend const_iterator begin (DataSpan const& span){ return span.begin();}
|
||
friend const_iterator end (DataSpan const& span){ return span.end(); }
|
||
|
||
D const& operator[](size_t i) const { return *(b_ + i); }
|
||
D const& at(size_t i) const
|
||
{
|
||
if (i >= size())
|
||
throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
|
||
% i % size()};
|
||
return this->operator[](i);
|
||
}
|
||
};
|
||
|
||
/** deduction guide: derive content from container. */
|
||
template<class CON>
|
||
DataSpan (CON const& container) -> DataSpan<typename lib::meta::ValueTypeBinding<CON>::value_type>;
|
||
|
||
|
||
|
||
|
||
|
||
/** summation of variances, for error propagation: √Σe² */
template<typename... NUMS>
inline double
errorSum (NUMS ...vals)
{
  double sumOfSquares = ((vals*vals) + ... + 0.0);
  return std::sqrt (sumOfSquares);
}
|
||
|
||
|
||
|
||
template<typename D>
|
||
inline double
|
||
average (DataSpan<D> const& data)
|
||
{
|
||
if (isnil(data)) return 0.0;
|
||
double sum = 0.0;
|
||
for (auto val : data)
|
||
sum += val;
|
||
return sum / data.size();
|
||
}
|
||
|
||
template<typename D>
|
||
inline double
|
||
sdev (DataSpan<D> const& data, D mean)
|
||
{
|
||
if (isnil(data)) return 0.0;
|
||
double sdev = 0.0;
|
||
for (auto val : data)
|
||
{
|
||
D offset = val - mean;
|
||
sdev += offset*offset;
|
||
}
|
||
size_t n = data.size();
|
||
sdev /= n<2? 1: n-1;
|
||
return sqrt (sdev);
|
||
}
|
||
|
||
inline double
|
||
sdev (VecD const& data, double mean)
|
||
{
|
||
return sdev(DataSpan<double>{data}, mean);
|
||
}
|
||
|
||
|
||
|
||
/** view covering the last n elements of the given data sequence
 * @remark n is capped to the number of elements actually available;
 *         a request on empty data (or n==0) yields an empty default span,
 *         where previously `data[oldest]` indexed one-past-the-end of an
 *         empty vector — undefined behaviour.
 * @note the end reference `*data.end()` relies on forming (not reading) an
 *       lvalue at the past-the-end position of a contiguous vector; this is
 *       tolerated by mainstream compilers — a pointer-based DataSpan ctor
 *       would be the clean solution (see the std::span @todo there).
 */
inline DataSpan<double>
lastN (VecD const& data, size_t n)
{
  n = min (n, data.size());
  if (n == 0)
    return DataSpan<double>{};
  size_t oldest = data.size() - n;
  return DataSpan<double>{data[oldest], *data.end()};
}
|
||
|
||
inline double
|
||
averageLastN (VecD const& data, size_t n)
|
||
{
|
||
return average (lastN (data,n));
|
||
}
|
||
|
||
inline double
|
||
sdevLastN (VecD const& data, size_t n, double mean)
|
||
{
|
||
return sdev (lastN (data,n), mean);
|
||
}
|
||
|
||
|
||
/** "building blocks" for mean, variance and covariance of time series data */
|
||
template<typename D>
|
||
inline auto
|
||
computeStatSums (DataSpan<D> const& series)
|
||
{
|
||
double ysum = 0.0;
|
||
double yysum = 0.0;
|
||
double xysum = 0.0;
|
||
size_t x = 0;
|
||
for (auto& y : series)
|
||
{
|
||
ysum += y;
|
||
yysum += y*y;
|
||
xysum += x*y;
|
||
++x;
|
||
}
|
||
return make_tuple (ysum,yysum, xysum);
|
||
}
|
||
|
||
|
||
/**
 * Single data point used for linear regression.
 * Simple case: single predictor variable (x).
 * @remark including a weight factor
 */
struct RegressionPoint
  {
    double x;   // predictor (independent) variable
    double y;   // observed response value
    double w;   // weight of this measurement point
    
    RegressionPoint (double vx, double vy, double vw =1.0)
      : x{vx}, y{vy}, w{vw}
      { }
  };
|
||
|
||
using RegressionData = std::vector<RegressionPoint>;
|
||
|
||
|
||
/** "building blocks" for weighted mean, weighted variance and covariance */
|
||
inline auto
|
||
computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
|
||
{
|
||
std::array<double,6> sums;
|
||
sums.fill(0.0);
|
||
auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
|
||
for (auto& p : points)
|
||
{
|
||
wsum += p.w;
|
||
wxsum += p.w * p.x;
|
||
wysum += p.w * p.y;
|
||
wxxsum += p.w * p.x*p.x;
|
||
wyysum += p.w * p.y*p.y;
|
||
wxysum += p.w * p.x*p.y;
|
||
}
|
||
return sums;
|
||
}
|
||
|
||
/**
|
||
* Compute simple linear regression with a single predictor variable (x).
|
||
* @param points 2D data to fit the linear model with, including weights.
|
||
* @return the computed linear model `b + a·x`, and the resulting fit
|
||
* - socket (constant offset `b`)
|
||
* - gradient (linear factor `a`)
|
||
* - a vector with a predicted `y` value for each `x` value
|
||
* - a vector with the error, i.e `Δ = y - y_predicted`
|
||
* - correlation between x and y values
|
||
* - maximum absolute delta
|
||
* - delta standard deviation
|
||
*/
|
||
inline auto
|
||
computeLinearRegression (DataSpan<RegressionPoint> const& points)
|
||
{
|
||
auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
|
||
|
||
double xm = wxsum / wsum; // weighted mean x = 1/Σw · Σwx
|
||
double ym = wysum / wsum;
|
||
double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum; // Σw · x-Variance = Σw(x-xm)²
|
||
double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
|
||
double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
|
||
|
||
// Linear Regression minimising σ²
|
||
double gradient = cova / varx; // gradient = correlation · σy / σx ; σ = √Variance
|
||
double socket = ym - gradient * xm; // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
|
||
|
||
// Correlation (Pearson's r)
|
||
double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
|
||
|
||
// calculate error Δ for all measurement points
|
||
size_t n = points.size();
|
||
VecD predicted; predicted.reserve(n);
|
||
VecD deltas; deltas.reserve(n);
|
||
double maxDelta = 0.0;
|
||
double variance = 0.0;
|
||
for (auto& p : points)
|
||
{
|
||
double y_pred = socket + gradient * p.x;
|
||
double delta = p.y - y_pred;
|
||
predicted.push_back (y_pred);
|
||
deltas.push_back (delta);
|
||
maxDelta = max (maxDelta, fabs(delta));
|
||
variance += p.w * delta*delta;
|
||
}
|
||
variance /= wsum * (n<=2? 1 : (n-2)/double(n)); // N-2 because it's an estimation,
|
||
// based on 2 other estimated values (socket,gradient)
|
||
return make_tuple (socket,gradient
|
||
,move(predicted)
|
||
,move(deltas)
|
||
,correlation
|
||
,maxDelta
|
||
,sqrt(variance)
|
||
);
|
||
}
|
||
|
||
inline auto
|
||
computeLinearRegression (RegressionData const& points)
|
||
{
|
||
return computeLinearRegression (DataSpan<RegressionPoint>{points});
|
||
}
|
||
|
||
|
||
|
||
/**
|
||
* Compute linear regression over a time series with zero-based indices.
|
||
* @remark using the indices as x-values, the calculations for a regression line
|
||
* can be simplified, using the known closed formula for a sum of integers,
|
||
* shifting the indices to 0…n-1 (leaving out the 0 and 0² term)
|
||
* - `1+…+n = n·(n+1)/2`
|
||
* - `1+…+n² = n·(n+1)·(2n+1)/6`
|
||
* @return `(socket,gradient)` to describe the regression line y = socket + gradient · i
|
||
*/
|
||
template<typename D>
|
||
inline auto
|
||
computeTimeSeriesLinearRegression (DataSpan<D> const& series)
|
||
{
|
||
if (series.size() < 2) return make_tuple(0.0,0.0,0.0);
|
||
|
||
auto [ysum,yysum, xysum] = computeStatSums(series);
|
||
|
||
size_t n = series.size();
|
||
double im = (n-1)/2.0; // mean of zero-based indices i ∈ {0 … n-1}
|
||
double ym = ysum / n; // mean y
|
||
double varx = (n-1)*(n+1)/12.0; // variance of zero-based indices Σ(i-im)² / n
|
||
double vary = yysum/n - ym*ym; // variance of data values Σ(y-ym)² / n
|
||
double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy
|
||
|
||
// Linear Regression minimising σ²
|
||
double gradient = cova / (n*varx); // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
|
||
double socket = ym - gradient * im; // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
|
||
|
||
// Correlation (Pearson's r)
|
||
double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
|
||
return make_tuple (socket,gradient,correlation);
|
||
}
|
||
|
||
inline auto
|
||
computeTimeSeriesLinearRegression (VecD const& series)
|
||
{
|
||
return computeTimeSeriesLinearRegression (DataSpan<double>{series});
|
||
}
|
||
|
||
}} // namespace lib::stat
|
||
#endif /*LIB_STAT_STATISTIC_H*/
|