LUMIERA.clone/src/lib/stat/statistic.hpp

377 lines
11 KiB
C++
Raw Normal View History

/*
STATISTIC.hpp - helpers for generic statistics calculations
Copyright (C) Lumiera.org
2022, Hermann Vosseler <Ichthyostega@web.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/** @file statistic.hpp
** Support for generic statistics calculations.
** - average over the N last elements in a data sequence
** - simple linear regression with weights (single predictor variable)
** - also over a time series with zero-based indices
**
*/
#ifndef LIB_STAT_STATISTIC_H
#define LIB_STAT_STATISTIC_H
#include "lib/error.hpp"
#include "lib/nocopy.hpp"
#include "lib/iter-adapter.hpp"
#include "lib/format-string.hpp"
#include "lib/util.hpp"
#include <utility>
#include <vector>
#include <array>
#include <tuple>
#include <cmath>
namespace lib {
namespace stat{
namespace error = lumiera::error;
using std::fabs;
using std::array;
using std::tuple;
using std::make_tuple;
using std::forward;
using std::move;
using util::min;
using util::max;
using util::isnil;
using util::_Fmt;
using VecD = std::vector<double>;
/** helper to unpack a std::tuple into a homogeneous std::array
 * @tparam TUP a std::tuple where all elements share one common type
 * @return std::array holding the tuple's elements in order
 */
template<typename TUP>
constexpr auto
array_from_tuple (TUP&& tuple)
{
  constexpr auto buildArray = [](auto&& ...elm)
                                {
                                  return std::array{std::forward<decltype(elm)> (elm)...};
                                };
  return std::apply (buildArray, std::forward<TUP> (tuple));
}
/** round a double to a fixed number of decimal places.
 * @tparam places number of decimal digits to retain
 * @remark the scale factor 10^places is computed at compile time by a
 *         constexpr loop; the previous use of `pow` in a constexpr
 *         initialiser relied on a GCC builtin extension, since
 *         std::pow is not constexpr in standard C++(17).
 */
template<size_t places>
inline double
round (double val)
{
  constexpr double shift = []{
                              double s{1.0};
                              for (size_t i=0; i<places; ++i)
                                s *= 10.0;
                              return s;
                            }();
  return std::round (val*shift) / shift;
}
/**
 * Read-only view into a segment within a sequence of data
 * @tparam D value type of the data series
 * @remark simplistic workaround since we don't support C++20 yet
 * @todo replace by const std::span
 */
template<typename D>
class DataSpan
  : util::Cloneable
  {
    const D* const b_{nullptr};
    const D* const e_{nullptr};
    
  public:
    DataSpan() = default;
    
    /** span delimited by references to first element and one-past-end element */
    DataSpan (D const& begin, D const& end)
      : b_{&begin}
      , e_{&end}
      {
        if (e_ < b_)
          throw error::Invalid{"End point before begin."};
      }
    
    /** view the full contents of a container with contiguous storage.
     * @note an empty container yields an empty span; the previous
     *       implementation unconditionally dereferenced begin() and end(),
     *       which is undefined behaviour (fatally so for empty containers).
     */
    template<class CON>
    DataSpan (CON const& container)
      : b_{std::size(container)? &*std::begin(container) : nullptr}
      , e_{b_ + std::size(container)}
      { }
    
    using iterator = const D*;
    using const_iterator = iterator;
    
    size_t size()  const { return e_ - b_; }
    bool   empty() const { return b_ == e_;}
    
    iterator begin() const { return b_; }
    iterator end()   const { return e_; }
    friend const_iterator begin (DataSpan const& span){ return span.begin();}
    friend const_iterator end   (DataSpan const& span){ return span.end(); }
    
    /** unchecked subscript access */
    D const& operator[](size_t i) const { return *(b_ + i); }
    
    /** checked element access
     * @throws error::Invalid for an index beyond the end */
    D const& at(size_t i) const
      {
        if (i >= size())
          throw error::Invalid{_Fmt{"Index %d beyond size=%d"}
                                   % i % size()};
        return this->operator[](i);
      }
  };
/** deduction guide: derive the span's element type from the container.
 * @remark delegates to the project's `ValueTypeBinding` trait, which
 *         presumably yields the plain value type of the container's
 *         elements — confirm against lib/meta/value-type-binding.hpp */
template<class CON>
DataSpan (CON const& container) -> DataSpan<typename lib::meta::ValueTypeBinding<CON>::value_type>;
/** summation of variances, for error propagation: √Σe² */
template<typename... NUMS>
inline double
errorSum (NUMS ...vals)
{
  double sumOfSquares = ((vals*vals) + ... + 0.0);
  return std::sqrt (sumOfSquares);
}
/** arithmetic mean of the data sequence.
 * @return the average value, or 0.0 for an empty sequence */
template<typename D>
inline double
average (DataSpan<D> const& data)
{
  if (isnil(data)) return 0.0;
  double total{0.0};
  for (size_t i=0; i < data.size(); ++i)
    total += data[i];
  return total / data.size();
}
/** standard deviation of the data relative to a given mean.
 * @param mean the mean value to compute offsets against
 * @return √ of the (Bessel corrected, N-1) averaged squared offsets,
 *         or 0.0 for an empty sequence */
template<typename D>
inline double
sdev (DataSpan<D> const& data, D mean)
{
  if (isnil(data)) return 0.0;
  double sumSquares{0.0};
  for (auto& val : data)
    {
      D dist = val - mean;
      sumSquares += dist*dist;
    }
  size_t n = data.size();
  if (n > 1)
    sumSquares /= n-1;
  return sqrt (sumSquares);
}
/** standard deviation of a double sequence relative to the given mean */
inline double
sdev (VecD const& data, double mean)
{
  DataSpan<double> view{data};
  return sdev (view, mean);
}
/** view of the last `n` elements within the given vector.
 * @remark if the vector holds fewer than `n` values, the whole vector is viewed.
 * @note returns an empty span for empty data or n == 0; previously both
 *       cases indexed out of bounds (`data[0]` on an empty vector resp.
 *       `data[size]`), which is undefined behaviour.
 * @todo binding a reference to `*data.end()` to mark the end point is the
 *       file-wide DataSpan idiom, yet formally UB — resolved by std::span.
 */
inline DataSpan<double>
lastN (VecD const& data, size_t n)
{
  n = min (n, data.size());
  if (n == 0)
    return DataSpan<double>{};
  size_t oldest = data.size() - n;
  return DataSpan<double>{data[oldest], *data.end()};
}
/** average over the last `n` values of the given vector */
inline double
averageLastN (VecD const& data, size_t n)
{
  DataSpan<double> tail = lastN (data, n);
  return average (tail);
}
/** standard deviation over the last `n` values, relative to the given mean */
inline double
sdevLastN (VecD const& data, size_t n, double mean)
{
  DataSpan<double> tail = lastN (data, n);
  return sdev (tail, mean);
}
/** "building blocks" for mean, variance and covariance of time series data:
 *  Σy, Σy² and Σx·y, where x is the zero-based index into the series. */
template<typename D>
inline auto
computeStatSums (DataSpan<D> const& series)
{
  double ysum {0.0};
  double yysum{0.0};
  double xysum{0.0};
  size_t idx{0};
  for (auto& val : series)
    {
      ysum  += val;
      yysum += val*val;
      xysum += idx*val;
      ++idx;
    }
  return make_tuple (ysum,yysum, xysum);
}
/**
 * Single data point used for linear regression.
 * Simple case: single predictor variable (x).
 * @remark including a weight factor
 */
struct RegressionPoint
{
  double x;   ///< predictor (independent) variable
  double y;   ///< response (dependent) variable
  double w;   ///< weight of this point within the regression
};

/** a sequence of (weighted) 2D data points to fit a regression model with */
using RegressionData = std::vector<RegressionPoint>;
/** "building blocks" for weighted mean, weighted variance and covariance:
 *  the six sums Σw, Σwx, Σwy, Σwx², Σwy², Σwxy — in that array order. */
inline auto
computeWeightedStatSums (DataSpan<RegressionPoint> const& points)
{
  std::array<double,6> sums{};
  for (auto& p : points)
    {
      double wx = p.w * p.x;          // note: (w·x)·x preserves the original
      double wy = p.w * p.y;          //       left-to-right FP evaluation order
      sums[0] += p.w;
      sums[1] += wx;
      sums[2] += wy;
      sums[3] += wx * p.x;
      sums[4] += wy * p.y;
      sums[5] += wx * p.y;
    }
  return sums;
}
/**
 * Compute simple linear regression with a single predictor variable (x).
 * @param points 2D data to fit the linear model with, including weights.
 * @return the computed linear model `b + a·x`, and the resulting fit
 * - socket (constant offset `b`)
 * - gradient (linear factor `a`)
 * - a vector with a predicted `y` value for each `x` value
 * - a vector with the error, i.e `Δ = y - y_predicted`
 * - correlation between x and y values
 * - maximum absolute delta
 * - delta standard deviation
 * @note NOTE(review): no guard against degenerate input — an empty span
 *       (wsum==0) or all-identical x values (varx==0) yields division by
 *       zero and thus NaN/Inf results; callers presumably ensure
 *       non-degenerate data — confirm at call sites.
 */
inline auto
computeLinearRegression (DataSpan<RegressionPoint> const& points)
{
  auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
  
  double xm = wxsum / wsum;                                 // weighted mean x = 1/Σw · Σwx
  double ym = wysum / wsum;                                 // weighted mean y
  double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum;       // Σw · x-Variance = Σw(x-xm)²  (expanded binomial)
  double vary = wyysum + ym*ym * wsum - 2*ym * wysum;       // Σw · y-Variance = Σw(y-ym)²
  double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
  // Linear Regression minimising σ²
  double gradient = cova / varx;                            // gradient = correlation · σy / σx ; σ = √Variance
  double socket = ym - gradient * xm;                       // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
  // Correlation (Pearson's r)
  // NOTE(review): the guard catches the all-zero-y case (wyysum==0);
  //               a constant but non-zero y still gives vary==0 → Inf — confirm intended.
  double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
  
  // calculate error Δ for all measurement points
  size_t n = points.size();
  VecD predicted; predicted.reserve(n);
  VecD deltas;    deltas.reserve(n);
  double maxDelta = 0.0;
  double variance = 0.0;
  for (auto& p : points)
    {
      double y_pred = socket + gradient * p.x;              // model prediction for this x
      double delta = p.y - y_pred;                          // residual of the fit
      predicted.push_back (y_pred);
      deltas.push_back (delta);
      maxDelta = max (maxDelta, fabs(delta));
      variance += p.w * delta*delta;                        // weighted sum of squared residuals
    }
  variance /= wsum * (n<=2? 1 : (n-2)/double(n));           // N-2 because it's an estimation,
                                                            // based on 2 other estimated values (socket,gradient)
  return make_tuple (socket,gradient
                    ,move(predicted)
                    ,move(deltas)
                    ,correlation
                    ,maxDelta
                    ,sqrt(variance)                         // standard deviation of the residuals
                    );
}
/** convenience shortcut: regression over data given as a plain vector */
inline auto
computeLinearRegression (RegressionData const& points)
{
  DataSpan<RegressionPoint> view{points};
  return computeLinearRegression (view);
}
/**
 * Compute linear regression over a time series with zero-based indices.
 * @remark using the indices as x-values, the calculations for a regression line
 *         can be simplified, using the known closed formula for a sum of integers,
 *         shifting the indices to 0 … n-1 (leaving out the 0 and 0² term)
 *         - `1++n = n·(n+1)/2`
 *         - `1++n² = n·(n+1)·(2n+1)/6`
 * @return `(socket,gradient,correlation)` describing the regression line
 *         y = socket + gradient · i, plus Pearson's r;
 *         all zero when the series holds fewer than 2 points.
 */
template<typename D>
inline auto
computeTimeSeriesLinearRegression (DataSpan<D> const& series)
{
  if (series.size() < 2) return make_tuple(0.0,0.0,0.0);     // regression undefined below 2 points
  
  auto [ysum,yysum, xysum] = computeStatSums(series);
  
  size_t n = series.size();
  double im = (n-1)/2.0;               // mean of zero-based indices i ∈ {0 … n-1}
  double ym = ysum / n;                // mean y
  double varx = (n-1)*(n+1)/12.0;      // variance of zero-based indices Σ(i-im)² / n  ≡ (n²-1)/12
  double vary = yysum/n - ym*ym;       // variance of data values Σ(y-ym)² / n
  double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy
                                       // (ysum*(n-1) is evaluated in double, so /2 is a floating-point division)
  // Linear Regression minimising σ²
  double gradient = cova / (n*varx);   // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
  double socket = ym - gradient * im;  // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
  // Correlation (Pearson's r)
  // NOTE(review): guard covers the all-zero series; a constant non-zero
  //               series still yields vary==0 → division by zero — confirm intended.
  double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
  return make_tuple (socket,gradient,correlation);
}
/** convenience shortcut: time-series regression over a plain double vector */
inline auto
computeTimeSeriesLinearRegression (VecD const& series)
{
  DataSpan<double> view{series};
  return computeTimeSeriesLinearRegression (view);
}
}} // namespace lib::stat
#endif /*LIB_STAT_STATISTIC_H*/