/*
* statistic - helpers for generic statistics calculations
*
* Copyright 2021, Hermann Vosseler <Ichthyostega@web.de>
*
* This file is part of the Yoshimi-Testsuite, which is free software:
* you can redistribute and/or modify it under the terms of the GNU
* General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* Yoshimi-Testsuite is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with yoshimi. If not, see <http://www.gnu.org/licenses/>.
***************************************************************/
/** @file statistic.hpp
** Support for generic statistics calculations.
** - average over the N last elements in a data sequence
** - simple linear regression with weights (single predictor variable)
** - also over a time series with zero-based indices
**
*/
#ifndef TESTRUNNER_UTIL_STATISTIC_HPP_
#define TESTRUNNER_UTIL_STATISTIC_HPP_
#include "util/error.hpp"
#include "util/nocopy.hpp"
#include "util/format.hpp"
#include "util/utils.hpp"
#include <algorithm>
#include <utility>
#include <vector>
#include <array>
#include <tuple>
#include <cmath>
namespace util {
using std::fabs;
using std::sqrt;
using std::array;
using std::tuple;
using std::make_tuple;
using VecD = std::vector<double>;
/** helper to unpack a std::tuple into a homogeneous std::array */
template<typename TUP>
constexpr auto array_from_tuple(TUP&& tuple)
{
constexpr auto makeArray = [](auto&& ... x){ return std::array{std::forward<decltype(x)>(x) ... }; };
return std::apply(makeArray, std::forward<TUP>(tuple));
}
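// Illustrative usage sketch (added, not part of the original file): turning a
// tuple of uniformly typed values into a std::array, e.g. to iterate over results.
//
//     constexpr auto arr = array_from_tuple(std::make_tuple(1.0, 2.0, 3.0));
//     static_assert(arr.size() == 3);        // deduced as std::array<double,3>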
/**
* Read-only view into a segment within a sequence of data
* @tparam D value type of the data series
* @remark simplistic workaround since we don't support C++20 yet
* @todo replace by const std::span
*/
template<typename D>
class DataSpan
    : util::Cloneable
{
    const D* const b_{nullptr};
    const D* const e_{nullptr};
public:
    DataSpan() = default;
    DataSpan(const D* begin, const D* end)
        : b_{begin}
        , e_{end}
    { if (e_ < b_) throw error::Invalid("End point before begin."); }

    template<class CON>
    DataSpan(CON const& container)
        : DataSpan{std::data(container), std::data(container) + std::size(container)}
    { }

    using iterator = const D*;
    size_t size()  const { return e_ - b_; }
    bool   empty() const { return b_ == e_; }
    iterator begin() const { return b_; }
    iterator end()   const { return e_; }

    D const& operator[](size_t i) const { return b_[i]; }   // unchecked access
    D const& at(size_t i) const
    {
        if (i >= size()) throw error::Invalid("Index "+str(i)+" beyond size="+str(size()));
        return (*this)[i];
    }
};
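// Illustrative usage sketch (added, not part of the original file): a DataSpan
// is a non-owning view and must not outlive the underlying container.
//
//     std::vector<double> vals{1.0, 2.0, 3.0, 4.0};
//     DataSpan<double> view{vals};           // view the complete vector
//     double sum = 0.0;
//     for (double v : view) sum += v;        // iterate like any range
//     double third = view.at(2);             // bounds-checked element access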
/** summation of variances, for error propagation: √Σe² */
template<typename... NUMS>
inline double errorSum(NUMS ...vals)
{
auto sqr = [](auto val){ return val*val; };
return sqrt((sqr(vals)+ ... + 0.0));
}
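// Worked example (added for illustration): two independent error contributions
// combine as errorSum(3.0, 4.0) == sqrt(3² + 4²) == 5.0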
template<typename D>
inline double average(DataSpan<D> const& data)
{
if (isnil(data)) return 0.0;
double sum = 0.0;
for (auto val : data)
sum += val;
return sum / data.size();
}
template<typename D>
inline double sdev(DataSpan<D> const& data, D mean)
{
if (isnil(data)) return 0.0;
double sdev = 0.0;
for (auto val : data)
{
D offset = val - mean;
sdev += offset*offset;
}
size_t n = data.size();
sdev /= n<2? 1: n-1;
return sqrt(sdev);
}
inline double sdev(VecD const& data, double mean)
{ return sdev(DataSpan<double>{data}, mean); }
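// Illustrative usage sketch (added, not part of the original file): mean and
// sample standard deviation of a complete data series.
//
//     VecD data{2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0};
//     double mean = average(DataSpan<double>{data});    // == 5.0
//     double dev  = sdev(data, mean);                   // ≈ 2.138 (denominator n-1)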
inline DataSpan<double> lastN(VecD const& data, size_t n)
{
    n = std::min(n, data.size());
    size_t oldest = data.size() - n;
    return DataSpan<double>{data.data() + oldest, data.data() + data.size()};
}
inline double averageLastN(VecD const& data, size_t n)
{
return average(lastN(data,n));
}
inline double sdevLastN(VecD const& data, size_t n, double mean)
{
return sdev(lastN(data,n), mean);
}
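// Illustrative usage sketch (added, not part of the original file): moving-window
// statistics at the tail of a growing series; the data values are made up.
//
//     VecD runtimes{5.1, 5.0, 5.2, 4.9, 5.3, 5.1};  // e.g. timing measurements
//     double avg = averageLastN(runtimes, 5);       // mean of the last 5 entries
//     double dev = sdevLastN(runtimes, 5, avg);     // spread within that window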
/** "building blocks" for mean, variance and covariance of time series data */
template<typename D>
inline auto computeStatSums(DataSpan<D> const& series)
{
double ysum = 0.0;
double yysum = 0.0;
double xysum = 0.0;
size_t x = 0;
for (auto& y : series)
{
ysum += y;
yysum += y*y;
xysum += x*y;
++x;
}
return make_tuple(ysum,yysum, xysum);
}
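// Note (added): from these raw sums the usual moments follow directly; for n values,
// mean ym = ysum/n and variance = yysum/n - ym², which is exactly how
// computeTimeSeriesLinearRegression (below) consumes them.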
/**
* Single data point used for linear regression.
* Simple case: single predictor variable (x).
* @remark including a weight factor
*/
struct RegressionPoint
{
double x;
double y;
double w;
};
using RegressionData = std::vector<RegressionPoint>;
/** "building blocks" for weighted mean, weighted variance and covariance */
inline auto computeWeightedStatSums(DataSpan<RegressionPoint> const& points)
{
std::array<double,6> sums;
sums.fill(0.0);
auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums;
for (auto& p : points)
{
wsum += p.w;
wxsum += p.w * p.x;
wysum += p.w * p.y;
wxxsum += p.w * p.x*p.x;
wyysum += p.w * p.y*p.y;
wxysum += p.w * p.x*p.y;
}
return sums;
}
/**
* Compute simple linear regression with a single predictor variable (x).
* @param points 2D data to fit the linear model with, including weights.
* @return the computed linear model `b + a·x`, and the resulting fit
* - socket (constant offset `b`)
* - gradient (linear factor `a`)
* - a vector with a predicted `y` value for each `x` value
* - a vector with the error, i.e. `Δ = y - y_predicted`
* - correlation between x and y values
* - maximum absolute delta
* - delta standard deviation
*/
inline auto computeLinearRegression(DataSpan<RegressionPoint> const& points)
{
auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points);
double xm = wxsum / wsum; // weighted mean x = 1/Σw · Σwx
double ym = wysum / wsum;
double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum; // Σw · x-Variance = Σw(x-xm)²
double vary = wyysum + ym*ym * wsum - 2*ym * wysum;
double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym)
// Linear Regression minimising σ²
double gradient = cova / varx; // gradient = correlation · σy / σx ; σ = √Variance
double socket = ym - gradient * xm; // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket
// Correlation (Pearson's r)
double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary);
// calculate error Δ for all measurement points
size_t n = points.size();
VecD predicted; predicted.reserve(n);
VecD deltas; deltas.reserve(n);
double maxDelta = 0.0;
double variance = 0.0;
for (auto& p : points)
{
double y_pred = socket + gradient * p.x;
double delta = p.y - y_pred;
predicted.push_back(y_pred);
deltas.push_back(delta);
maxDelta = std::max(maxDelta, fabs(delta));
variance += p.w * delta*delta;
}
variance /= wsum * (n<=2? 1 : (n-2)/double(n)); // N-2 because it's an estimation,
// based on 2 other estimated values (socket,gradient)
return make_tuple(socket,gradient
                 ,std::move(predicted)
                 ,std::move(deltas)
                 ,correlation
                 ,maxDelta
                 ,sqrt(variance)
                 );
}
inline auto computeLinearRegression(RegressionData const& points)
{ return computeLinearRegression(DataSpan<RegressionPoint>{points}); }
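// Illustrative usage sketch (added, not part of the original file): fitting a
// line through weighted 2D measurement points, unpacking the result tuple.
//
//     RegressionData points{{1.0, 2.1, 1.0}
//                          ,{2.0, 3.9, 1.0}
//                          ,{3.0, 6.1, 2.0}};       // {x, y, weight}
//     auto [socket, gradient, predicted, deltas, correlation, maxDelta, sdevDelta]
//             = computeLinearRegression(points);
//     // for this almost linear data: gradient ≈ 2, socket ≈ 0, correlation ≈ 1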
/**
* Compute linear regression over a time series with zero-based indices.
* @remark using the indices as x-values, the calculations for a regression line
* can be simplified, using the known closed formulas for sums of integers,
* shifting the indices to 0…n-1 (leaving out the 0 and 0² term)
* - `1+2+…+n = n·(n+1)/2`
* - `1²+2²+…+n² = n·(n+1)·(2n+1)/6`
* @return `(socket,gradient,correlation)` describing the regression line y = socket + gradient · i
*/
template<typename D>
inline auto computeTimeSeriesLinearRegression(DataSpan<D> const& series)
{
if (series.size() < 2) return make_tuple(0.0,0.0,0.0);
auto [ysum,yysum, xysum] = computeStatSums(series);
size_t n = series.size();
double im = (n-1)/2.0; // mean of zero-based indices i ∈ {0 … n-1}
double ym = ysum / n; // mean y
double varx = (n-1)*(n+1)/12.0; // variance of zero-based indices Σ(i-im)² / n
double vary = yysum/n - ym*ym; // variance of data values Σ(y-ym)² / n
double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n*ym = Σy
// Linear Regression minimising σ²
double gradient = cova / (n*varx); // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy)
double socket = ym - gradient * im; // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket
// Correlation (Pearson's r)
double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary);
return make_tuple(socket,gradient,correlation);
}
inline auto computeTimeSeriesLinearRegression(VecD const& series)
{ return computeTimeSeriesLinearRegression(DataSpan<double>{series}); }
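// Illustrative usage sketch (added, not part of the original file): detecting a
// trend within a sequence of measurements, using the index i = 0…n-1 as predictor.
//
//     VecD series{10.0, 10.2, 10.4, 10.6};
//     auto [socket, gradient, correlation] = computeTimeSeriesLinearRegression(series);
//     // exact linear trend: socket == 10.0, gradient == 0.2, correlation == 1.0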
}//(End)namespace util
#endif /*TESTRUNNER_UTIL_STATISTIC_HPP_*/