/* STATISTIC.hpp - helpers for generic statistics calculations Copyright (C) Lumiera.org 2022, Hermann Vosseler This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /** @file statistic.cpp ** Support for generic statistics calculations. ** - average over the N last elements in a data sequence ** - simple linear regression with weights (single predictor variable) ** - also over a time series with zero-based indices ** */ #ifndef LIB_STAT_STATISTIC_H #define LIB_STAT_STATISTIC_H #include "lib/error.hpp" #include "lib/nocopy.hpp" #include "lib/iter-adapter.hpp" #include "lib/format-string.hpp" #include "lib/util.hpp" #include #include #include #include #include namespace lib { namespace stat{ namespace error = lumiera::error; using std::fabs; using std::array; using std::tuple; using std::make_tuple; using std::forward; using std::move; using util::min; using util::max; using util::isnil; using util::_Fmt; using VecD = std::vector; /** helper to unpack a std::tuple into a homogeneous std::array */ template constexpr auto array_from_tuple (TUP&& tuple) { constexpr auto makeArray = [](auto&& ... x) { return std::array{forward (x) ...}; }; return std::apply (makeArray, forward (tuple)); } template inline double round (double val) { constexpr double shift{pow(10.0, places)}; return std::round(val*shift) / shift; } /** * Read-only view into a segment within a sequence of data * @tparam D value type of the data series * @remark simplistic workaround since we don't support C++20 yet * @todo replace by const std::span */ template class DataSpan : util::Cloneable { const D* const b_{nullptr}; const D* const e_{nullptr}; public: DataSpan() = default; DataSpan (D const& begin, D const& end) : b_{&begin} , e_{&end} { if (e_ < b_) throw error::Invalid{"End point before begin."}; } template DataSpan (CON const& container) : DataSpan{*std::begin(container), *std::end(container)} { } using iterator = const D*; using const_iterator = iterator; size_t size() const { return e_ - b_; } bool empty() const { return b_ == e_;} iterator begin() const { return b_; } iterator end() const { return e_; } friend const_iterator begin (DataSpan const& span){ return span.begin();} friend const_iterator end (DataSpan const& span){ return span.end(); } D const& operator[](size_t i) const { return *(b_ + i); } D const& at(size_t i) const { if (i >= size()) throw error::Invalid{_Fmt{"Index %d beyond size=%d"} % i % size()}; return this->operator[](i); } }; /** deduction guide: derive content from container. */ template DataSpan (CON const& container) -> DataSpan::value_type>; /** summation of variances, for error propagation: √Σe² */ template inline double errorSum (NUMS ...vals) { auto sqr = [](auto val){ return val*val; }; return sqrt((sqr(vals)+ ... + 0.0)); } template inline double average (DataSpan const& data) { if (isnil(data)) return 0.0; double sum = 0.0; for (auto val : data) sum += val; return sum / data.size(); } template inline double sdev (DataSpan const& data, D mean) { if (isnil(data)) return 0.0; double sdev = 0.0; for (auto val : data) { D offset = val - mean; sdev += offset*offset; } size_t n = data.size(); sdev /= n<2? 1: n-1; return sqrt (sdev); } inline double sdev (VecD const& data, double mean) { return sdev(DataSpan{data}, mean); } inline DataSpan lastN (VecD const& data, size_t n) { n = min (n, data.size()); size_t oldest = data.size() - n; return DataSpan{data[oldest], *data.end()}; } inline double averageLastN (VecD const& data, size_t n) { return average (lastN (data,n)); } inline double sdevLastN (VecD const& data, size_t n, double mean) { return sdev (lastN (data,n), mean); } /** "building blocks" for mean, variance and covariance of time series data */ template inline auto computeStatSums (DataSpan const& series) { double ysum = 0.0; double yysum = 0.0; double xysum = 0.0; size_t x = 0; for (auto& y : series) { ysum += y; yysum += y*y; xysum += x*y; ++x; } return make_tuple (ysum,yysum, xysum); } /** * Single data point used for linear regression. * Simple case: single predictor variable (x). * @remark including a weight factor */ struct RegressionPoint { double x; double y; double w; }; using RegressionData = std::vector; /** "building blocks" for weighted mean, weighted variance and covariance */ inline auto computeWeightedStatSums (DataSpan const& points) { std::array sums; sums.fill(0.0); auto& [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = sums; for (auto& p : points) { wsum += p.w; wxsum += p.w * p.x; wysum += p.w * p.y; wxxsum += p.w * p.x*p.x; wyysum += p.w * p.y*p.y; wxysum += p.w * p.x*p.y; } return sums; } /** * Compute simple linear regression with a single predictor variable (x). * @param points 2D data to fit the linear model with, including weights. * @return the computed linear model `b + a·x`, and the resulting fit * - socket (constant offset `b`) * - gradient (linear factor `a`) * - a vector with a predicted `y` value for each `x` value * - a vector with the error, i.e `Δ = y - y_predicted` * - correlation between x and y values * - maximum absolute delta * - delta standard deviation */ inline auto computeLinearRegression (DataSpan const& points) { auto [wsum, wxsum, wysum, wxxsum, wyysum, wxysum] = computeWeightedStatSums(points); double xm = wxsum / wsum; // weighted mean x = 1/Σw · Σwx double ym = wysum / wsum; double varx = wxxsum + xm*xm * wsum - 2*xm * wxsum; // Σw · x-Variance = Σw(x-xm)² double vary = wyysum + ym*ym * wsum - 2*ym * wysum; double cova = wxysum + xm*ym * wsum - ym * wxsum - xm * wysum; // Σw · Covariance = Σw(x-xm)(y-ym) // Linear Regression minimising σ² double gradient = cova / varx; // gradient = correlation · σy / σx ; σ = √Variance double socket = ym - gradient * xm; // Regression line: Y-ym = gradient · (x-xm) ; set x≔0 yields socket // Correlation (Pearson's r) double correlation = wyysum==0.0? 1.0 : gradient * sqrt(varx/vary); // calculate error Δ for all measurement points size_t n = points.size(); VecD predicted; predicted.reserve(n); VecD deltas; deltas.reserve(n); double maxDelta = 0.0; double variance = 0.0; for (auto& p : points) { double y_pred = socket + gradient * p.x; double delta = p.y - y_pred; predicted.push_back (y_pred); deltas.push_back (delta); maxDelta = max (maxDelta, fabs(delta)); variance += p.w * delta*delta; } variance /= wsum * (n<=2? 1 : (n-2)/double(n)); // N-2 because it's an estimation, // based on 2 other estimated values (socket,gradient) return make_tuple (socket,gradient ,move(predicted) ,move(deltas) ,correlation ,maxDelta ,sqrt(variance) ); } inline auto computeLinearRegression (RegressionData const& points) { return computeLinearRegression (DataSpan{points}); } /** * Compute linear regression over a time series with zero-based indices. * @remark using the indices as x-values, the calculations for a regression line * can be simplified, using the known closed formula for a sum of integers, * shifting the indices to 0…n-1 (leaving out the 0 and 0² term) * - `1+…+n = n·(n+1)/2` * - `1+…+n² = n·(n+1)·(2n+1)/6` * @return `(socket,gradient)` to describe the regression line y = socket + gradient · i */ template inline auto computeTimeSeriesLinearRegression (DataSpan const& series) { if (series.size() < 2) return make_tuple(0.0,0.0,0.0); auto [ysum,yysum, xysum] = computeStatSums(series); size_t n = series.size(); double im = (n-1)/2.0; // mean of zero-based indices i ∈ {0 … n-1} double ym = ysum / n; // mean y double varx = (n-1)*(n+1)/12.0; // variance of zero-based indices Σ(i-im)² / n double vary = yysum/n - ym*ym; // variance of data values Σ(y-ym)² / n double cova = xysum - ysum *(n-1)/2; // Time series Covariance = Σ(i-im)(y-ym) = Σiy + im·ym·n - ym·Σi - im·Σy; use n·ym ≙ Σy // Linear Regression minimising σ² double gradient = cova / (n*varx); // Gradient = Correlation · σy / σx ; σ = √Variance; Correlation = Covariance /(√Σx √Σy) double socket = ym - gradient * im; // Regression line: Y-ym = Gradient · (i-im) ; set i≔0 yields socket // Correlation (Pearson's r) double correlation = yysum==0.0? 1.0 : gradient * sqrt(varx/vary); return make_tuple (socket,gradient,correlation); } inline auto computeTimeSeriesLinearRegression (VecD const& series) { return computeTimeSeriesLinearRegression (DataSpan{series}); } }} // namespace lib::stat #endif /*LIB_STAT_STATISTIC_H*/