LUMIERA.clone/src/lib/test/microbenchmark.hpp

134 lines
5.2 KiB
C++

/*
MICROBENCHMARK.hpp - multithreaded timing measurement
Copyright (C) Lumiera.org
2018, Hermann Vosseler <Ichthyostega@web.de>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/** @file microbenchmark.hpp
** A function to perform multithreaded timing measurement on a given functor.
** This helper simplifies micro benchmarks of isolated implementation details.
** The test subject, given as function object or lambda, is copied into N threads
** and invoked numerous times within a tight loop. After waiting on termination of
** the test threads, results are summed up and then averaged into microseconds
** per single invocation. The actual timing measurement relies on `chrono::duration`,
** which means to count micro ticks of the OS.
** @warning care has to be taken when optimisation is involved!
** Optimisation usually has quite some impact on the results, but since
** this function is inline, the lambda can typically be inlined and the
** loop possibly be optimised away entirely. A simple workaround is to
** define a _volatile_ variable in the call context, close the lambda
** by reference, and perform a comparison with that volatile variable
** in each invocation. The compiler is required actually to access the
** value of the volatile each time.
** @remarks some interesting observations (in my setup, 8 core AMD FX-8350)
** - if we replace the global volatile by a local variable within the
** test subject, the initialisation of that local typically costs +5ns
** per invocation.
** - incrementing the volatile costs +10ns
** - multithreaded (unlocked) incrementing of the _global_ volatile
** creates massive overhead and increases the running time by factor 100.
** This nicely confirms that the x86_64 platform has strong cache coherence.
**
*/
#ifndef LIB_TEST_MICROBENCHMARK_H
#define LIB_TEST_MICROBENCHMARK_H
#include "vault/thread-wrapper.hpp"
#include <chrono>
#include <vector>
namespace lib {
namespace test{
namespace { // defaults and scaling used by the microbenchmark function

  /** loop count per thread, used when the caller does not pass `nRepeat` */
  constexpr size_t DEFAULT_RUNS = 10 * 1000 * 1000;

  /** factor applied to the averaged duration (measured in seconds) */
  constexpr double SCALE = 1000.0 * 1000.0;        // Results are in µ sec
}
/** perform a multithreaded microbenchmark.
 * This function fires up a number of threads
 * and invokes the given test subject repeatedly.
 * @tparam nThreads number of threads to run in parallel
 * @tparam FUN type of the test subject (function object or lambda)
 * @param subject `void(void)` function to be timed
 * @param nRepeat number of invocations performed within _each_ thread
 * @return the averaged invocation time in _microseconds_
 * @remarks - the subject function will be _copied_ into each thread
 *          - so `nThreads` copies of this function will run in parallel
 *          - consider locking if this function accesses a shared closure.
 *          - if you pass a lambda, it is eligible for inlining followed
 *            by loop optimisation -- be sure to include some action, like
 *            e.g. accessing a volatile variable, to prevent the compiler
 *            from optimising it away entirely.
 *          - time is taken with `std::chrono::steady_clock`, which is monotonic;
 *            adjustments of the wall clock during the run can thus not distort
 *            (or even turn negative) the measured interval.
 */
template<size_t nThreads, class FUN>
inline double
microbenchmark(FUN const& subject, const size_t nRepeat = DEFAULT_RUNS)
{
  using backend::ThreadJoinable;
  using std::chrono::steady_clock;
  using Dur = std::chrono::duration<double>;   // floating point seconds

  struct Thread
    : ThreadJoinable
    {
      Thread(FUN const& subject, size_t loopCnt)
        : ThreadJoinable("Micro-Benchmark"
                        ,[this, subject, loopCnt]()       // local copy of the test-subject-Functor
                            {
                              syncPoint();                // block until all threads are ready
                              auto start = steady_clock::now();
                              for (size_t i=0; i < loopCnt; ++i)
                                subject();
                              duration = steady_clock::now() - start;
                            })
        { }

      /** measured time within thread */
      Dur duration{};
      // NOTE(review): `duration` is initialised after the base class; this presumably
      //               relies on syncPoint() holding the thread back until construction
      //               is complete -- confirm against ThreadJoinable.
    };

  std::vector<Thread> threads;
  threads.reserve(nThreads);                   // each closure holds `this`: elements must never be relocated
  for (size_t n=0; n<nThreads; ++n)            // create test threads
    threads.emplace_back (subject, nRepeat);

  for (auto& thread : threads)
    thread.sync();                             // start timing measurement

  Dur sumDuration{0.0};
  for (auto& thread : threads)
    {
      thread.join();                           // block on measurement end
      sumDuration += thread.duration;
    }

  // accumulated seconds / total number of invocations, scaled to the result unit
  return sumDuration.count() / (nThreads * nRepeat) * SCALE;
}
}} // namespace lib::test
#endif /*LIB_TEST_MICROBENCHMARK_H*/