LUMIERA.clone/src/lib/test/microbenchmark.hpp

/*
  MICROBENCHMARK.hpp  -  multithreaded timing measurement

  Copyright (C)         Lumiera.org
    2018,               Hermann Vosseler <Ichthyostega@web.de>

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/


/** @file microbenchmark.hpp
 ** A function to perform multithreaded timing measurement on a given functor.
 ** This helper simplifies micro benchmarks of isolated implementation details.
 ** The test subject, given as function object or lambda, is copied into N threads
 ** and invoked numerous times within a tight loop. After waiting on termination of
 ** the test threads, results are summed up and then averaged into microseconds
 ** per single invocation. The actual timing measurement relies on `chrono::duration`,
 ** which means to count micro ticks of the OS.
 ** @warning care has to be taken when optimisation is involved!
 **     Optimisation usually has quite some impact on the results, but since
 **     this function is inline, the lambda can typically be inlined and the
 **     loop possibly be optimised away entirely. A simple workaround is to
 **     define a _volatile_ variable in the call context, close the lambda
 **     by reference, and perform a comparison with that volatile variable
 **     in each invocation. The compiler is required actually to access the
 **     value of the volatile each time.
 ** @remarks some interesting observations (in my setup, 8 core AMD FX-8350)
 **     - if we replace the global volatile by a local variable within the
 **       test subject, the initialisation of that local typically costs +5ns
 **       per invocation.
 **     - incrementing the volatile costs +10ns
 **     - multithreaded (unlocked) incrementing of the _global_ volatile
 **       creates massive overhead and increases the running time by factor 100.
 **       This nicely confirms that the x86_64 platform has strong cache coherence.
 **
 */


#ifndef LIB_TEST_MICROBENCHMARK_H
#define LIB_TEST_MICROBENCHMARK_H


#include "vault/thread-wrapper.hpp"

#include <chrono>
#include <vector>


namespace lib {
namespace test{

  namespace { // defaults used by the benchmark helper in this header
    /** factor applied to a time span given in seconds: results are in microseconds */
    constexpr double SCALE = 1000.0 * 1000.0;
    /** number of invocations per thread, unless the caller specifies a count */
    constexpr size_t DEFAULT_RUNS = 10 * 1000 * 1000;
  }


  /** perform a multithreaded microbenchmark.
   * This function fires up a number of threads
   * and invokes the given test subject repeatedly.
   * @tparam nThreads number of threads to run in parallel
   * @tparam FUN type of the test subject (function object or lambda)
   * @param subject `void(void)` function to be timed
   * @param nRepeat number of invocations performed within each single thread
   * @return the averaged invocation time in _microseconds_
   * @remarks - the subject function will be _copied_ into each thread
   *          - so `nThreads` copies of this function will run in parallel
   *          - consider locking if this function accesses a shared closure.
   *          - if you pass a lambda, it is eligible for inlining followed
   *            by loop optimisation -- be sure to include some action, like
   *            e.g. accessing a volatile variable, to prevent the compiler
   *            from optimising it away entirely.
   *          - time spans are taken from `std::chrono::steady_clock`, which is
   *            monotonic; adjustments of the wall clock can not distort results.
   * @note both `nThreads` and `nRepeat` should be nonzero; otherwise the final
   *       averaging divides by zero and the result is not a finite number.
   */
  template<size_t nThreads, class FUN>
  inline double
  microbenchmark(FUN const& subject, const size_t nRepeat = DEFAULT_RUNS)
  {
    using backend::ThreadJoinable;
    using std::chrono::steady_clock;               // monotonic clock, suitable for measuring intervals

    using Dur = std::chrono::duration<double>;     // floating point seconds

    struct Thread
      : ThreadJoinable
      {
        Thread(FUN const& subject, size_t loopCnt)
          : ThreadJoinable("Micro-Benchmark"
                          ,[this, subject, loopCnt]() // local copy of the test-subject-Functor
                             {
                               syncPoint();        // block until all threads are ready
                               auto start = steady_clock::now();
                               for (size_t i=0; i < loopCnt; ++i)
                                 subject();
                               duration = steady_clock::now() - start;
                             })
          { }
        /** measured time within thread */
        Dur duration{};
      };

    std::vector<Thread> threads;
    threads.reserve(nThreads);                     // no reallocation: each thread functor holds a `this` pointer
    for (size_t n=0; n<nThreads; ++n)              // create test threads
      threads.emplace_back (subject, nRepeat);

    for (auto& thread : threads)
      thread.sync();                               // start timing measurement

    Dur sumDuration{0.0};
    for (auto& thread : threads)
      {
        thread.join();                             // block on measurement end
        sumDuration += thread.duration;
      }

    // multiply as double, so the total invocation count can not wrap around in size_t
    const double invocations = static_cast<double>(nThreads) * static_cast<double>(nRepeat);
    return sumDuration.count() / invocations * SCALE;
  }


}} // namespace lib::test
#endif /*LIB_TEST_MICROBENCHMARK_H*/