From 7474f56e89b23943c5b98d2faf60f271e615beff Mon Sep 17 00:00:00 2001 From: Ichthyostega Date: Sun, 24 Sep 2023 20:38:27 +0200 Subject: [PATCH] Library: investigate performance of SyncBarrier Timing measurements in concurrent usage situation. Observed delay is in the order of magnitude of known scheduling leeway; assuming thus no relevant overhead related to implementation technique --- src/lib/sync-barrier.hpp | 8 + .../library/sync-barrier-performance-test.cpp | 87 ++++-- wiki/thinkPad.ichthyo.mm | 258 +++++++++++++++--- 3 files changed, 295 insertions(+), 58 deletions(-) diff --git a/src/lib/sync-barrier.hpp b/src/lib/sync-barrier.hpp index e3964f4ec..6ba40fb3b 100644 --- a/src/lib/sync-barrier.hpp +++ b/src/lib/sync-barrier.hpp @@ -35,6 +35,13 @@ ** @todo as of 9/2023 it remains to be seen if this facility is just a pre-C++20 workaround; ** otherwise it may present distinct performance characteristics than std::latch, ** possibly also a slightly more abstracted (and thus clearer) usage API. + ** @remark Typical overhead measured with optimised build on 8 Core machine + ** - Sync 2 threads : 280ns + ** - Sync 4 threads : 700ns + ** - increasing with number of threads, which implies we are measuring the time + ** it takes all threads to catch-up on average... + ** - these values are on par with typical thread scheduling leeway, + ** so this implementation seems adequate for the time being (2023). */ @@ -60,6 +67,7 @@ namespace lib { * when stretched out over extended time. * @remark intended use is to allow all participants to catch up and reach * a well defined point with initialisation or implementation logic. + * @see SyncBarrierPerformance_test::run for actual performance measurements! 
*/ class SyncBarrier : util::NonCopyable diff --git a/tests/library/sync-barrier-performance-test.cpp b/tests/library/sync-barrier-performance-test.cpp index 1692efaac..2e7d89061 100644 --- a/tests/library/sync-barrier-performance-test.cpp +++ b/tests/library/sync-barrier-performance-test.cpp @@ -28,38 +28,29 @@ #include "lib/test/run.hpp" #include "lib/sync-barrier.hpp" -//#include "lib/iter-explorer.hpp" -//#include "lib/util-foreach.hpp" #include "lib/test/microbenchmark.hpp" -#include "lib/test/diagnostic-output.hpp" /////////////////////TODO - -//#include -//#include -//#include -#include +#include "lib/format-cout.hpp" using test::Test; -//using util::and_all; -//using lib::explore; using std::array; -//using std::atomic_uint; -using std::this_thread::sleep_for; -using namespace std::chrono_literals; - namespace lib { namespace test { - namespace {// Test setup for a concurrent calculation with checksum.... + namespace {// Test setup.... const uint NUM_STAGES = 1024; /** + * Empty placeholder implementation. + * Used for measurement of test setup overhead. */ class FakeBarrier { public: + FakeBarrier(uint=0) { /* be happy */ } + void sync() { /* indulge */ } }; }//(End)Test setup @@ -68,41 +59,85 @@ namespace test { /*******************************************************************//** * @test investigate performance of N-fold thread synchronisation. 
- * - start a _huge number_ of TestThread - * - all those pick up the partial sum from stage1 - * @remark without coordinated synchronisation, some threads would see - * an incomplete sum and thus the stage2 checksum would be lower + * - use the [multithreaded Microbenchmark](\ref lib::test::threadBenchmark() ) + * - use an array of consecutively used barriers, one for each per-thread repetition + * - test function is parametrised for comparison of different barrier implementations + * @warning for actually be useful, this test should be compiled with `-O3` and be invoked + * stand-alone several times, while otherwise system load is low * @see lib::SyncBarrier * @see steam::control::DispatcherLoop */ class SyncBarrierPerformance_test : public Test { - template + template double performanceTest() { + BAR barrier[NUM_STAGES]; + for (uint i=0; i size_t { - sleep_for (1us); - return 1; + barrier[i].sync(); + return i; // prevent empty loop optimisation }; auto [micros, cnt] = threadBenchmark (testSubject, NUM_STAGES); - CHECK (cnt == nThreads*NUM_STAGES); + CHECK (cnt == nThreads * NUM_STAGES*(NUM_STAGES-1)/2); return micros; } + + /** @test performance investigation of N-fold synchronisation barrier + * @remark typical values observed with release-build on a 8-core machine + * - emptySetup : 0.6ns + * - SyncBarrier (2 Thr) : 280ns + * - SyncBarrier (4 Thr) : 700ns + * - SyncBarrier (8 Thr) : 2µs + * - SyncBarrier (16 Thr) : 9µs + * - SyncBarrier (32 Thr) : 21µs + * - SyncBarrier (48 Thr) : 30µs + * - SyncBarrier (64 Thr) : 50µs + * - SyncBarrier (80 Thr) : 80µs + * @note what we are measuring here is actually the *time to catch up* + * for all threads involved, implying we are observing the _operational_ + * delay introduced by synchronisation, and not an overhead of the + * implementation technique. 
+ */ virtual void run (Arg) { cout<<"\n\n■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■"<(); - cout<<"\n___Microbenchmark____" - <<"\nemptySetup : "<(); + double time_yieldWait_64 = performanceTest(); + double time_yieldWait_48 = performanceTest(); + double time_yieldWait_32 = performanceTest(); + double time_yieldWait_16 = performanceTest(); + double time_yieldWait_8 = performanceTest(); + double time_yieldWait_4 = performanceTest(); + double time_yieldWait_2 = performanceTest(); + // + double time_emptySetup = performanceTest(); + + cout<<"\n___Microbenchmark_______" + <<"\nemptySetup : "< + + + + + + + + + + + @@ -79226,16 +79237,16 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - + - - + + @@ -79253,7 +79264,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -79263,11 +79274,11 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - + @@ -79277,7 +79288,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -79289,8 +79300,42 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
+ + + + + + + + + +
    +
  • + 1 Thread 100ns +
  • +
  • + 2 Threads = 400ns +
  • +
  • + 100 Threads ⟶ 20µs +
  • +
  • + 1000 Threads ⟶ 250µs +
  • +
  • + 2000 Threads ⟶ 500µs +
  • +
+ +
- + + + + + +
+ @@ -79331,63 +79376,212 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - - + + + + + + + + +

+ für den BlockFlow-Test habe ich das definitiv gebraucht, um damit eine »Zeitachse« zu konstruieren; und auch für multithreaded-Tests ist das innerhalb des einzelnen Thread durchaus sinnvoll (⟹ siehe SyncBarrierPerformance_test) +

+ +
+ +
+ - +
- + - - - + + + + - + + + + + + + + + + +

+ es ist ja ein einziger Zufallszahlengenerator, und es wäre eine schlechte Idee, wenn die Stdlib das nicht gegen concurrency schützen würde +

+ + +
+ + +
+
+ - - - + + + - + - + - - + + + + + + - - - - + + + + - + - + + - + + + + + + + + + + + + + +

+ Messungen(Release-Build) +

+ +
+ + + + + + + + + + + + + + + + + + + + +
    +
  • + die Werte sind zwar verdächtig klein, aber stabil. +
  • +
  • + habe zum Vergleich einmal den testSubject(i)-Aufruf in der Schleife auskommentiert ⟹ Werte um > Faktor 10 kleiner, und fluktuieren stark +
  • +
  • + es ist wichtig, keine Konstante aus der Schleife zurückzugeben (sondern die Index-Variable). Mit Konstante verhält sich die Schleife wie leer! +
  • +
+ +
+
+ + + + + + +

+ ...die führen dann nochmal zu um den Faktor 10 größeren Werten (was mit meiner Erfahrung konsistent ist).
Daher erscheint die aktuelle Lösung als optimal: wir zwingen den Optimiser, die Schleife auszuführen, weil ein Wert berechnet wird; dieser greift aber nur auf eine Variable in der Klasse zu, und muß nicht atomar, volatil oder synchronisiert sein. Mit diesem Setup kann man also auch den Einfluß von Atomic-Zugriffen noch gut messen +

+ +
+
+
+ + + + + + + + + + +

+ wir messen, wie lange ein Thread im Durchschnitt braucht, bis er sich via SyncBarrier mit den anderen Partner-Threads synchronisiert hat. Dieser Wert ist nicht deterministisch, da die zeitliche Lage der Threads zueinander nicht deterministisch ist. Wir können aber auch nicht anders messen, da der Thread typischerweise in der sync()-Funktion blockt. +

+ + +
+
+ + + + + + +

+ ⟹ wir beobachten die Barriere bei ihrer bestimmungsgemäßen Arbeit +

+ + +
+ +
+ + + + + + +

+ ⟹ wir bekommen so nicht den Implementierungs-Overhead zu fassen +

+ + +
+
+