Library: investigate Mutex+Condition-Var for comparison

...which is the technique used in the existing Threadpool framework.
As expected, such a solution is significantly slower than the new
atomics-based implementation. Yet how much slower is still striking.
This commit is contained in:
Fischlurch 2023-09-24 21:52:38 +02:00
parent 7474f56e89
commit 11cb53a406
3 changed files with 75 additions and 14 deletions

View file

@ -40,6 +40,7 @@
** - Sync 4 threads : 700ns
** - increasing with number of threads, which implies we are measuring the time
** it takes all threads to catch-up on average...
** - the classical Mutex+Condition-Var solution is slower by orders of magnitude!
** - these values are on par with typical thread scheduling leeway,
** so this implementation seems adequate for the time being (2023).
*/

View file

@ -30,6 +30,7 @@
#include "lib/sync-barrier.hpp"
#include "lib/test/microbenchmark.hpp"
#include "lib/format-cout.hpp"
#include "lib/sync.hpp"
using test::Test;
using std::array;
@ -52,6 +53,35 @@ namespace test {
FakeBarrier(uint=0) { /* be happy */ }
void sync() { /* indulge */ }
};
/**
 * Reference barrier for comparison, built on the Monitor pattern
 * (Mutex + Condition Variable, i.e. waiting threads block instead of spinning).
 * The object is set up for a given number of participants; each participant
 * calls #sync() exactly once and is released as soon as all of them have arrived.
 * @note every released thread issues a further `notifyAll()`, not only the
 *       last one to arrive; this is part of what the benchmark measures.
 */
class MonitorSync
  : public Sync<NonrecursiveLock_Waitable>
  {
    int latch_;   ///< number of participants still missing; counted down by sync()
    
    /** wait condition: holds once the count-down has reached (or passed) zero */
    bool
    allPassed()
      {
        return not (0 < latch_);
      }
    
  public:
    /** @param nFold number of threads expected to call sync() */
    MonitorSync (uint nFold =2)
      : latch_(static_cast<int> (nFold))
      { }
    
    /**
     * Announce arrival and block until all participants have arrived.
     * All steps are performed while holding the monitor lock guard.
     */
    void
    sync()
      {
        Lock guard(this);
        latch_ -= 1;                                   // register this thread's arrival
        guard.wait (*this, &MonitorSync::allPassed);   // returns once latch_ <= 0
        guard.notifyAll();                             // pass on the wake-up to the other waiting threads
      }
  };
}//(End)Test setup
@ -101,10 +131,16 @@ namespace test {
* - SyncBarrier (48 Thr) : 30µs
* - SyncBarrier (64 Thr) : 50µs
* - SyncBarrier (80 Thr) : 80µs
* - MonitorWait (2 Thr) : 7µs
* - MonitorWait (4 Thr) : 12µs
* - MonitorWait (8 Thr) : 27µs
* - MonitorWait (16 Thr) : 75µs
* @note what we are measuring here is actually the *time to catch up*
* for all threads involved, implying we are observing the _operational_
* delay introduced by synchronisation, and not an overhead of the
* implementation technique.
* implementation technique as such. However the classical implementation
* based on Mutex + ConditionVar, which enters a thread sleep state on wait,
* is slower by orders of magnitude.
*/
virtual void
run (Arg)
@ -121,9 +157,15 @@ namespace test {
double time_yieldWait_2 = performanceTest<SyncBarrier, 2>();
//
double time_emptySetup = performanceTest<FakeBarrier, 5>();
//
double time_sleepWait_16 = performanceTest<MonitorSync, 16>();
double time_sleepWait_8 = performanceTest<MonitorSync, 8>();
double time_sleepWait_4 = performanceTest<MonitorSync, 4>();
double time_sleepWait_2 = performanceTest<MonitorSync, 2>();
cout<<"\n___Microbenchmark_______"
<<"\nemptySetup : "<<time_emptySetup
<<"\n : "
<<"\nSyncBarrier (2 Thr) : "<<time_yieldWait_2
<<"\nSyncBarrier (4 Thr) : "<<time_yieldWait_4
<<"\nSyncBarrier (8 Thr) : "<<time_yieldWait_8
@ -132,6 +174,11 @@ namespace test {
<<"\nSyncBarrier (48 Thr) : "<<time_yieldWait_48
<<"\nSyncBarrier (64 Thr) : "<<time_yieldWait_64
<<"\nSyncBarrier (80 Thr) : "<<time_yieldWait_80
<<"\n : "
<<"\nMonitorWait (2 Thr) : "<<time_sleepWait_2
<<"\nMonitorWait (4 Thr) : "<<time_sleepWait_4
<<"\nMonitorWait (8 Thr) : "<<time_sleepWait_8
<<"\nMonitorWait (16 Thr) : "<<time_sleepWait_16
<<"\n_____________________\n"
<<"\nbarriers..... "<<NUM_STAGES
<<endl;

View file

@ -79021,7 +79021,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
<node CREATED="1695314579779" ID="ID_643512170" MODIFIED="1695314602498" TEXT="diese mu&#xdf; initialisiert sein, bevor der Session-Thread ihre Logik verwendet"/>
</node>
</node>
<node COLOR="#338800" CREATED="1695315426871" ID="ID_1792980090" MODIFIED="1695484826031" TEXT="&#x27f9; Ersatzkonstrukt zwingend notwendig">
<node COLOR="#338800" CREATED="1695315426871" FOLDED="true" ID="ID_1792980090" MODIFIED="1695484826031" TEXT="&#x27f9; Ersatzkonstrukt zwingend notwendig">
<node CREATED="1695334520345" ID="ID_1748880887" MODIFIED="1695334539706" TEXT="sollte dann aber eine explizite Lib-Funktionalit&#xe4;t sein"/>
<node CREATED="1695334551413" ID="ID_257643954" LINK="https://stackoverflow.com/a/24218922" MODIFIED="1695334594067" TEXT="man k&#xf6;nnte ein spinning-latch mit yield verwenden">
<icon BUILTIN="idea"/>
@ -79106,6 +79106,19 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
<node COLOR="#435e98" CREATED="1695580020491" ID="ID_1772729195" MODIFIED="1695580029633" TEXT="Fazit">
<font BOLD="true" NAME="SansSerif" SIZE="12"/>
<icon BUILTIN="forward"/>
<node BACKGROUND_COLOR="#e0ceaa" COLOR="#5b3f20" CREATED="1695583819999" ID="ID_389670039" MODIFIED="1695584410343" TEXT="Yess... die klassische L&#xf6;sung ist viel aufwendiger...">
<richcontent TYPE="NOTE"><html>
<head>
</head>
<body>
<p>
...trotzdem war ich &#252;berrascht, <i>um wie viel langsamer</i>&#160;sie ist; das kann ich mir eigentlich nur dadurch erkl&#228;ren, da&#223; die Threads <i>in einen Schlafzustand versetzt </i>werden, ggfs auch bereits schon beim Versuch, die exclusive Zone zu betreten. M&#246;glicherweise dauert es auch grunds&#228;tzlich l&#228;nger, bis ein schlafender Thread &#252;berhaupt wieder aufgeweckt wird. Die Progression scheint allerdings linear in der Zahl der Threads zu sein, w&#228;hrend die Atomic-yield-Implementierung etwas &#252;berproportional langsamer wird. Das ist jetzt aber mehr Intuition, denn jenseits von 8 Threads gibt es ja zunehmend Stau im OS-Scheduler
</p>
</body>
</html></richcontent>
<icon BUILTIN="ksmiletris"/>
</node>
<node CREATED="1695580030937" ID="ID_634357060" MODIFIED="1695580179481" TEXT="die Performance von SyncBarrier ist ad&#xe4;quat f&#xfc;r den Einsatzzweck"/>
<node CREATED="1695580048111" ID="ID_784900194" MODIFIED="1695580199504" TEXT="es ist kein Overhead beobachtbar &#x2014; jenseits der typischen Scheduling-Unsch&#xe4;rfe">
<arrowlink COLOR="#c6fdd1" DESTINATION="ID_1515850328" ENDARROW="Default" ENDINCLINATION="-864;-29;" ID="Arrow_ID_1385448927" STARTARROW="None" STARTINCLINATION="99;698;"/>
@ -79216,14 +79229,14 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
<node BACKGROUND_COLOR="#eee5c3" COLOR="#990000" CREATED="1695394744028" ID="ID_1755648327" MODIFIED="1695394746744" TEXT="ThreadWrapperSelfRecognitionTest_test">
<icon BUILTIN="flag-yellow"/>
</node>
<node COLOR="#338800" CREATED="1695394753483" ID="ID_1338410455" MODIFIED="1695484708443" TEXT="SyncBarrier_test">
<node COLOR="#338800" CREATED="1695394753483" FOLDED="true" ID="ID_1338410455" MODIFIED="1695584550744" TEXT="SyncBarrier_test">
<icon BUILTIN="button_ok"/>
<node COLOR="#435e98" CREATED="1695394763010" ID="ID_1220273122" MODIFIED="1695484703977" TEXT="neuer Test f&#xfc;r neue (interims-) Implementierung">
<linktarget COLOR="#2b3fa9" DESTINATION="ID_1220273122" ENDARROW="Default" ENDINCLINATION="169;-7;" ID="Arrow_ID_1739726561" SOURCE="ID_281891239" STARTARROW="None" STARTINCLINATION="15;204;"/>
<icon BUILTIN="info"/>
</node>
<node COLOR="#435e98" CREATED="1695394861413" ID="ID_1334580079" MODIFIED="1695484809827" TEXT="ggfs auch gleich Performance-Test vorbereiten">
<icon BUILTIN="help"/>
<node COLOR="#435e98" CREATED="1695394861413" ID="ID_1334580079" MODIFIED="1695584537670" TEXT="auch gleich Performance-Test vorbereiten">
<icon BUILTIN="yes"/>
<node CREATED="1695484718567" ID="ID_1410950559" MODIFIED="1695484726450" TEXT="wird dann aber zu komplex"/>
<node CREATED="1695484727006" ID="ID_1647325641" MODIFIED="1695484735937" TEXT="Performance-Test braucht andere Zielsetzung"/>
<node CREATED="1695484752994" ID="ID_53306211" MODIFIED="1695484793336" TEXT="grunds&#xe4;tzlich aber sofort w&#xfc;nschenswert">
@ -79243,7 +79256,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
</node>
</node>
</node>
<node COLOR="#338800" CREATED="1695520134480" ID="ID_541502581" MODIFIED="1695578011351" TEXT="SyncBarrierPerformance_test">
<node COLOR="#338800" CREATED="1695520134480" FOLDED="true" ID="ID_541502581" MODIFIED="1695584521535" TEXT="SyncBarrierPerformance_test">
<linktarget COLOR="#5c9ed1" DESTINATION="ID_541502581" ENDARROW="Default" ENDINCLINATION="42;-52;" ID="Arrow_ID_499645960" SOURCE="ID_1096160672" STARTARROW="None" STARTINCLINATION="-155;12;"/>
<icon BUILTIN="button_ok"/>
<node COLOR="#435e98" CREATED="1695520189953" ID="ID_1322342349" MODIFIED="1695576421762" TEXT="microbenchmark.hpp &#xd83e;&#xdc46; threadBenchmark() verwenden">
@ -79428,8 +79441,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
es ist ja ein einziger Zufallszahlengenerator, und es w&#228;re eine schlechte Idee, wenn die Stdlib das nicht gegen concurrency sch&#252;tzen w&#252;rde
</p>
</body>
</html>
</richcontent>
</html></richcontent>
<linktarget COLOR="#e65386" DESTINATION="ID_356517631" ENDARROW="Default" ENDINCLINATION="343;0;" ID="Arrow_ID_1674760651" SOURCE="ID_1418903203" STARTARROW="None" STARTINCLINATION="109;97;"/>
<icon BUILTIN="stop-sign"/>
</node>
@ -79500,6 +79512,10 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
<node CREATED="1695576145934" ID="ID_1877591130" MODIFIED="1695576230336" TEXT="SyncBarrier (48 Thr) : 30&#xb5;s"/>
<node CREATED="1695576145935" ID="ID_469124083" MODIFIED="1695576238800" TEXT="SyncBarrier (64 Thr) : 50&#xb5;s"/>
<node CREATED="1695576145935" ID="ID_754356198" MODIFIED="1695576257440" TEXT="SyncBarrier (80 Thr) : 80&#xb5;s"/>
<node CREATED="1695583795249" MODIFIED="1695583795249" TEXT="MonitorWait (2 Thr) : 7&#xb5;s"/>
<node CREATED="1695583795249" MODIFIED="1695583795249" TEXT="MonitorWait (4 Thr) : 12&#xb5;s"/>
<node CREATED="1695583795250" MODIFIED="1695583795250" TEXT="MonitorWait (8 Thr) : 27&#xb5;s"/>
<node CREATED="1695583795250" MODIFIED="1695583795250" TEXT="MonitorWait (16 Thr) : 75&#xb5;s"/>
</node>
<node CREATED="1695576264352" ID="ID_1266717950" MODIFIED="1695578848332" TEXT="Debug-Build-Werte weichen nur wenig ab">
<icon BUILTIN="idea"/>
@ -79551,8 +79567,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
wir messen, wie lange ein Thread im Durchschnitt braucht, bis er sich via SyncBarrier mit den anderen Partner-Threads synchronisiert hat. Dieser Wert ist nicht deterministisch, da die zeitliche Lage der Threads zueinander nicht deterministisch ist. Wir k&#246;nnen aber auch nicht anders messen, da der Thread typischerweise in der sync()-Funktion blockt.
</p>
</body>
</html>
</richcontent>
</html></richcontent>
</node>
<node CREATED="1695579826996" ID="ID_1515850328" MODIFIED="1695580199504">
<richcontent TYPE="NODE"><html>
@ -79564,8 +79579,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
&#10233; wir beobachten die Barriere bei ihrer <b>bestimmungsgem&#228;&#223;en Arbeit</b>
</p>
</body>
</html>
</richcontent>
</html></richcontent>
<linktarget COLOR="#c6fdd1" DESTINATION="ID_1515850328" ENDARROW="Default" ENDINCLINATION="-864;-29;" ID="Arrow_ID_1385448927" SOURCE="ID_784900194" STARTARROW="None" STARTINCLINATION="99;698;"/>
</node>
<node CREATED="1695579847753" ID="ID_1248378503" MODIFIED="1695579872734">
@ -79578,8 +79592,7 @@ Date:&#160;&#160;&#160;Thu Apr 20 18:53:17 2023 +0200<br/>
&#10233; wir bekommen so <b>nicht</b>&#160;den <b>Implementierungs-Overhead</b>&#160; zu fassen
</p>
</body>
</html>
</richcontent>
</html></richcontent>
</node>
</node>
</node>