From 6a08c9754310ae000450f44936b74c03b1b14408 Mon Sep 17 00:00:00 2001 From: Ichthyostega Date: Thu, 11 Jan 2024 22:03:36 +0100 Subject: [PATCH] Scheduler-test: fix Segfault in test setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ...as it turned out, this segfault was caused by flaws in the ScheduleCtx used for generate the test-schedule; especially when all node-spreads are set to zero and thus all jobs are scheduled immediately at t=0, there was a loophole in the logic to set the dependencies for the final »wake-up« job. When running such a schedule in the Stress-Test-Bench, the next measurement run could be started due to a premature wake-up job, thereby overrunning the previous test-run, which could be still in the middle of computations. So this was not a bug in the Scheduler itself, yet something to take care of later when programming the actual Job-Planning and schedule generation. --- tests/vault/gear/scheduler-stress-test.cpp | 43 ++++- tests/vault/gear/test-chain-load-test.cpp | 2 +- tests/vault/gear/test-chain-load.hpp | 44 +++-- wiki/thinkPad.ichthyo.mm | 203 +++++++++++++++++++-- 4 files changed, 255 insertions(+), 37 deletions(-) diff --git a/tests/vault/gear/scheduler-stress-test.cpp b/tests/vault/gear/scheduler-stress-test.cpp index 916348d97..7c3fc7f24 100644 --- a/tests/vault/gear/scheduler-stress-test.cpp +++ b/tests/vault/gear/scheduler-stress-test.cpp @@ -328,15 +328,42 @@ namespace test { void investigateWorkProcessing() { -// TestChainLoad<8> testLoad{64}; -// testLoad.seedingRule(testLoad.rule().probability(0.6).minVal(2)) -// .pruningRule(testLoad.rule().probability(0.44)) -// .setSeed(55) -// .buildTopology() -// .printTopologyDOT() -// .printTopologyStatistics() -// ; MARK_TEST_FUN + TestChainLoad<8> testLoad{64}; + testLoad.seedingRule(testLoad.rule().probability(0.6).minVal(2)) + .pruningRule(testLoad.rule().probability(0.44)) + .setSeed(55) + .buildTopology() + .printTopologyDOT() + 
.printTopologyStatistics() + ; +// ////////////////////////////////////////////////////////WIP : Run test directly for investigation of SEGFAULT.... +// BlockFlowAlloc bFlow; +// EngineObserver watch; +// Scheduler scheduler{bFlow, watch}; +// auto LOAD_BASE = 500us; +// auto stressFac = 1.0; +// auto concurrency = 8; +// +// ComputationalLoad cpuLoad; +// cpuLoad.timeBase = LOAD_BASE; +// cpuLoad.calibrate(); +// +// double loadMicros = cpuLoad.invoke(); +// double refTime = testLoad.calcRuntimeReference(LOAD_BASE); +//SHOW_EXPR(loadMicros) +// +// auto testSetup = +// testLoad.setupSchedule(scheduler) +// .withLoadTimeBase(LOAD_BASE) +// .withJobDeadline(50ms) +// .withUpfrontPlanning() +// .withAdaptedSchedule (stressFac, concurrency); +// double runTime = testSetup.launch_and_wait(); +// double expected = testSetup.getExpectedEndTime(); +//SHOW_EXPR(runTime) +//SHOW_EXPR(expected) +//SHOW_EXPR(refTime) struct Setup : StressRig { diff --git a/tests/vault/gear/test-chain-load-test.cpp b/tests/vault/gear/test-chain-load-test.cpp index dd135d4e8..c95a8a5ed 100644 --- a/tests/vault/gear/test-chain-load-test.cpp +++ b/tests/vault/gear/test-chain-load-test.cpp @@ -1315,7 +1315,7 @@ namespace test { // replicate this relation into the clone array clone[predIdx].addSucc(clone[succIdx]); }; - auto continuation = [&](size_t nodeDone, size_t levelDone, bool work_left) + auto continuation = [&](size_t, size_t nodeDone, size_t levelDone, bool work_left) { lastNode =nodeDone; lastLevel = levelDone; diff --git a/tests/vault/gear/test-chain-load.hpp b/tests/vault/gear/test-chain-load.hpp index e435ddeac..505ef05bb 100644 --- a/tests/vault/gear/test-chain-load.hpp +++ b/tests/vault/gear/test-chain-load.hpp @@ -193,6 +193,7 @@ namespace test { const Duration SCHEDULE_LEVEL_STEP{_uTicks(1ms)}; ///< time budget to plan for the calculation of each »time level« of jobs const Duration SCHEDULE_NODE_STEP{Duration::NIL}; ///< additional time step to include in the plan for each job 
(node). const Duration SCHEDULE_PLAN_STEP{_uTicks(100us)}; ///< time budget to reserve for each node to be planned and scheduled + const Offset SCHEDULE_WAKE_UP{_uTicks(10us)}; ///< tiny offset to place the final wake-up job behind any systematic schedule const bool SCHED_DEPENDS = false; ///< explicitly schedule a dependent job (or rely on NOTIFY) const bool SCHED_NOTIFY = true; ///< explicitly set notify dispatch time to the dependency's start time. @@ -1592,9 +1593,9 @@ namespace test { { using Node = typename TestChainLoad::Node; - function scheduleCalcJob_; - function markDependency_; - function continuation_; + function scheduleCalcJob_; + function markDependency_; + function continuation_; size_t maxCnt_; Node* nodes_; @@ -1621,6 +1622,7 @@ namespace test { void invokeJobOperation (JobParameter param) override { + size_t start{currIdx_}; size_t reachedLevel{0}; size_t targetNodeIDX = decodeNodeID (param.invoKey); for ( ; currIdx_ 0); - continuation_(currIdx_-1, reachedLevel, currIdx_ < maxCnt_); + continuation_(start, currIdx_-1, reachedLevel, currIdx_ < maxCnt_); } @@ -1719,7 +1721,7 @@ namespace test { /** continue planning: schedule follow-up planning job */ void - continuation (size_t lastNodeIDX, size_t levelDone, bool work_left) + continuation (size_t chunkStart, size_t lastNodeIDX, size_t levelDone, bool work_left) { if (work_left) { @@ -1729,13 +1731,15 @@ namespace test { ,manID_); } else - scheduler_.defineSchedule(wakeUpJob()) - .manifestation (manID_) - .startTime(jobStartTime (levelDone+1)) - .lifeWindow(SAFETY_TIMEOUT) - .post() - .linkToPredecessor (schedule_[lastNodeIDX], not schedNotify_) - ; // Setup wait-dependency on last computation + { + auto wakeUp = scheduler_.defineSchedule(wakeUpJob()) + .manifestation (manID_) + .startTime(jobStartTime (levelDone+1, lastNodeIDX+1) + SCHEDULE_WAKE_UP) + .lifeWindow(SAFETY_TIMEOUT) + .post(); + for (size_t exitIDX : lastExitNodes (chunkStart)) + wakeUp.linkToPredecessor (schedule_[exitIDX]); + } // 
Setup wait-dependency on last computations } @@ -1751,7 +1755,8 @@ namespace test { planFunctor_.reset (new RandomChainPlanFunctor{chainLoad_.nodes_[0], chainLoad_.numNodes_ ,[this](size_t i, size_t l){ disposeStep(i,l); } ,[this](auto* p, auto* s) { setDependency(p,s);} - ,[this](size_t n,size_t l, bool w){ continuation(n,l,w); } + ,[this](size_t s,size_t n,size_t l, bool w) + { continuation(s,n,l,w); } }); startTime_ = anchorSchedule(); scheduler_.seedCalcStream (planningJob(firstChunkEndNode) @@ -2026,14 +2031,15 @@ namespace test { } void - fillAdaptedSchedule (double stressFac, uint concurrency) + fillAdaptedSchedule (double stressFact, uint concurrency) { + REQUIRE (stressFact > 0); size_t numPoints = chainLoad_.topLevel()+2; startTimes_.clear(); startTimes_.reserve (numPoints); startTimes_.push_back (Time::ZERO); chainLoad_.levelScheduleSequence (concurrency) - .transform([&](double scheduleFact){ return (scheduleFact/stressFac) * Offset{1,levelSpeed_};}) + .transform([&](double scheduleFact){ return (scheduleFact/stressFact) * Offset{1,levelSpeed_};}) .effuse(startTimes_); } @@ -2045,6 +2051,14 @@ namespace test { + nodeExpense_ * nodeIDX; } + auto + lastExitNodes (size_t lastChunkStartIDX) + { + return chainLoad_.allExitNodes() + .transform([&](Node& n){ return chainLoad_.nodeID(n); }) + .filter([=](size_t idx){ return idx >= lastChunkStartIDX; }); + } // index of all Exit-Nodes within last planning-chunk... + Time calcPlanScheduleTime (size_t lastNodeIDX) {/* must be at least 1 level ahead, diff --git a/wiki/thinkPad.ichthyo.mm b/wiki/thinkPad.ichthyo.mm index 13238191a..ddc064f93 100644 --- a/wiki/thinkPad.ichthyo.mm +++ b/wiki/thinkPad.ichthyo.mm @@ -110773,14 +110773,12 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - - - +

die ganzen Test zur Integration und zum Aufbau der Testanordnung haben die »chained load bursts« verwendet, ein hochgradig unregelmäßiges und abschnittsweise stark verknüpftes Pattern. Damit konnte ich in etwa die erwartete Parallelisierung beobachten, aber die Computational Load ist typischerweise doppelt so lang gelaufen wie kalibriert, während gleichzeitig permanent Koordinations-Aufwand zu leisten war. Deshalb wähle ich nun einen anderen Blickwinkel: Wie gut können wir die theoretisch vorhandene »Rechenkapazität« zum Einsatz bringen? Dafür braucht es ein möglichst einfaches Pattern, das aber hinreichend breit sein muß, um alle Kerne auszulasten. Ziel ist es, einen gleichmäßigen »Flow« von länger laufenden Rechen-Jobs zu generieren @@ -110795,9 +110793,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - +

          TestChainLoad<8> testLoad{64}; @@ -110828,9 +110824,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - +

          TestChainLoad<8> testLoad{64}; @@ -110861,9 +110855,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - +

          TestChainLoad<8> testLoad{64}; @@ -110905,8 +110897,193 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
+ + + + + + + + + + + + + + + + +

+ andere Stack-Frames oder Threads inspizieren ⟶ Debugger terminiert

+ +
+ + + + + + + +

+ ⟹ Verdacht: Daten-Korruption durch ungesicherten Race +

+ +
+
+
+
+ + + + + + + + + + + + + + + + + + +

+ theoretisch sollte es auch so sein: sie befindet sich in einem anonymen Namespace... +

+ +
+
+ + + +
+ + + + + + + + + + + + + + + +

+ Problem erkannt: re-Entrance +

+ +
+ + + + + + + + + + + + + + +
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + +

+ über alles andere können wir keine Annahmen machen — frühere Nodes könnten schon „durch“ sein und dann würde der Wake-up für immer warten

+ +
+ +
+
+ + + + + + +

+ habe ich nicht gemacht, weil ich dachte, es sei eine gute Idee, wenn die Continuation »frühest-möglich« zurückkommt. An das Dependency-Problem habe ich nicht gedacht.

+ +
+
+ + + + + + + +

+ sofern SCHED_NOTIFY wären wir theoretisch sicher +

+ +
+ +
+ + + + + + + + +

+ zwar kann (nach diesem ersten Fix) die Continuation erst starten, nachdem das von Anfang an geplante Schedule abgearbeitet war; jedoch laufen zu dem Zeitpunkt u.U. noch eine Anzahl Nodes (Parallelitäts-Grad FAN_OUT). Und wenn diese dann fertig sind, könnten die weiteren NOTIFY-Dependencies „abwerfen“ ⟹ SEGFAULT

+ +
+
+
+
+
+ + + + + + + +