From 097001d16f3312a401d42b0147b4d670a7ef0fcf Mon Sep 17 00:00:00 2001 From: Ichthyostega Date: Fri, 27 Oct 2023 02:53:34 +0200 Subject: [PATCH] Scheduler: investigate timings of dispatch() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ...there seemed to be an anomaly of 50...100µs ==> conclusion: this is due to the instrumentation code - it largely caused by the EventLog, which was never meant to be used in performance-critical code, and does hefty heap allocations and string processing. - moreover, there clearly is a cache-effect, adding a Factor 2 whenever some time passed since the last EventLog call ==> can be considered just an artifact of the test setup and will have no impact on the scheduler remark: this commit adds a lot of instrumentation code --- src/lib/test/event-log.cpp | 1 + src/vault/gear/activity.hpp | 17 +- src/vault/gear/scheduler-commutator.hpp | 25 +- src/vault/gear/scheduler.hpp | 18 +- tests/vault/gear/activity-detector.hpp | 1 + tests/vault/gear/scheduler-service-test.cpp | 56 ++--- wiki/thinkPad.ichthyo.mm | 252 ++++++++++++++++++++ 7 files changed, 321 insertions(+), 49 deletions(-) diff --git a/src/lib/test/event-log.cpp b/src/lib/test/event-log.cpp index c4f5c15ae..c00a63e4e 100644 --- a/src/lib/test/event-log.cpp +++ b/src/lib/test/event-log.cpp @@ -613,6 +613,7 @@ namespace test{ EventLog::EventLog (string logID) : log_(new Log) { + log_->reserve(2000); log({"type=EventLogHeader", "this="+logID}); } diff --git a/src/vault/gear/activity.hpp b/src/vault/gear/activity.hpp index 07aa6bd06..faab2ad3e 100644 --- a/src/vault/gear/activity.hpp +++ b/src/vault/gear/activity.hpp @@ -70,6 +70,8 @@ #include "lib/time/timevalue.hpp" #include "lib/meta/function.hpp" #include "lib/util.hpp" +#include "lib/format-cout.hpp" /////////////////////////TODO +#include "vault/real-clock.hpp" namespace vault{ @@ -614,7 +616,14 @@ namespace gear { case FEED: return activity::PASS; case HOOK: - return callHook (now, executionCtx); +// return callHook (now, executionCtx); +{ +Time t0 = executionCtx.getSchedTime(); + activity::Proc res = callHook (now, executionCtx); +Time t2 = executionCtx.getSchedTime(); +cout << "+++++++++++HO: "<<_raw(t2) - _raw(t0)<<"µs"< #include @@ -106,7 +108,7 @@ namespace gear { using ThreadID = std::thread::id; atomic groomingToken_{}; - auto thisThread() { return std::this_thread::get_id(); } + auto thisThread() { return ThreadID();}/////////////////TODO std::this_thread::get_id(); } public: @@ -123,6 +125,7 @@ namespace gear { bool acquireGoomingToken() noexcept { +return true; ThreadID expect_noThread; // expect no one else to be in... return groomingToken_.compare_exchange_strong (expect_noThread, thisThread() ,memory_order_acquire // success also constitutes an acquire barrier @@ -139,6 +142,7 @@ namespace gear { void dropGroomingToken() noexcept { // expect that this thread actually holds the Grooming-Token +return; REQUIRE (groomingToken_.load(memory_order_relaxed) == thisThread()); const ThreadID noThreadHoldsIt; groomingToken_.store (noThreadHoldsIt, memory_order_release); @@ -151,6 +155,7 @@ namespace gear { bool holdsGroomingToken (ThreadID id) noexcept { +return true; return id == groomingToken_.load (memory_order_relaxed); } @@ -215,14 +220,26 @@ namespace gear { if (!chain) return activity::WAIT; Time now = executionCtx.getSchedTime(); - if (decideDispatchNow (when, now)) - return ActivityLang::dispatchChain (chain, executionCtx); + bool shallDispatch =decideDispatchNow (when, now); + Time t1 = executionCtx.getSchedTime(); + activity::Proc res; + if (shallDispatch) + { + res = ActivityLang::dispatchChain (chain, executionCtx); + } +// return ActivityLang::dispatchChain (chain, executionCtx); else + { if (holdsGroomingToken (thisThread())) layer1.feedPrioritisation (*chain, when); else layer1.instruct (*chain, when); - return activity::PASS; +// return activity::PASS; + res = activity::PASS; + } + Time t2 = executionCtx.getSchedTime(); +cout << "+++++++++Post: "<<_raw(t2) - _raw(now)<<"µs ("<<_raw(t1)-_raw(now)<<"|+|"<<_raw(t2)-_raw(t1)<<(shallDispatch?":dispatch":":enqueue")<<")"< #include @@ -335,7 +336,11 @@ namespace gear { .performStep([&]{ Time now = ctx.getSchedTime(); Activity* act = layer2_.findWork (layer1_,now); - return ctx.post (now, act, ctx); + Time moo = ctx.getSchedTime(); + auto res = ctx.post (now, act, ctx); + Time boo = ctx.getSchedTime(); +cout << "+++++Dispatch: "<<_raw(boo) - _raw(now)<<"µs ("<<_raw(moo)-_raw(now)<<"|+|"<<_raw(boo)-_raw(moo)<<")"<"<<_raw(targetedDelay)<<"\n"; std::this_thread::sleep_for (std::chrono::microseconds (_raw(targetedDelay))); return activity::SKIP; // indicates to abort this processing-chain for good } diff --git a/tests/vault/gear/activity-detector.hpp b/tests/vault/gear/activity-detector.hpp index 259e40f2f..5e2cdc412 100644 --- a/tests/vault/gear/activity-detector.hpp +++ b/tests/vault/gear/activity-detector.hpp @@ -360,6 +360,7 @@ namespace test { if (data_.callback.arg == 0) {// no adapted target; just record this activation log_(util::toString(now) + " ⧐ "); +// log_(" ⧐ "); return activity::PASS; } else diff --git a/tests/vault/gear/scheduler-service-test.cpp b/tests/vault/gear/scheduler-service-test.cpp index f62d8900b..03ac47171 100644 --- a/tests/vault/gear/scheduler-service-test.cpp +++ b/tests/vault/gear/scheduler-service-test.cpp @@ -143,12 +143,21 @@ namespace test { and wasClose (invoked, start); }; + start = Time::ZERO; +// auto& ctx = detector.executionCtx; + auto& ctx = Scheduler::ExecutionCtx::from(scheduler); + probe.activate(start, ctx); + auto [muck,_] = lib::test::microBenchmark([&](size_t){ probe.activate(start, ctx); + return size_t(1); + } + , 200); +SHOW_EXPR(muck) + cout << "Scheduled right away..."< 500); CHECK (delay_us < 1000); pullWork(); -SHOW_EXPR(_raw(now)) -SHOW_EXPR(_raw(start)) -SHOW_EXPR(_raw(detector.invokeTime(probe))) -SHOW_EXPR(res); -SHOW_EXPR(delay_us) -SHOW_EXPR(slip_us) -SHOW_EXPR(wasInvoked(start)) CHECK (wasInvoked(start)); - CHECK (delay_us < 200); + CHECK (delay_us < 200); ///////////////OOO dangerously tight CHECK (slip_us < 500); CHECK (activity::WAIT == res); CHECK (scheduler.empty()); @@ -248,14 +230,6 @@ SHOW_EXPR(scheduler.empty()) start += t1ms; pullWork(); -SHOW_EXPR(_raw(now)) -SHOW_EXPR(_raw(start)) -SHOW_EXPR(_raw(detector.invokeTime(probe))) -SHOW_EXPR(res); -SHOW_EXPR(delay_us) -SHOW_EXPR(slip_us) -SHOW_EXPR(wasInvoked(start)) -SHOW_EXPR(scheduler.empty()) CHECK (wasInvoked(start)); CHECK (delay_us < 500); CHECK (slip_us < 500); diff --git a/wiki/thinkPad.ichthyo.mm b/wiki/thinkPad.ichthyo.mm index 252045f4f..3970624d5 100644 --- a/wiki/thinkPad.ichthyo.mm +++ b/wiki/thinkPad.ichthyo.mm @@ -82222,6 +82222,15 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
+ + + + + + + + + @@ -88262,6 +88271,249 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
+ + + + + + + + + + + + + + + +

+ ...und das ist so auch plausibel (interessanterweise gibt es hier keinen Unterschied debug/O3 )...  hier wird ein Vector befüllt und jede Menge String-Konvertierungen gemacht, und auch ein virtual call +

+ + +
+ + +
+ + + + + + + +

+ das bedeutet: in diesem Fall liegt der überwiegende Teil des extra-Aufwands vermutlich vor der Activity-Activation (weil die bekanntermaßen aufwendigen Log-Operationen erst nach dem Eintritt in die activation-Funktion der ActivityProbe erfolgen). ~50µs bedeutet ⟹ gut die Hälfte wird außerhalb aber im Dispatch verbrannt +

+ + +
+
+ + + + + + + + + + + + + + + + +

+ ...wir sind noch nicht so weit (nach Plan) +

+
    +
  • + Grooming-Token soll nur per explizitem Übergang geDropped werden; beide Fälle sind noch nicht implementiert (work-Function und λ-work) +
  • +
  • + dieser Test läuft im Haupt-Thread, d.h. der erste Aufruf (in dem das post() auch direkt den Dispatch macht) erlangt das Grooming-Token +
  • +
+ + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +

+ ...zumal ich jetzt alles im Debug-Mode untersuche, und auch noch COUT-Statements und Clock-Aufrufe eingebaut habe +

+ + +
+
+ + + + + + +

+ steckt in Activity::activate +

+ + +
+ +
+ + + + + + +

+ steckt in Activity::callHook +

+ + +
+ + + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + +

+ |capacity=0 +

+

+ +++++++++++HO: 0µs +

+

+ ++++++++++ACT: 4µs +

+

+ +++++++++Post: 10µs (0|+|10:dispatch) +

+

+ +++++Dispatch: 17µs (2|+|15) +

+

+ |capacity=5 +

+

+ res:2 delay=29µs +

+ + +
+ +
+ + + + + + + + + + + + + + + + + +

+ einziger Unterschied: Zeiten im Call-stack addieren sich weniger auf... +

+

+ +

+

+ +++++++++++HO: 45µs +

+

+ ++++++++++ACT: 50µs +

+

+ +++++++++Post: 52µs (0|+|52:dispatch) +

+

+ +++++Dispatch: 54µs (0|+|54) +

+ + +
+
+ + + + + + +

+ ...sondern nur die wenigen Translation-Units, die ich für relevant halte. Das könnte dann insgesamt nochmal einen Unterschied machen +

+ + +
+ +
+
+
+
+