From 28b39002846aba9ef3dab18dae10f67fa8b063dd Mon Sep 17 00:00:00 2001 From: Ichthyostega Date: Sat, 22 Jul 2023 01:54:25 +0200 Subject: [PATCH] Block-Flow: final adjustments from performance test (closes: #1311) Further extensive testing with parameter variations, using the test setup in `BlockFlow_test::storageFlow()` - Tweaks to improve convergence under extreme overload; sudden load peaks are now accomodated typically < 5 sec - Make the test definition parametric, to simplify variations - Extract the generic microbenchmark helper function - Documentation --- src/lib/test/microbenchmark.hpp | 86 ++++++- src/vault/gear/block-flow.hpp | 24 +- tests/vault/gear/block-flow-test.cpp | 148 +++++------ wiki/renderengine.html | 17 +- wiki/thinkPad.ichthyo.mm | 356 ++++++++++++++++++++++----- 5 files changed, 469 insertions(+), 162 deletions(-) diff --git a/src/lib/test/microbenchmark.hpp b/src/lib/test/microbenchmark.hpp index c836c64dc..a56bba0ac 100644 --- a/src/lib/test/microbenchmark.hpp +++ b/src/lib/test/microbenchmark.hpp @@ -22,18 +22,19 @@ /** @file microbenchmark.hpp - ** A function to perform multithreaded timing measurement on a given functor. + ** Functions to perform (multithreaded) timing measurement on a given functor. ** This helper simplifies micro benchmarks of isolated implementation details. - ** The test subject, given as function object or lambda, is copied into N threads - ** and invoked numerous times within a tight loop. After waiting on termination of - ** the test threads, results are summed up and then averaged into milliseconds - ** per single invocation. The actual timing measurement relies on `chrono::duration`, - ** which means to count micro ticks of the OS. + ** The test subject, given as function object or lambda, is invoked numerous times + ** within a tight loop. 
In the [multithreaded variant](\ref threadBenchmark()), + ** the lambda is copied into N threads and performed in each thread in parallel; + ** after waiting on termination of the test threads, results are summed up and then + ** averaged into milliseconds per single invocation. The actual timing measurement + ** relies on `chrono::duration`, which means to count micro ticks of the OS. ** @warning care has to bee taken when optimisation is involved! ** Optimisation usually has quite some impact on the results, but since ** this function is inline, the lambda can typically be inlined and the - ** loop possibly be optimised away entirely. A simple workaround is to - ** define a _volatile_ variable in the call context, close the lambda + ** loop possibly be optimised away altogether. A simple workaround is + ** to define a _volatile_ variable in the call context, close the lambda ** by reference, and perform a comparison with that volatile variable ** in each invocation. The compiler is required actually to access the ** value of the volatile each time. @@ -45,7 +46,7 @@ ** - multithreaded (unlocked) incrementing of the _global_ volatile ** creates massive overhead and increases the running time by factor 100. ** This nicely confirms that the x86_64 platform has strong cache coherence. - ** + ** */ @@ -53,6 +54,7 @@ #define LIB_TEST_MICROBENCHMARK_H +#include "lib/meta/function.hpp" #include "vault/thread-wrapper.hpp" #include @@ -64,11 +66,73 @@ namespace lib { namespace test{ namespace { - constexpr size_t DEFAULT_RUNS = 10000000; + constexpr size_t DEFAULT_RUNS = 10'000'000; constexpr double SCALE = 1e6; // Results are in µ sec } + /** + * Helper to invoke a functor or λ to observe its running time. 
+ * @param invokeTestLoop the test (complete including loop) invoked once + * @param repeatCnt number of repetitions to divide the timing measurement + * @return averaged time for one repetition, in nanoseconds + */ + template + inline double + benchmarkTime (FUN const& invokeTestLoop, const size_t repeatCnt = DEFAULT_RUNS) + { + using std::chrono::system_clock; + using Dur = std::chrono::duration; + const double SCALE = 1e9; // Results are in ns + + auto start = system_clock::now(); + invokeTestLoop(); + Dur duration = system_clock::now () - start; + return duration.count()/(repeatCnt) * SCALE; + }; + + + /** + * Benchmark building block to invoke a functor or λ in a tight loop, + * passing the current loop index and capturing a result checksum value. + * @return sum of all individual invocation results as checksum + */ + template + inline size_t + benchmarkLoop (FUN const& testSubject, const size_t repeatCnt = DEFAULT_RUNS) + { + // the test subject gets the current loop-index and returns a checksum value + ASSERT_VALID_SIGNATURE (decltype(testSubject), size_t&(size_t)); + + size_t checksum{0}; + for (size_t i=0; i + inline auto + microBenchmark (FUN const& testSubject, const size_t repeatCnt = DEFAULT_RUNS) + { + size_t checksum{0}; + auto invokeTestLoop = [&]{ checksum = benchmarkLoop (testSubject, repeatCnt); }; + double nanos = benchmarkTime (invokeTestLoop, repeatCnt); + return std::make_tuple (nanos, checksum); + } + + + + /** perform a multithreaded microbenchmark. * This function fires up a number of threads * and invokes the given test subject repeatedly. 
@@ -85,7 +149,7 @@ namespace test{ */ template inline double - microbenchmark(FUN const& subject, const size_t nRepeat = DEFAULT_RUNS) + threadBenchmark(FUN const& subject, const size_t nRepeat = DEFAULT_RUNS) { using vault::ThreadJoinable; using std::chrono::system_clock; diff --git a/src/vault/gear/block-flow.hpp b/src/vault/gear/block-flow.hpp index 6146c5315..ac7c709f0 100644 --- a/src/vault/gear/block-flow.hpp +++ b/src/vault/gear/block-flow.hpp @@ -76,9 +76,7 @@ ** @see BlockFlow_test ** @see SchedulerUsage_test ** @see extent-family.hpp underlying allocation scheme - ** - ** @todo WIP-WIP-WIP 6/2023 »Playback Vertical Slice« - ** + ** */ @@ -124,12 +122,12 @@ namespace gear { /* === algorithm tuning settings === */ const double TARGET_FILL = 0.90; ///< aim at using this fraction of Epoch space on average (slightly below 100%) const double BOOST_FACTOR = 0.85; ///< adjust capacity by this factor on Epoch overflow/underflow events - const double DAMP_THRESHOLD = 0.06; ///< do not account for (almost) empty Epochs to avoid overshooting regulation + const double DAMP_THRESHOLD = 0.08; ///< do not account for (almost) empty Epochs to avoid overshooting regulation /* === contextual assumptions === */ const size_t ACTIVITIES_PER_FRAME = 10; ///< how many Activity records are typically used to implement a single frame const size_t REFERENCE_FPS = 25; ///< frame rate to use as reference point to relate DUTY_CYCLE and default counts - const size_t OVERLOAD_LIMIT = 200; ///< load factor over normal use where to assume saturation and limit throughput + const size_t OVERLOAD_LIMIT = 60; ///< load factor over normal use where to assume saturation and limit throughput }; /** @@ -371,8 +369,6 @@ namespace gear { public: BlockFlow() -// : alloc_{INITIAL_ALLOC}//Strategy::initialEpochCnt()} -// , epochStep_{INITIAL_EPOCH_STEP}//Strategy::initialEpochStep()} : alloc_{Strategy::initialEpochCnt()} , epochStep_{Strategy::initialEpochStep()} { } @@ -435,17 +431,10 @@ namespace 
gear { void* claimSlot() ///< EX_SANE { - bool first{true}; while (not (epoch_ and epoch_->gate().hasFreeSlot())) - // Epoch overflow - // use following Epoch; possibly allocate - { - if (first) - {// each shifted allocation accounted once as overflow - flow_->markEpochOverflow(); - first = false; - } + // Epoch overflow... + {// shift to following Epoch; possibly allocate if (not epoch_) { auto lastDeadline = flow_->lastEpoch().deadline(); @@ -455,6 +444,7 @@ namespace gear { } else { + flow_->markEpochOverflow(); ++epoch_; } } @@ -565,7 +555,7 @@ namespace gear { /** * On clean-up of past Epochs, the actual fill factor is checked to guess an - * Epoch duration for optimal usage of epoch storage. Assuming that requested + * Epoch duration to make optimal use of epoch storage. Assuming that requested * Activity deadlines are evenly spaced, for a simple heuristic we can just divide * actual Epoch duration by the fill factor (longer Epoch => less capacity). * To avoid control oscillations however, it seems prudent to use damping by diff --git a/tests/vault/gear/block-flow-test.cpp b/tests/vault/gear/block-flow-test.cpp index 3dd31afb7..e0ebb6de4 100644 --- a/tests/vault/gear/block-flow-test.cpp +++ b/tests/vault/gear/block-flow-test.cpp @@ -28,10 +28,10 @@ #include "lib/test/run.hpp" #include "lib/test/test-helper.hpp" #include "vault/gear/block-flow.hpp" +#include "lib/test/microbenchmark.hpp" #include "lib/time/timevalue.hpp" -//#include "lib/format-cout.hpp" -#include "lib/test/diagnostic-output.hpp" ////////////////////////////////TODO #include "lib/meta/function.hpp" +#include "lib/format-cout.hpp" #include "lib/util.hpp" #include @@ -66,6 +66,7 @@ namespace test { const size_t AVERAGE_EPOCHS = Strategy{}.averageEpochs(); const double BOOST_OVERFLOW = Strategy{}.boostFactorOverflow(); const double TARGET_FILL = Strategy{}.config().TARGET_FILL; + const double ACTIVITIES_P_FR = Strategy{}.config().ACTIVITIES_PER_FRAME; } @@ -221,7 +222,6 @@ namespace test { * - 
exhaust last Epoch, causing setup of new Epoch, with reduced spacing * - use this reduced spacing also for subsequently created Epochs * - clean up obsoleted Epochs, based on given deadline - * @todo WIP 7/23 ⟶ ✔define ⟶ ✔implement */ void placeActivity() @@ -288,32 +288,31 @@ namespace test { CHECK (not allocHandle.hasFreeSlot()); auto& a6 = bFlow.until(Time{850,10}).create(); // Note: encountered four overflow-Events, leading to decreased Epoch spacing for new Epochs - CHECK (watch(bFlow).find(a6) == "11s193ms"_expect); - CHECK (watch(bFlow).allEpochs() == "10s200ms|10s400ms|10s600ms|10s800ms|11s|11s193ms"_expect); + CHECK (watch(bFlow).find(a6) == "11s192ms"_expect); + CHECK (watch(bFlow).allEpochs() == "10s200ms|10s400ms|10s600ms|10s800ms|11s|11s192ms"_expect); auto& a7 = bFlow.until(Time{500,11}).create(); // this allocation does not count as overflow, but has to expand the Epoch grid, now using the reduced Epoch spacing - CHECK (watch(bFlow).allEpochs() == "10s200ms|10s400ms|10s600ms|10s800ms|11s|11s193ms|11s387ms|11s580ms"_expect); - CHECK (watch(bFlow).find(a7) == "11s580ms"_expect); + CHECK (watch(bFlow).allEpochs() == "10s200ms|10s400ms|10s600ms|10s800ms|11s|11s192ms|11s384ms|11s576ms"_expect); + CHECK (watch(bFlow).find(a7) == "11s576ms"_expect); // on clean-up, actual fill ratio is used to adjust to optimise Epoch length for better space usage - CHECK (bFlow.getEpochStep() == "≺193ms≻"_expect); + CHECK (bFlow.getEpochStep() == "≺192ms≻"_expect); bFlow.discardBefore (Time{999,10}); - CHECK (bFlow.getEpochStep() == "≺234ms≻"_expect); - CHECK (watch(bFlow).allEpochs() == "11s|11s193ms|11s387ms|11s580ms"_expect); + CHECK (bFlow.getEpochStep() == "≺218ms≻"_expect); + CHECK (watch(bFlow).allEpochs() == "11s|11s192ms|11s384ms|11s576ms"_expect); // placed into the oldest Epoch still alive auto& a8 = bFlow.until(Time{500,10}).create(); - CHECK (watch(bFlow).find(a8) == "11s193ms"_expect); + CHECK (watch(bFlow).find(a8) == "11s192ms"_expect); } /** @test load 
based regulation of Epoch spacing * - on overflow, capacity is boosted by a fixed factor - * - on clean-up, a moving average of (in hindsight) optimal length is - * computed and used as the new Epoch spacing - * @todo WIP 7/23 ⟶ ✔define ⟶ 🔁implement + * - on clean-up, a moving average of (in hindsight) optimal length + * is computed and used as the new Epoch spacing */ void adjustEpochs() @@ -322,11 +321,7 @@ namespace test { CHECK (bFlow.getEpochStep() == INITIAL_EPOCH_STEP); // whenever an Epoch overflow happens, capacity is boosted by reducing the Epoch duration -SHOW_EXPR(bFlow.getEpochStep()) bFlow.markEpochOverflow(); -SHOW_EXPR(bFlow.getEpochStep()) -SHOW_EXPR(INITIAL_EPOCH_STEP) -SHOW_EXPR(INITIAL_EPOCH_STEP*BOOST_OVERFLOW) CHECK (bFlow.getEpochStep() == INITIAL_EPOCH_STEP * BOOST_OVERFLOW); bFlow.markEpochOverflow(); CHECK (bFlow.getEpochStep() == INITIAL_EPOCH_STEP * BOOST_OVERFLOW*BOOST_OVERFLOW); @@ -351,41 +346,48 @@ SHOW_EXPR(INITIAL_EPOCH_STEP*BOOST_OVERFLOW) }; TimeVar step = bFlow.getEpochStep(); -SHOW_EXPR(bFlow.getEpochStep()) bFlow.markEpochUnderflow (dur1, fac1); -SHOW_EXPR(bFlow.getEpochStep()) -SHOW_EXPR(fac1/TARGET_FILL) -SHOW_EXPR(goal1) -SHOW_EXPR(movingAverage(step, goal1)) CHECK (bFlow.getEpochStep() == movingAverage(step, goal1)); step = bFlow.getEpochStep(); bFlow.markEpochUnderflow (dur2, fac2); -SHOW_EXPR(_raw(bFlow.getEpochStep())) -SHOW_EXPR(_raw(movingAverage(step, goal2))) CHECK (bFlow.getEpochStep() == movingAverage(step, goal2)); } + + /** @test investigate progression of epochs under realistic load - * - expose the allocator to a load of 200fps for simulated 60sec - * - assuming 10 Activities per frame, this means a throughput of 120000 Activities + * - expose the allocator to a load of 200fps for simulated 3 Minutes + * - assuming 10 Activities per frame, this means a throughput of 360000 Activities * - run this load exposure under saturation for performance measurement * - use a planning to deadline delay of 500ms, but with 
±200ms random spread * - after 250ms (500 steps), »invoke« by accessing and adding the random checksum * - run a comparison of all-pre-allocated ⟷ heap allocated ⟷ Refcount ⟷ BlockFlow - * @todo WIP 7/23 ⟶ 🔁define ⟶ 🔁implement + * @remarks + * This test setup can be used to investigate different load scenarios. + * In the standard as defined, the BlockFlow allocator is overloaded initially; + * within 5 seconds, the algorithm should have regulated the Epoch stepping down + * to accommodate the load peak. As immediate response, excess allocation requests + * are shifted into later Epochs. To cope with a persisting higher load, the spacing + * is reduced swiftly, by growing the internal pool with additional heap allocated Extents. + * In the following balancing phase, the mechanism aims at bringing back the Epoch duration + * into a narrow corridor, to keep the usage quotient as close as possible to 90% */ void storageFlow() { - const uint INSTANCES = 120000; // Activities to send through the test subject - const uint MAX_TIME = 121000; // Test steps to perform - const gavl_time_t STP = 500; // with 2 steps per ms - Offset BASE_DEADLINE{FSecs{1,2}}; // base pre-roll before deadline - Offset SPREAD_DEAD{FSecs{2,100}}; // random spread of deadline around base - const uint INVOKE_LAG = 500; // „invoke“ the Activity after 500 steps (≙ simulated 250ms) - const uint CLEAN_UP = 200; // perform clean-up every 200 steps + const size_t FPS = 200; + const size_t TICK_P_S = FPS * ACTIVITIES_P_FR; // Simulated throughput 200 frames per second + const gavl_time_t STP = Time::SCALE / TICK_P_S; // Simulation stepping (here 2 steps per ms) + const gavl_time_t RUN = _raw(Time{0,0,3}); // nominal length of the simulation time axis + Offset BASE_DEADLINE{FSecs{1,2}}; // base pre-roll before deadline + Offset SPREAD_DEAD{FSecs{2,100}}; // random spread of deadline around base + const uint INVOKE_LAG = _raw(Time{250,0}) /STP; // „invoke“ the Activity after simulated 250ms (≙ 500 steps) + 
const uint CLEAN_UP = _raw(Time{100,0}) /STP; // perform clean-up every 200 steps + const uint INSTANCES = RUN /STP; // 120000 Activity records to send through the test subject + const uint MAX_TIME = INSTANCES + +INVOKE_LAG+2*CLEAN_UP; // overall count of Test steps to perform using TestData = vector>; using Subjects = vector>; @@ -428,18 +430,12 @@ SHOW_EXPR(_raw(movingAverage(step, goal2))) }; auto benchmark = [INSTANCES](auto invokeTest) - { - using std::chrono::system_clock; - using Dur = std::chrono::duration; - const double SCALE = 1e9; // Results are in ns - - auto start = system_clock::now(); - invokeTest(); - Dur duration = system_clock::now () - start; - return duration.count()/(INSTANCES) * SCALE; + { // does the timing measurement with result in nanoseconds + return lib::test::benchmarkTime(invokeTest, INSTANCES); }; + /* =========== Test-Setup-1: no individual allocations/deallocations ========== */ size_t sum1{0}; vector storage{INSTANCES}; @@ -466,7 +462,7 @@ SHOW_EXPR(_raw(movingAverage(step, goal2))) }; auto invoke = [](Activity& feedActivity) { - size_t check = feedActivity.data_.feed.one; + size_t check = feedActivity.data_.feed.one; delete &feedActivity; return check; }; @@ -488,7 +484,7 @@ SHOW_EXPR(_raw(movingAverage(step, goal2))) }; auto invoke = [&, i=0](Activity& feedActivity) mutable { - size_t check = feedActivity.data_.feed.one; + size_t check = feedActivity.data_.feed.one; manager[i].reset(); return check; }; @@ -527,39 +523,51 @@ SHOW_EXPR(_raw(movingAverage(step, goal2))) // INVOKE Setup-1 auto time_noAlloc = benchmark(noAlloc); -SHOW_EXPR(time_noAlloc) -SHOW_EXPR(sum1); // INVOKE Setup-2 auto time_heapAlloc = benchmark(heapAlloc); -SHOW_EXPR(time_heapAlloc) -SHOW_EXPR(sum2); // INVOKE Setup-3 auto time_sharedAlloc = benchmark(sharedAlloc); -SHOW_EXPR(time_sharedAlloc) -SHOW_EXPR(sum3); -cout<<"\n\n■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□■□"< -
+
//Integration effort to promote the development of rendering, playback and video display in the GUI//
 This IntegrationSlice was started in {{red{2023}}} as [[Ticket #1221|https://issues.lumiera.org/ticket/1221]] to coordinate the completion and integration of various implementation facilities, planned, drafted and built during the last years; this effort marks the return of development focus to the lower layers (after years of focussed UI development) and will implement the asynchronous and time-bound rendering coordinated by the [[Scheduler]] in the [[Vault|Vault-Layer]]
 
@@ -6210,8 +6210,8 @@ __June23__: building upon this prototyping approach, the dispatcher pipeline cou
 
 __July23__: this leads to a shift of work focus towards implementing the [[Scheduler]] itself.
 The Scheduler will be structured into two Layers, where the lower layer is implemented as //priority queue// (using the STL). So the most tricky part to solve is the representation of //dependencies// between jobs, with the possible extension to handling IO operations asynchronously. Analysis and planning of the implementation indicate that the [[scheduler memory managment|SchedulerMemory]] can be based on //Extents//, which are interpreted as »Epochs« with a deadline. These considerations imply what steps to take next for building up Scheduler functionality and memory management required for processing a simple job
-* 🗘 build a first working draft for the {{{BlockFlow}}} allocation scheme [[#1311|https://issues.lumiera.org/ticket/1311]]
-* ⌛ define and cover the basic [[Activities|RenderActivity]] necessary to implement a plain-simple-Job (without dependencies)
+* ✔ build a first working draft for the {{{BlockFlow}}} allocation scheme [[#1311|https://issues.lumiera.org/ticket/1311]]
+* 🗘 define and cover the basic [[Activities|RenderActivity]] necessary to implement a plain-simple-Job (without dependencies)
 * ⌛ pass such an Activity through the two layers of the Scheduler
 * ⌛ adapt the [[job planning pipeline|JobPlanningPipeline]] implemented thus far to produce the appropriate {{{Activity}}} records for the scheduler
 
@@ -7100,7 +7100,7 @@ The Scheduler is now considered an implementation-level facility with an interfa
 &rarr; [[Workers|SchedulerWorker]]
 
-
+
//The Scheduler uses an »Extent« based memory management scheme known as {{{BlockFlow}}}.//
 The organisation of rendering happens in terms of [[Activities|RenderActivity]], which may bound by //dependencies// and limited by //deadlines.// For the operational point of view this implies that a sequence of allocations must be able to „flow through the Scheduler“ -- in fact, only references to these {{{Activity}}}-records are passed, while the actual descriptors reside at fixed memory locations. This is essential to model the dependencies and conditional execution structures efficiently. At some point however, any {{{Activity}}}-record will either be //performed// or //obsoleted// -- and this leads to the idea of managing the allocations in //extents// of memory here termed as »Epochs«
 * a new Activity is planted into a suitable //Epoch,// based on its deadline
@@ -7122,7 +7122,14 @@ The memory management for the scheduler is arranged into three layers...
 * raw memory is allocated in large blocks of {{{Extent}}} size -- {{red{currently as of 7/23}}} claimed from regular heap storage
 * a low-level allocation scheme, the {{{ExtentFamily}}} uses a //pool of extents cyclically,// with the ability to claim more extents on-demand
 * the high-level {{{BlockFlow}}} allocation manager is aware of scheduler semantics and dresses up those extents as {{{Epoch}}}
-For each new RenderActivity, the API usage with the help of the {{{ActivityLang}}} is required to designate a ''deadline'' -- which can be used to associate the corresponding {{{Activity}}}-records with a suitable {{{Epoch}}}. The //temporal spacing// of epochs, as well as the number of active epochs (=extents) must be managed dynamically. {{red{As of 7/23, a scheme to avoid control oscillations}}} need to be devised, see [[#1316|https://issues.lumiera.org/ticket/1316]]. When the reserved allocation for an epoch turns out as insufficient (i.e. the underlying extent has been filled up prior to maturity), further {{{Activity}}} records will be //„borrowed“// from the next epoch, while reducing the epoch spacing for compensation. Each {{{Epoch}}} automatically maintains a specifically rigged »''~EpochGuard''«-{{{Activity}}}, always located in the first »slot« of the epoch storage. This guard models the deadline and additionally allows to block deallocation with a count-down latch, which can be tied to pending IO operations.
+For each new RenderActivity, the API usage with the help of the {{{ActivityLang}}} is required to designate a ''deadline'' -- which can be used to associate the corresponding {{{Activity}}}-records with a suitable {{{Epoch}}}. The //temporal spacing// of epochs, as well as the number of active epochs (=extents) must be managed dynamically, to accommodate varying levels of load. This bears the danger of control oscillations, and more fine tuning and observations under real-world conditions are indicated {{red{as of 7/23}}}, see [[#1316|https://issues.lumiera.org/ticket/1316]]. When the reserved allocation for an epoch turns out as insufficient (i.e. the underlying extent has been filled up prior to maturity), further {{{Activity}}} records will be //„borrowed“// from the next epoch, while reducing the epoch spacing for compensation. Each {{{Epoch}}} automatically maintains a specifically rigged »''~EpochGuard''«-{{{Activity}}}, always located in the first »slot« of the epoch storage. This guard models the deadline and additionally allows to block deallocation with a count-down latch, which can be tied to pending IO operations.
+
+The auto-regulation of this allocation scheme is based on //controlling the Epoch duration// dynamically. As immediate response to sudden load peaks, the Epoch stepping is reduced eagerly while excess allocations are shifted into Epochs with later deadline; the underlying {{{Extent}}} allocation pool is increased to satisfy additional demand. Overshooting regulation needs to be limited however, which is achieved by watching the fill level of each Epoch at the later time point when it //is discarded.// This generates a signal to counteract the eager increase of capacity, and can be used to regulate the load factor to be close to 90%. Yet this quite precise accounting is only possible with some delay (the limited life time of the Epochs is a fundamental trait of this allocation scheme); this second steering signal is thus passed through an exponential moving average and applied with considerable damping, albeit with higher binding force than the eager capacity increase.
+
+!!!performance considerations
+It should be noted that {{{BlockFlow}}} provides very specific services, which are more elaborate than just a custom memory allocator. By leveraging the available scale factors, it is possible however to bring the amortised effort for a single {{{Activity}}} allocation down into the same order of magnitude as a standard heap allocation, which equates to roughly 30ns on contemporary machines. For context, an individually managed allocation-deallocation pair with a ref-counting {{{std::shared_ptr}}} has a typical performance cost of ~100ns.
+
+The primary scaling effects exploited to achieve this level of performance are the combined de-allocation of a complete Epoch, and the combination of several allocations tied to a common deadline. However -- since the Epoch duration is chosen dynamically, performance can potentially be //degraded drastically// once the scheme is put under pressure -- because each new allocation has to search through the list of active Epochs. Parameters are tuned so as to ensure this list remains very short (about 5 Epochs) under typical operational conditions.
 
diff --git a/wiki/thinkPad.ichthyo.mm b/wiki/thinkPad.ichthyo.mm index ec7cc7d52..01e09f31a 100644 --- a/wiki/thinkPad.ichthyo.mm +++ b/wiki/thinkPad.ichthyo.mm @@ -78824,9 +78824,9 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - + + + @@ -78917,19 +78917,24 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - - + + + + - + - + + + + + + @@ -78948,13 +78953,12 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - - - - - + + + + @@ -78962,7 +78966,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -78970,15 +78974,21 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - - - + + + + + + + + + + + + + + - - - @@ -79186,7 +79196,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -79196,7 +79206,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + + @@ -79230,7 +79241,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + + @@ -79271,7 +79283,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + + @@ -79401,8 +79414,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + @@ -79775,8 +79788,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + @@ -79878,12 +79891,12 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + - + @@ -80106,7 +80119,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -80204,8 +80217,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + @@ -80390,7 +80403,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -80471,10 +80484,12 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - - - + + + + + + @@ -80963,7 +80978,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -81015,11 +81030,143 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
+ + + + + + + + + + +

+ vor allem die 2-Schichtige Regelung, die Übersteuerungen wieder einfängt... +

+ +
+
+ + + + + + + + + + + + + + +

+ bereits nach 5 sec im lock-step +

+ +
+
+ + + + + + +

+ unter extremer Überlast gibt es einen ungünstigen Mitkopplungs-Effekt +

+ +
+ + + + + + + + + + +

+ ... dann werden extrem viele neue Blöcke hinzugefügt +

+ +
+ + + + + +

+ In diesem extremen Overload gilt: jeder neue Block wandert die Kette entlang und wird „hinten abgeworfen“. Dort hinten ist dann zwar das Spacing bereits klein genug, um die Last aufzufangen — aber vorne besteht noch ein viel größeres Spacing. Konsequenz: während die Einfügeposition vorne immer noch die gleichen alten zu langen Epochen überfüllt, werden hinten permanent neue Mini-Epochen angehängt. Erst wenn auch die Einfüge-Position im Bereich der kleinen Blöcke angekommen ist, baut sich der Rückstau (ziemlich schnell) ab. +

+ +
+
+ + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + +

+ nach weiteren 6 Sekunden ist die Regelung locked to target +

+ +
+
+ + + + + + + + + + +

+ blockFlow: 32ns +

+ +
+
+
+
+ + +
+
+ + @@ -81043,7 +81190,7 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + @@ -81073,19 +81220,20 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + +
- + - - - + + + @@ -81097,13 +81245,57 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + - + + - - + + + + + + +

+ damit kann die Mittelung in beide Richtungen um den Optimalwert arbeiten +

+ +
+ +
+
+ + + + + + +
    +
  • + die Überläufe erzeugen einen sofort wirksamen Druck Richtung mehr Kapazität +
  • +
  • + aber eine exponentielle Mittelung wirkt verzögert, jedoch mit größerer Kraft +
  • +
  • + die Mittelung zielt auf 90% Füllung, hat also eine kleine Regelzone, in der sie beidseitig wirkt +
  • +
  • + im Extremfall jedoch wirken Overflow und Mittelwert gegensinning, wobei der Overflow direkter und schneller wirksam wird, um dem System Luft zu verschaffen +
  • +
  • + die längerfristige Regelung jedoch bremst den Overflow mit einiger Verzögerung auch wieder aus; zwar führt das dazu, daß die Überlastung länger besteht, dafür aber auch weicher ausgeregelt wird, wodurch das System anschließend ohne weitere Schwingungen direkt in den lock-step übergeht +
  • +
+ +
+ +
+ + + +
@@ -81121,8 +81313,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + @@ -81252,8 +81444,24 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- + + + + + + + +

+ Zunächst dachte ich, das ist nur ein Provisorium... +

+

+ Aber derzeit ist eine vollständige und elaborierte Regelung im BlockFlow selber implementiert, die bereits robust aussieht und auch kurzzeitige Lastspitzen gut abfedert; es erscheint zweifelhaft, ob hier noch eine externe Regelung benötigt wird. Möglicherweise könnte das TimingObservable einen 2. Layer bilden, der die Grundparameter des Algorithmus längerfristig optimiert

+ +
+ +
@@ -88851,6 +89059,36 @@ class Something
+ + + + + + + + + +

+ Bezug: Changeset bf35ae0 +

+ +
+
+ + + + +

+ als Patch (Tag) dump.blockFlow +

+ +
+
+ + + +
+