From 1d4f6afd181a9297d7604d84c4f2846dbd11e60e Mon Sep 17 00:00:00 2001 From: Ichthyostega Date: Thu, 11 Apr 2024 23:08:20 +0200 Subject: [PATCH] Scheduler-test: complete and document the Load-peak tests - use parameters known to produce a clean linear model - assert on properties of this linear model Add extended documentation into the !TiddlyWiki, with a textual account of the various findings, also including some of the images and diagrams, rendered as SVG --- .../2024-04-08.Scheduler-LoadTest/Graph-13 | 4 +- .../2024-04-08.Scheduler-LoadTest/Graph-15 | 4 +- .../2024-04-08.Scheduler-LoadTest/Graph13.svg | 967 ++++++++++++++++ .../2024-04-08.Scheduler-LoadTest/Graph15.svg | 1030 +++++++++++++++++ .../2024-04-08.Scheduler-LoadTest/Topo-10 | 182 +++ .../2024-04-08.Scheduler-LoadTest/Topo-10.svg | 902 +++++++++++++++ .../2024-04-08.Scheduler-LoadTest/index.txt | 14 +- tests/vault/gear/scheduler-stress-test.cpp | 164 ++- wiki/dump | 1 + wiki/renderengine.html | 85 +- wiki/thinkPad.ichthyo.mm | 137 ++- 11 files changed, 3385 insertions(+), 105 deletions(-) create mode 100644 doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph13.svg create mode 100644 doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph15.svg create mode 100644 doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10 create mode 100644 doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10.svg create mode 120000 wiki/dump diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-13 b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-13 index c2581c344..f248460a3 100644 --- a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-13 +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-13 @@ -122,7 +122,7 @@ set arrow 11 from graph 0,0 to graph 0,1.08 size screen 0.025,15,60 filled ls 10 set xlabel 'load size ⟶ number of jobs' -set ylabel 'active time ⟶ ms' +set ylabel 'active time ⟶ ms' offset 2 set key autotitle columnheader tmargin # @@ -169,7 +169,7 @@ if (plots > 1) { # more than one additional data sequence # set y2tics - set y2label 'avg job time ⟶ µs' offset -2.2 + set y2label 'avg job time ⟶ µs' offset -2.6 ####--------------------------------------------- plot $RunData using 1:3 with impulses linestyle 3, \ diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-15 b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-15 index 316be9da9..5d30774b6 100644 --- a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-15 +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph-15 @@ -122,7 +122,7 @@ set arrow 11 from graph 0,0 to graph 0,1.08 size screen 0.025,15,60 filled ls 10 set xlabel 'load size ⟶ number of jobs' -set ylabel 'active time ⟶ ms' +set ylabel 'active time ⟶ ms' offset 2 set key autotitle columnheader tmargin # @@ -170,7 +170,7 @@ if (plots > 1) { # more than one additional data sequence # set y2tics - set y2label 'avg job time ⟶ µs' offset -1.5 + set y2label 'avg job time ⟶ µs' offset -2.6 ####--------------------------------------------- plot $RunData using 1:3 with impulses linestyle 3, \ diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph13.svg b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph13.svg new file mode 100644 index 000000000..daca11a39 --- /dev/null +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph13.svg @@ -0,0 +1,967 @@ + + + +Gnuplot +Produced by GNUPLOT 5.2 patchlevel 6 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 40 + + + + + + + + + 60 + + + + + + + + + 80 + + + + + + + + + 100 + + + + + + + + + 120 + + + + + + + + + 140 + + + + 
+ + + + + 160 + + + + + + + + + 180 + + + + + + + + + 200 + + + + + + + + + 220 + + + + + + + + + 240 + + + + + + + + + 30 + + + + + + + + + 40 + + + + + + + + + 50 + + + + + + + + + 60 + + + + + + + + + 70 + + + + + + + + + 80 + + + + + + + + + 90 + + + + + + + + + 100 + + + + + + + + + 110 + + + + + + + + + 120 + + + + + + + + + 130 + + + + + + + + + 140 + + + + + + + + + 150 + + + + + + + + + 160 + + + + + + + + + 170 + + + + + + + + + 180 + + + + + + + + + 190 + + + + + + + + + 200 + + + + + + + + + + + + + + + + + active time ⟶ ms + + + + + load size ⟶ number of jobs + + + + + result time + + + + + result time + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Model: 1.06·p + 12.85 + + + Model: 1.06·p + 12.85 + + + + + + + + + + + + + + + + + + 0 + + + + + + + + + 1 + + + + + + + + + 2 + + + + + + + + + 3 + + + + + + + + + 4 + + + + + + + + + 5 + + + + + + + + + 6 + + + + + + + + + 7 + + + + + + + + + 8 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + + + 2000 + + + + + 4000 + + + + + 6000 + + + + + 8000 + + + + + 10000 + + + + + 12000 + + + + + + + + + concurrent threads ⟶ + + + + + avg job time ⟶ µs + + + + + concurrency + + + + + concurrency + + + + + + avg jobtime + + + avg jobtime + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + avg impeded + + + avg impeded + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + gnuplot_plot_4b + + + + + + + + + + + + + + diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph15.svg b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph15.svg new file mode 100644 index 000000000..9089322ee --- /dev/null +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Graph15.svg @@ -0,0 +1,1030 @@ + + + +Gnuplot +Produced by GNUPLOT 5.2 patchlevel 6 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + + + + + + + 5 + + + + + + + + + 10 + + + + + + + + + 15 + + + + + + + + + 20 + + + + + + + + + 25 + + + + + + + + + 30 + + + + + + + + + 35 + + + + + + + + + 40 + + + + + + + + + 45 + + + + + + + + + 0 + + + + + + + + + 10 + + + + + + + + + 20 + + + + + + + + + 30 + + + + + + + + + 40 + + + + + + + + + 50 + + + + + + + + + 60 + + + + + + + + + 70 + + + + + + + + + 80 + + + + + + + + + 90 + + + + + + + + + 100 + + + + + + + + + 110 + + + + + + + + + 120 + + + + + + + + + 130 + + + + + + + + + 140 + + + + + + + + + 150 + + + + + + + + + 160 + + + + + + + + + 170 + + + + + + + + + 180 + + + + + + + + + 190 + + + + + + + + + 200 + + + + + + + + + + + + + + + + + active time ⟶ ms + + + + + load size ⟶ number of jobs + + + + + result time + + + + + result time + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Model: 0.21·p + -1.92 + + + Model: 0.21·p + -1.92 + + + + + + + + + + + + + + + + + + 0 + + + + + + + + + 0.5 + + + + + + + + + 1 + + + + + + + + + 1.5 + 
+ + + + + + + + 2 + + + + + + + + + 2.5 + + + + + + + + + 3 + + + + + + + + + 3.5 + + + + + + + + + 4 + + + + + + + + + 4.5 + + + + + + + + + 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + + + 100 + + + + + 200 + + + + + 300 + + + + + 400 + + + + + 500 + + + + + 600 + + + + + + + + + concurrent threads ⟶ + + + + + avg job time ⟶ µs + + + + + concurrency + + + + + concurrency + + + + + + avg jobtime + + + avg jobtime + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + avg impeded + + + avg impeded + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + gnuplot_plot_4b + + + + + + + + + + + + + + diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10 b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10 new file mode 100644 index 000000000..4975dd3ac --- /dev/null +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10 @@ -0,0 +1,182 @@ +digraph { + // Nodes + N0[label="0: 37", shape=doublecircle ] + N1[label="1: 4F" ] + N2[label="2: 95.2" ] + N3[label="3: 1C" ] + N4[label="4: 0A" ] + N5[label="5: 03" ] + N6[label="6: 7E.1" ] + N7[label="7: B4.2" ] + N8[label="8: BB.2" ] + N9[label="9: 8F.1" ] + N10[label="10: 8F.1" ] + N11[label="11: 4C" ] + N12[label="12: 4C" ] + N13[label="13: 0F" ] + N14[label="14: 0F" ] + N15[label="15: 5A.1" ] + N16[label="16: 5A.1" ] + N17[label="17: 75.1" ] + N18[label="18: 75.1" ] + N19[label="19: AC.2" ] + N20[label="20: AC.2" ] + N21[label="21: 38" ] + N22[label="22: 79.1" ] + N23[label="23: FC.3" ] + N24[label="24: 29" ] + N25[label="25: 29" ] + N26[label="26: 29" ] + N27[label="27: 29" ] + N28[label="28: 29" ] + N29[label="29: A3.2" ] + N30[label="30: A3.2" ] + N31[label="31: A3.2" ] + N32[label="32: A3.2" ] + N33[label="33: A3.2" ] + N34[label="34: D8.3" ] + N35[label="35: B0.2" ] + N36[label="36: 64.1" ] + N37[label="37: 64.1" ] + N38[label="38: E8.3" ] + N39[label="39: CE.3" ] + N40[label="40: CE.3" ] + N41[label="41: 2B" ] + N42[label="42: 2B" ] + N43[label="43: 2B" ] + N44[label="44: 2B" ] + N45[label="45: 27" ] + N46[label="46: 27" ] + N47[label="47: F3.3" ] + N48[label="48: 27" ] + N49[label="49: 27" ] + N50[label="50: EB.3" ] + N51[label="51: E9.3" ] + N52[label="52: E9.3" ] + N53[label="53: 49" ] + N54[label="54: 49" ] + N55[label="55: 49" ] + N56[label="56: 49" ] + N57[label="57: CB.3" ] + N58[label="58: 3A" ] + N59[label="59: 3A" ] + N60[label="60: 3A" ] + N61[label="61: BC.2", shape=box, style=rounded ] + N62[label="62: BC.2", shape=box, style=rounded ] + N63[label="63: 4B", shape=box, style=rounded ] + + // Layers + { /*0*/ rank=min N0 } + { /*1*/ rank=same N1 } + { /*2*/ rank=same N2 } + { /*3*/ rank=same N3 } + { /*4*/ rank=same N4 } + { /*5*/ rank=same N5 } + { /*6*/ rank=same N6 } + { /*7*/ rank=same N7 } + { /*8*/ rank=same N8 } + { /*9*/ rank=same N9 N10 } + { /*10*/ rank=same N11 N12 } + { /*11*/ rank=same N13 N14 } + { /*12*/ rank=same N15 N16 } + { /*13*/ rank=same N17 N18 } + { /*14*/ rank=same N19 N20 } + { /*15*/ rank=same N21 } + { /*16*/ 
rank=same N22 } + { /*17*/ rank=same N23 } + { /*18*/ rank=same N24 N25 N26 N27 N28 } + { /*19*/ rank=same N29 N30 N31 N32 N33 } + { /*20*/ rank=same N34 N35 } + { /*21*/ rank=same N36 N37 N38 } + { /*22*/ rank=same N39 N40 N41 N42 N43 N44 } + { /*23*/ rank=same N45 N46 N47 N48 N49 N50 } + { /*24*/ rank=same N51 N52 N53 N54 N55 N56 N57 N58 N59 N60 } + { /*25*/ rank=same N61 N62 N63 } + + // Topology + N0 -> N1 + N1 -> N2 + N2 -> N3 + N3 -> N4 + N4 -> N5 + N5 -> N6 + N6 -> N7 + N7 -> N8 + N8 -> N9 + N8 -> N10 + N9 -> N11 + N10 -> N12 + N11 -> N13 + N12 -> N14 + N13 -> N15 + N14 -> N16 + N15 -> N17 + N16 -> N18 + N17 -> N19 + N18 -> N20 + N19 -> N21 + N20 -> N21 + N21 -> N22 + N22 -> N23 + N23 -> N24 + N23 -> N25 + N23 -> N26 + N23 -> N27 + N23 -> N28 + N24 -> N29 + N25 -> N30 + N26 -> N31 + N27 -> N32 + N28 -> N33 + N29 -> N34 + N30 -> N34 + N31 -> N34 + N32 -> N35 + N33 -> N35 + N34 -> N36 + N34 -> N37 + N34 -> N38 + N35 -> N38 + N36 -> N39 + N37 -> N40 + N38 -> N41 + N38 -> N42 + N38 -> N43 + N38 -> N44 + N39 -> N45 + N39 -> N46 + N39 -> N47 + N40 -> N48 + N40 -> N49 + N40 -> N47 + N41 -> N47 + N42 -> N47 + N43 -> N47 + N44 -> N50 + N45 -> N51 + N46 -> N52 + N47 -> N53 + N47 -> N54 + N47 -> N55 + N47 -> N56 + N47 -> N57 + N48 -> N57 + N49 -> N57 + N50 -> N58 + N50 -> N59 + N50 -> N60 + N50 -> N57 + N51 -> N61 + N51 -> N62 + N51 -> N63 + N52 -> N63 + N53 -> N63 + N54 -> N63 + N55 -> N63 + N56 -> N63 + N57 -> N63 + N58 -> N63 + N59 -> N63 + N60 -> N63 +} + diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10.svg b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10.svg new file mode 100644 index 000000000..cccff74be --- /dev/null +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/Topo-10.svg @@ -0,0 +1,902 @@ + + + + + + +%3 + + + +N0 + + +0: 37 + + + +N1 + +1: 4F + + + +N0->N1 + + + + + +N2 + +2: 95.2 + + + +N1->N2 + + + + + +N3 + +3: 1C + + + +N2->N3 + + + + + +N4 + +4: 0A + + + +N3->N4 + + + + + +N5 + +5: 03 + + + +N4->N5 + + + + + +N6 + +6: 7E.1 + + + +N5->N6 + + + + + +N7 + +7: B4.2 + + + +N6->N7 + + + + + +N8 + +8: BB.2 + + + +N7->N8 + + + + + +N9 + +9: 8F.1 + + + +N8->N9 + + + + + +N10 + +10: 8F.1 + + + +N8->N10 + + + + + +N11 + +11: 4C + + + +N9->N11 + + + + + +N12 + +12: 4C + + + +N10->N12 + + + + + +N13 + +13: 0F + + + +N11->N13 + + + + + +N14 + +14: 0F + + + +N12->N14 + + + + + +N15 + +15: 5A.1 + + + +N13->N15 + + + + + +N16 + +16: 5A.1 + + + +N14->N16 + + + + + +N17 + +17: 75.1 + + + +N15->N17 + + + + + +N18 + +18: 75.1 + + + +N16->N18 + + + + + +N19 + +19: AC.2 + + + +N17->N19 + + + + + +N20 + +20: AC.2 + + + +N18->N20 + + + + + +N21 + +21: 38 + + + +N19->N21 + + + + + +N20->N21 + + + + + +N22 + +22: 79.1 + + + +N21->N22 + + + + + +N23 + +23: FC.3 + + + +N22->N23 + + + + + +N24 + +24: 29 + + + +N23->N24 + + + + + +N25 + +25: 29 + + + +N23->N25 + + + + + +N26 + +26: 29 + + + +N23->N26 + + + + + +N27 + +27: 29 + + + +N23->N27 + + + + + +N28 + +28: 29 + + + +N23->N28 + + + + + +N29 + +29: A3.2 + + + +N24->N29 + + + + + +N30 + +30: A3.2 + + + +N25->N30 + + + + + +N31 + +31: A3.2 + + + +N26->N31 + + + + + +N32 + +32: A3.2 + + + +N27->N32 + + + + + +N33 + +33: A3.2 + + + +N28->N33 + + + + + +N34 + +34: D8.3 + + + +N29->N34 + + + + + +N30->N34 + + + + + +N31->N34 + + + + + +N35 + +35: B0.2 + + + +N32->N35 + + + + + +N33->N35 + + + + + +N36 + +36: 64.1 + + + +N34->N36 + + + + + +N37 + +37: 64.1 + + + +N34->N37 + + + + + +N38 + +38: E8.3 + + + +N34->N38 + + + + + +N35->N38 + + + + + +N39 + +39: CE.3 + + + +N36->N39 + + + + + +N40 + +40: CE.3 + + + +N37->N40 + + + + + +N41 
+ +41: 2B + + + +N38->N41 + + + + + +N42 + +42: 2B + + + +N38->N42 + + + + + +N43 + +43: 2B + + + +N38->N43 + + + + + +N44 + +44: 2B + + + +N38->N44 + + + + + +N45 + +45: 27 + + + +N39->N45 + + + + + +N46 + +46: 27 + + + +N39->N46 + + + + + +N47 + +47: F3.3 + + + +N39->N47 + + + + + +N40->N47 + + + + + +N48 + +48: 27 + + + +N40->N48 + + + + + +N49 + +49: 27 + + + +N40->N49 + + + + + +N41->N47 + + + + + +N42->N47 + + + + + +N43->N47 + + + + + +N50 + +50: EB.3 + + + +N44->N50 + + + + + +N51 + +51: E9.3 + + + +N45->N51 + + + + + +N52 + +52: E9.3 + + + +N46->N52 + + + + + +N53 + +53: 49 + + + +N47->N53 + + + + + +N54 + +54: 49 + + + +N47->N54 + + + + + +N55 + +55: 49 + + + +N47->N55 + + + + + +N56 + +56: 49 + + + +N47->N56 + + + + + +N57 + +57: CB.3 + + + +N47->N57 + + + + + +N48->N57 + + + + + +N49->N57 + + + + + +N50->N57 + + + + + +N58 + +58: 3A + + + +N50->N58 + + + + + +N59 + +59: 3A + + + +N50->N59 + + + + + +N60 + +60: 3A + + + +N50->N60 + + + + + +N61 + +61: BC.2 + + + +N51->N61 + + + + + +N62 + +62: BC.2 + + + +N51->N62 + + + + + +N63 + +63: 4B + + + +N51->N63 + + + + + +N52->N63 + + + + + +N53->N63 + + + + + +N54->N63 + + + + + +N55->N63 + + + + + +N56->N63 + + + + + +N57->N63 + + + + + +N58->N63 + + + + + +N59->N63 + + + + + +N60->N63 + + + + + diff --git a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/index.txt b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/index.txt index d38700b1b..893b78cc4 100644 --- a/doc/devel/dump/2024-04-08.Scheduler-LoadTest/index.txt +++ b/doc/devel/dump/2024-04-08.Scheduler-LoadTest/index.txt @@ -19,6 +19,18 @@ of tools in the `StressTestRig` generate report output and data visualisation as _Gnuplot_ script. Raw measurement data is stored as CSV (see 'csv.hpp'). +Breaking Point Testing +---------------------- +Topo-10:: + Topology of the processing load used as typical example for _breaking a schedule._ + This Graph with 64 nodes is generated by the pre-configured rules + `configureShape_chain_loadBursts()`; it starts with a single linear, yet »bursts« + into an excessively interconnected parallel graph roughly in the middle. Due to + the dependency connections, typically the capacity is first underused, followed + by a short and massive overload peak. Deliberately, there is not much room for + speed-up through parallelisation. + + Load Peak Testing ----------------- @@ -37,7 +49,7 @@ Dump-10:: Graph-10:: Plot from a parametric measurement series with the same settings as used for the Dump-10; settings in these range are known to produce clean processing - with no obivous irregular overhead. In this case here + with no obvious irregular overhead. 
In this case here + - the series is comprised of 40 measurements, covering a parameter range 10...100 - the free parameter is the number of jobs in a homogenous load peak diff --git a/tests/vault/gear/scheduler-stress-test.cpp b/tests/vault/gear/scheduler-stress-test.cpp index 710d229e8..f8df0672e 100644 --- a/tests/vault/gear/scheduler-stress-test.cpp +++ b/tests/vault/gear/scheduler-stress-test.cpp @@ -33,36 +33,17 @@ #include "lib/format-string.hpp" #include "lib/format-cout.hpp" #include "lib/test/diagnostic-output.hpp"//////////////////////////TODO work in distress -//#include "lib/format-string.hpp" -#include "lib/test/transiently.hpp" -//#include "lib/test/microbenchmark.hpp" -//#include "lib/util.hpp" - -//#include -//#include -#include +#include "lib/util.hpp" using test::Test; -//using std::move; -//using util::isSameObject; namespace vault{ namespace gear { namespace test { -// using lib::time::FrameRate; -// using lib::time::Offset; -// using lib::time::Time; using util::_Fmt; -// using std::vector; - using std::array; - - namespace { // Test definitions and setup... - - } - - + using util::isLimited; @@ -79,13 +60,12 @@ namespace test { virtual void run (Arg) { - //smokeTest(); -// setup_systematicSchedule(); -// verify_instrumentation(); -// search_breaking_point(); + smokeTest(); + setup_systematicSchedule(); + verify_instrumentation(); + search_breaking_point(); watch_expenseFunction(); -// investigateWorkProcessing(); - walkingDeadline(); + investigateWorkProcessing(); } @@ -376,12 +356,68 @@ namespace test { - /** @test TODO Investigate the relation of run time (expense) to input length. + /** @test Investigate the relation of run time (expense) to input length. + * - again use the integrated StressRig + * - this time overload the scheduler with a peak of uncorrelated jobs + * and watch the time and load required to work through this challenge + * - conduct a series of runs with random number of jobs (within bounds) + * - collect the observed data (as CSV), calculate a **linear regression model** + * - optionally generate a **Gnuplot** script for visualisation * @see vault::gear::bench::ParameterRange - * @todo WIP 1/24 🔁 define ⟶ 🔁 implement + * @see gnuplot-gen.hpp + * @todo WIP 4/24 ✔ define ⟶ ✔ implement */ void watch_expenseFunction() + { + MARK_TEST_FUN + + struct Setup + : StressRig, bench::LoadPeak_ParamRange_Evaluation + { + uint CONCURRENCY = 4; + uint REPETITIONS = 50; + + auto testLoad(Param nodes) + { + TestLoad testLoad{nodes}; + return testLoad.configure_isolated_nodes(); + } + + auto testSetup (TestLoad& testLoad) + { + return StressRig::testSetup(testLoad) + .withLoadTimeBase(2ms); + } + }; + + auto results = StressRig::with() + .perform (33,128); + + auto [socket,gradient,v1,v2,corr,maxDelta,stdev] = bench::linearRegression (results.param, results.time); + double avgConc = Setup::avgConcurrency (results); +/* + cout << "───═══───═══───═══───═══───═══───═══───═══───═══───═══───═══───"< 0.80); // clearly a linear correlated behaviour + CHECK (isLimited (0.4, gradient, 0.7)); // should be slightly above 0.5 (2ms and 4 threads => 0.5ms / Job) + CHECK (isLimited (3, socket, 9 )); // we have a spin-up and a shut-down both ~ 2ms plus some further overhead + + CHECK (avgConc > 3); // should be able to utilise 4 workers (minus the spin-up/shut-down phase) + } + + + + /** @test TODO build a load pattern to emolate a typical high work load + * @todo WIP 4/24 🔁 define ⟶ implement + */ + void + investigateWorkProcessing() { ComputationalLoad cpuLoad; cpuLoad.timeBase 
= 200us; @@ -415,49 +451,6 @@ cout << "time="<() - .perform (20,200); - - cout << "───═══───═══───═══───═══───═══───═══───═══───═══───═══───═══───"< testLoad{256}; testLoad.seedingRule(testLoad.rule().probability(0.6).minVal(2)) .pruningRule(testLoad.rule().probability(0.44)) @@ -466,21 +459,9 @@ cout << "time="<; struct Setup : StressRig @@ -530,16 +514,6 @@ SHOW_EXPR(stress) SHOW_EXPR(delta) SHOW_EXPR(time) } - - - - /** @test TODO - * @todo WIP 1/24 🔁 define ⟶ implement - */ - void - walkingDeadline() - { - } }; diff --git a/wiki/dump b/wiki/dump new file mode 120000 index 000000000..f3316dcee --- /dev/null +++ b/wiki/dump @@ -0,0 +1 @@ +../doc/devel/dump \ No newline at end of file diff --git a/wiki/renderengine.html b/wiki/renderengine.html index 144645c96..cc8746992 100644 --- a/wiki/renderengine.html +++ b/wiki/renderengine.html @@ -7317,12 +7317,20 @@ The primary scaling effects exploited to achieve this level of performance are t &rarr; [[Scheduler performance testing|SchedulerTest]] -
+
At first sight, the internals of [[Activity|RenderActivity]] processing may seem overwhelmingly complex -- especially since there is no active »processing loop« which might serve as a starting point for the understanding. It is thus necessary to restate the working mode of the Scheduler: it is an //accounting and direction service// for the //active// [[render workers|SchedulerWorker]]. Any processing happens stochastically and is driven by various kinds of events --
 * a //worker// becoming ready to perform further tasks
 * an external //IO event// {{red{12/23 only planned yet}}}
 * a //planning job// to add new elements to the schedule
 The last point highlights a //circular structure:// the planning job itself was part of the schedule, picked up by a worker and activated; this way, the system feeds itself.
+
+!Participants and Realms
+From the outside, the Scheduler appears as a service component, exposing two points-of-access: Jobs can be added to the schedule (planned), and a worker can retrieve the next instruction, which is either to perform an (opaque) computation, or to sleep for some given short period of time. Jobs are further distinguished into processing tasks, IO activities and meta jobs, which are related to the self-regulation of the scheduling process. On a closer look however, several distinct realms can be identified, each offering an unique perspective of operation.
+
+!!!The Activity Realm
+!!!Queues and Activation Events
+!!!The Worker Aspect
+!!!Process Control
 
 
@@ -7340,7 +7348,7 @@
 The way other parts of the system are built requires us to obtain guaranteed knowledge of a job's termination. It is possible to obtain that knowledge with some limited delay, but it needs to be absolutely reliable (violations lead to segfaults). The requirements stated above assume this can be achieved through //jobs with guaranteed execution.// Alternatively we could consider installing specific callbacks -- in this case the scheduler itself has to guarantee the invocation of these callbacks, even if the corresponding job fails or is never invoked. It doesn't seem there is any other option.
-
+
With the Scheduler testing effort [[#1344|https://issues.lumiera.org/ticket/1344]], several goals are pursued
 * by exposing the new scheduler implementation to excessive overload, its robustness can be assessed and defects can be spotted
 * with the help of a systematic, calibrated load, characteristic performance limits and breaking points can be established
@@ -7378,6 +7386,79 @@ A method to determine such a »''breaking point''« in a systematic way relies o
 Observing this breaking point in correlation with various load patterns will unveil performance characteristics and weak spots of the implementation.
 &rarr; [[Scheduler behaviour traits|SchedulerBehaviour]]
 
+Another, quite different avenue of testing is to investigate a ''steady full-load state'' of processing. Contrary to the //breaking point// technique discussed above, this method requires a fluid, homogeneous schedule, and effects of scaffolding, ramp-up and load adaptation should be minimised. By watching a constant flow of back-to-back processing, in a state of //saturation,// the boundary capabilities for throughput and parallelisation can be derived, ideally expressed as a model of processing efficiency. A setup for this kind of investigation is to challenge the scheduler with a massive load peak of predetermined size: a set of jobs //without any further interdependencies,// scheduled effectively instantaneously, so that the scheduler is immediately in a state of total overload. The actual measurement entails watching the time until this work load is completed, together with the individual job activation times during that period; the latter can be integrated to account for the //effective parallelism// and for the amount of time spent in //impeded state,// where at most single-threaded processing is observed. Challenging the Scheduler with a random series of such homogeneous load peaks allows building a correlation table and computing a linear regression model.
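+
+The evaluation step can be pictured as an ordinary least-squares fit of completion time over load peak size. The stand-alone sketch below merely illustrates the computation; the actual test rig uses its own regression helper, so names and details here are assumptions.
+{{{
+// Minimal ordinary-least-squares fit:  time ≈ socket + gradient·loadSize
+// (illustrative stand-in for the regression helper used by the test rig)
+#include <vector>
+#include <cmath>
+#include <cstddef>
+
+struct LinearModel { double socket, gradient, correlation; };
+
+LinearModel
+fitLinear (std::vector<double> const& x, std::vector<double> const& y)
+{
+    size_t n = x.size();
+    double sx=0, sy=0, sxx=0, syy=0, sxy=0;
+    for (size_t i=0; i<n; ++i)
+      {
+        sx += x[i];  sy += y[i];
+        sxx += x[i]*x[i];  syy += y[i]*y[i];  sxy += x[i]*y[i];
+      }
+    double cov  = sxy - sx*sy/n;               // un-normalised covariance
+    double varX = sxx - sx*sx/n;
+    double varY = syy - sy*sy/n;
+    double gradient = cov / varX;
+    double socket   = (sy - gradient*sx) / n;  // model offset at loadSize == 0
+    return {socket, gradient, cov / std::sqrt (varX*varY)};
+}
+}}}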
+
+! Observations
+!!!Breaking Point and Stress
+Several investigations to determine the »breaking point« of a schedule were conducted with the load topology depicted to the right. This load pattern is challenging on various levels. There are dependency chains leading from the single start node to the three exit nodes, and thus the order of processing must be strictly observed. Moreover, several nodes bear //no weight,// and so the processing for those jobs returns immediately, producing mostly administrative overhead. Some nodes however are attributed with a weight of up to 3.
+
+<html><img title="Load topology with 64 nodes joined into a dependency chain, used for »breaking point« search"  src="dump/2024-04-08.Scheduler-LoadTest/Topo-10.svg"  style="float:right; margin-left:2ex"/></html>For execution, this weight is loaded with a base time, for example ''500''µs. An //adapted schedule// is generated based on the //node layers,// and using a simplified heuristic to account both for the accumulated node weight found within a given level, and the ability for speed-up through concurrency. Nodes without a weight are assumed to take no time (a deliberate simplification), while possible parallelisation is applied solely as factor based on the node count, completely disregarding any concerns of »optimal stacking«. This leads to the following schedule:
+
+|! Level|!stepFac|!Schedule |
+| 0|  0.000|  0.000ms|
+| 1|  0.000|  0.000ms|
+| 2|  0.000|  0.000ms|
+| 3|  2.000|  1.000ms|
+| 4|  2.000|  1.000ms|
+| 5|  2.000|  1.000ms|
+| 6|  2.000|  1.000ms|
+| 7|  3.000|  1.500ms|
+| 8|  5.000|  2.500ms|
+| 9|  7.000|  3.500ms|
+| 10|  8.000|  4.000ms|
+| 11|  8.000|  4.000ms|
+| 12|  8.000|  4.000ms|
+| 13|  9.000|  4.500ms|
+| 14| 10.000|  5.000ms|
+| 15| 12.000|  6.000ms|
+| 16| 12.000|  6.000ms|
+| 17| 13.000|  6.500ms|
+| 18| 16.000|  8.000ms|
+| 19| 16.000|  8.000ms|
+| 20| 20.000| 10.000ms|
+| 21| 22.500| 11.250ms|
+| 22| 24.167| 12.083ms|
+| 23| 26.167| 13.083ms|
+| 24| 28.167| 14.083ms|
+| 25| 30.867| 15.433ms|
+| 26| 32.200| 16.100ms|
+The tests were typically performed with ''4 workers''. Thus e.g. at the beginning of Layer-3, a factor of 2 is added as increment, since Node-2 is attributed with weight≔2; in the .dot-diagram, the weight is appended as suffix behind the node hash, in this case ({{{2: 95.2}}}). With 500µs as weight base, the node(s) in Layer-3 will thus be scheduled at t=1ms. To discuss a more interesting example, Layer-19 holds 5 nodes with weight≔2. Scheduling for 4 workers allows parallelising 4 of these nodes, but requires another round for the remaining node; thus an increment of +4 is added at the beginning of Layer-20, scheduling the two following nodes ({{{34: D8.3}}}) and ({{{35: B0.2}}}) at t=10ms. These two nodes in Layer-20 carry a combined weight≡5 and are parallelised, thus allocating an offset of +5/2 · 500µs and placing Layer-21 at t=11.25ms.
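+
+Written out as code, one possible reading of this heuristic is shown below. This is a reconstruction from the numbers in the table (each layer contributes its combined weight, divided by an idealised concurrency factor n/⌈n/N⌉ for N workers); the actual test code may differ in detail.
+{{{
+// Reconstructed schedule heuristic (assumption, derived from the table above):
+// each layer adds  totalWeight / (n / ceil(n/N))  to the step factor.
+#include <vector>
+#include <cmath>
+
+struct Layer { double totalWeight; unsigned nodeCnt; };
+
+std::vector<double>
+scheduleOffsets (std::vector<Layer> const& layers, unsigned workers, double baseTime)
+{
+    std::vector<double> offset{0.0};              // start time of Layer-0
+    double stepFac = 0.0;
+    for (Layer const& l : layers)
+      {
+        double rounds = std::ceil (double(l.nodeCnt) / workers);
+        double conc   = rounds > 0?  l.nodeCnt / rounds : 1.0;
+        stepFac += l.totalWeight / conc;          // e.g. Layer-19:  10 / 2.5  ->  +4
+        offset.push_back (stepFac * baseTime);    // e.g. 20 · 0.5ms = 10ms for Layer-20
+      }
+    return offset;
+}
+}}}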
+
+This heuristic time allocation leads to a schedule which roughly follows the weight distribution, yet is deliberately unrealistic, since it neither considers any base effort nor fully accounts for the limited worker pool size. At the beginning, the Scheduler will thus be partially idle and waiting, while towards the end a massive short overload peak builds up, further exacerbated by the dependency constraints. The objective of this test is then to tighten or stretch this schedule by a constant ''stress factor'', and to search for the point at which a cascading, catastrophic slippage can be observed.
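+
+The search itself can be pictured as a simple bisection over the stress factor, as sketched below; this assumes that slippage sets in monotonically beyond some threshold and that a predicate is available which runs the scaled schedule and reports failure. The actual StressRig performs a more elaborate, repeated measurement.
+{{{
+// Sketch of a breaking-point search by bisection over the stress factor.
+// observeSlippage(stress) is assumed to run the schedule scaled by the given
+// factor and to report whether catastrophic slippage occurred.
+#include <functional>
+
+double
+searchBreakingPoint (std::function<bool(double)> observeSlippage,
+                     double lo =0.1, double hi =3.0, double epsilon =0.01)
+{
+    while (hi - lo > epsilon)
+      {
+        double stress = (lo + hi) / 2;
+        if (observeSlippage (stress))
+            hi = stress;       // schedule already breaks -> search below
+        else
+            lo = stress;       // schedule still holds    -> increase stress
+      }
+    return (lo + hi) / 2;
+}
+}}}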
+
+And such a well defined breaking point can indeed be determined reliably. However, initial experiments placed this event at a stress-factor just above 0.5 -- which is way off any expectation. The point in question is not the absolute amount of slippage, which is expectedly strong due to the overload peak at the end; rather, we are looking for the point at which the scheduler is unable to follow even the general outline of this schedule. The expectation would be that this happens close to the nominal schedule, which implies stress-factor ≡ 1. Further //instrumentation// was thus added to capture invocations of the actual processing function; values integrated from these events allowed conclusions about various aspects of the actual behaviour, especially
+* the actual job run times consistently showed a significant drift towards longer run times (slower execution) than calibrated
+* the effective average concurrency deviates systematically from the average speed-up factor assumed in the schedule generation; notably this deviation is stable over a wide range of stress factors, but obviously depends strongly on the actual load graph topology and worker pool size; it can thus be interpreted as a ''form factor'' describing the topological ability to map a given local node connectivity onto a scheduling situation.
+By virtue of the instrumentation, both effects can be determined //empirically// during the test run and compounded into a correction factor, which is applied to rescale the stress-factor value determined as the result. With this correction in place, the observed »breaking point« moved ''very close to 1.0''. This is considered an //important result;// both corrected effects relate to circumstances external to the Scheduler implementation -- which, beyond that, seems to handle timings and the control of the processing path //without significant further overhead.//
+<html><div style="clear: both"/></html>
+
+!!!Overload
+Once the Scheduler is //overloaded,// the actual schedule does not matter much any more -- within the confines of one important limitation: the deadlines. The schedule may slip, but when a given job is pushed beyond its deadline, the general promise of the schedule is //broken.// This rests on an important distinction: deadlines are hard, while start times can be shifted to accommodate. As long as the timings stay within the overall confines defined by the deadlines, the scheduling is able to absorb short load peaks. While in this mode of operation, no further timing waits are performed; rather, jobs are processed in the order defined by their start times, and when done with one job, a worker immediately retrieves the next job, which in a state of overload is likewise overdue. This setup thus allows observing the efficiency of the „mechanics“ of job invocation.
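+
+The dispatch decision under overload can thus be summarised as follows -- a simplified sketch of the rule just described, not the actual Activity-language code:
+{{{
+// Simplified dispatch rule: start times only define the processing order,
+// while deadlines remain hard.
+#include <chrono>
+using Clock = std::chrono::steady_clock;
+
+enum class Disposition { INVOKE, WAIT, DISCARD };
+
+Disposition
+dispatch (Clock::time_point start, Clock::time_point deadline, Clock::time_point now)
+{
+    if (now > deadline)
+        return Disposition::DISCARD;   // the promise of the schedule is broken
+    if (now < start)
+        return Disposition::WAIT;      // ahead of schedule -> perform a timing wait
+    return Disposition::INVOKE;        // overdue, yet within deadline -> process immediately
+}
+}}}
+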
+<html><img title="Load Peak with 8ms"  src="dump/2024-04-08.Scheduler-LoadTest/Graph13.svg"  style="float:right; width: 80ex; margin-left:2ex"/></html>
+The measurement shown to the right uses a pool of ''8 workers'' to process a load peak of jobs, each loaded with a processing function calibrated to run for ''8''ms. Workers are thus occupied with processing the job load for a significant amount of time -- and so the probability of workers asking for work at precisely the same time is low. Since the typical {{{ActivityTerm}}} for [[regular job processing|RenderOperationLogic]] entails dropping the {{{GroomingToken}}} prior to invocation of the actual {{{JobFunctor}}}, another worker can access the queues meanwhile and process Activities up to the invocation of the next {{{JobFunctor}}}.
+With such a work load, the //worker pull processing// plays to its full strength; since there is no »manager« thread, the administrative work is distributed evenly over all workers, and is performed on average without imposing any synchronisation overhead on other workers. Especially with larger load peaks, the concurrency converges towards the theoretical maximum of 8, as can be seen from the light blue vertical bars in the secondary diagram below (left Y scale: concurrency). The remaining headroom can be linked (by investigation of trace dumps) to the inevitable ramp-up and tear-down; the work capacity shows up with some distribution of random delay, and thus it typically takes several milliseconds until all workers have received their first task. Moreover, there is the overhead of the management work, which happens outside the measurement bracket placed around the invocation of the job function -- even though in this load scenario the management work, too, is done concurrently with the other workers' payload processing, it is not accounted as part of the payload effort, and thus reduces the average concurrency.
+
+The dependency of processing time on load size is clearly linear, with a very high correlation (0.98). A ''linear regression model'' indicates a gradient very close to the expected value of 1ms/job (8ms nominal job time distributed over 8 cores). The slight deviation is due to the fact that //actual job times// (&rarr; dark green dots) tend to diverge to higher values than calibrated, an effect consistently observed on this machine throughout this scheduler testing effort. An explanation might be that the calibration of the work load is done in a tight loop (single-threaded or multi-threaded does not make much of a difference here), while in the actual processing within the scheduler, some global slowdown is generated by cache misses, pipeline stalls and the coordination overhead caused by accessing the atomic {{{GroomingToken}}} variable. Moreover, the linear model indicates a socket overhead, which can largely be attributed to the ramp-up / tear-down phase, where -- inevitably -- not all workers can be put to work. In accordance with that theory, the socket overhead indeed increases with larger job times. The probabilistic capacity management employed in this Scheduler implementation adds a further socket overhead; the actual start of work depends on workers pulling further work, which, depending on the circumstances, happens more or less randomly at the beginning of each test run. Incidentally, there is further significant scaffolding overhead which is not accounted for in the numbers presented here: at start, the worker pool must be booted, the jobs for the processing load must be planned, and a dependency on a wake-up job will be maintained, prompting the controlling test-thread to collect the measurement data and to commence with the next run in the series.
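+
+For orientation, the expectation behind this gradient can be spelled out as a back-of-the-envelope model; this is a simplification which ignores ramp-up effects and assumes full saturation of all workers.
+{{{
+// Simplistic saturation model:  total time ≈ socket + jobs · jobTime / workers
+double
+expectedRuntime_ms (unsigned jobs, double jobTime_ms, unsigned workers, double socket_ms)
+{
+    return socket_ms + jobs * jobTime_ms / workers;   // gradient = jobTime/workers, here 8ms/8 = 1ms per job
+}
+}}}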
+
+Note also that all these investigations were performed with ''debug builds''.
+<html><div style="clear: both"/></html>
+<html><img title="Load Peak with 8ms"  src="dump/2024-04-08.Scheduler-LoadTest/Graph15.svg"  style="float:right; width: 80ex; margin-left:2ex"/></html>
+However, ''very short jobs'' cause ''significant loss of efficiency''.
+The example presented to the right uses a similar setup (''8 workers''), but reduces the calibrated job-time to only ''200''µs. This comes close to the general overhead required for retrieving and launching the next job, which amounts to a further 100µs in a debug build, including the ~30µs required solely to access, set and drop the {{{GroomingToken}}}. Consequently, only a small number of //other workers// get a chance to acquire the {{{GroomingToken}}} and to work through the Activities up to the next Job invocation before the first worker already returns -- causing a significant amount of ''contention'' on the {{{GroomingToken}}}. Now, the handling of work-pull requests in the Scheduler implementation is arranged in a way to prefer workers just returning from active processing. Thus (intentionally) only a small subset of the workers is able to pull work repeatedly, while the other workers will encounter a series of »contention {{{KICK}}}« events; an inbuilt //contention mitigation scheme// responds to this kind of repeated pull failure by interspersing sleep cycles, thereby effectively throttling down the {{{WorkForce}}} until contention events return to a tolerable level. This mitigation is important, since contention, even just on an atomic variable, can cause a significant global slow-down of the system.
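+
+The mitigation idea can be sketched as a per-worker back-off on repeated contention events; the code below only illustrates the principle, the actual WorkForce logic differs in detail.
+{{{
+// Illustration of the contention mitigation principle (not the real WorkForce code):
+// after repeated contention "KICK" events a worker backs off with increasingly
+// long sleep cycles, effectively throttling the pool until contention subsides.
+#include <chrono>
+#include <thread>
+#include <algorithm>
+
+class ContentionBackoff
+  {
+    unsigned kicks_ = 0;
+  public:
+    void onKick()                              // pull attempt failed due to contention
+      {
+        ++kicks_;
+        auto sleep = std::chrono::microseconds(100) * std::min (kicks_, 64u);
+        std::this_thread::sleep_for (sleep);   // progressively longer sleep cycles
+      }
+    void onWork() { kicks_ = 0; }              // a successful pull resets the back-off
+  };
+}}}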
+
+As a net effect, most of the load peaks are handled by just two workers, especially for larger load sizes; most of the available processing capacity remains unused for such short-running payloads. Moreover, on average a significant amount of time is spent in partially blocked or impeded operation (&rarr; light green circles), since administrative work must be done non-concurrently. Depending on the perspective, this can be seen as a weakness -- or as the result of a deliberate trade-off made by the choice of active work-pulling and a passive Scheduler.
+
+The actual average in-job time (&rarr; dark green dots) is offset significantly here, and closer to 400µs -- which is also confirmed by the gradient of the linear model (0.4ms / 2 threads ≙ 0.2ms/job). With shorter load sizes below 90 jobs, increased variance can be observed, and the measurements can no longer be subsumed under a single linear relation -- in fact, the data points seem to fall into several groups with differing, yet mostly linear correlation, which also explains the negative socket value of the overall computed model; using only the data points with > 90 jobs would yield a model with a slightly lower gradient but a positive offset of ~2ms.
+<html><div style="clear: both"/></html>
+Further measurement runs with other parameter values fit well in between the two extremes presented above. It can be concluded that this Scheduler implementation strongly favours larger job sizes, starting with several milliseconds, when it comes to working through an extended homogeneous work load without many job interdependencies. Such larger lot sizes can be handled efficiently and close to the expected limits, while very small jobs massively degrade the available performance. This can be attributed both to the choice of a randomised capacity distribution and to pull processing without a central manager.
+
+!!!Stationary Processing
+
+{{red{not yet written}}}
+
+
 
diff --git a/wiki/thinkPad.ichthyo.mm b/wiki/thinkPad.ichthyo.mm index 0c859332b..d85c924bc 100644 --- a/wiki/thinkPad.ichthyo.mm +++ b/wiki/thinkPad.ichthyo.mm @@ -111009,8 +111009,8 @@ Date:   Thu Apr 20 18:53:17 2023 +0200
- - + + @@ -116210,6 +116210,107 @@ std::cout << tmpl.render({"what", "World"}) << s + + + + + + + + + + + + + +

+ And this holds regardless of whether the calibration was done with short or long times, single- or multi-threaded. The deviation only appears in a real load context, and is (visually, judging from the diagrams) correlated with the degree of contention and irregularity in the execution. It tends to decrease for longer test runs, but converges, even for very large loads and very long runs, typically towards an offset of ~ +1ms
+

+ + +
+
+ + + + + + +

+ And this is to be expected on purely logical grounds already. When setting up the heuristic for the test schedule, I deliberately refrained from any optimal arrangement of the computation paths (no box-stacking problem to solve!). Added to this come the actual limitations of the worker pool. The result is a characteristic deviation between a theoretically computed concurrency speed-up (as factored into the schedule) and the empirically observed average concurrency. This is interpreted as a form factor.
+

+ + +
+
+
+ + + + + + + + + +

+ between load size and the run time to work through the generated load peak completely
+

+ + +
+
+ + + + + + + + +

+ gradient very close to the expected value
+

+ + +
+ + + + + +

+ when using the empirically observed effective concurrency and the real average job time
+

+ + +
+
+ + + + + + +

+ this means: the actually observed socket overhead depends on the length of the job load and on the concurrency: fundamentally, the full worker-pool size has to be added once at the beginning and once at the end, with reduced concurrency. This already follows from a purely logical consideration: »full load« can only be established once the first worker fetches its second job; analogously, the spin-down begins as soon as the first worker falls idle.
+

+ + +
+
+ + + +
+ + + + + + + + @@ -116922,6 +117023,10 @@ std::cout << tmpl.render({"what", "World"}) << s
+ + + + @@ -116981,6 +117086,21 @@ std::cout << tmpl.render({"what", "World"}) << s + + + + + + + + + + + + + + + @@ -124690,7 +124810,18 @@ unsigned int ThreadIdAsInt = *static_cast<unsigned int*>(static_cast<vo - + + + + +

+ size="10!"
+ ratio="expand" +

+

Note the exclamation mark; expand is also important, to preserve the aspect ratio. When scaling down strongly, one should add a generic node template with node[fontsize="16"]... then render with -Tsvg

+ +
+