DI: set up framework for investigation of performance impact

We are about to switch to Double Checked Locking with C++11 atomics,
and we want some rough numbers regarding the impact.
This commit is contained in:
Fischlurch 2018-03-23 23:42:10 +01:00
parent 364dcd5291
commit 3104016cf2
4 changed files with 263 additions and 7 deletions

View file

@ -53,6 +53,7 @@ typedef unsigned int uint;
#include "lib/util.hpp"
#define SHOW_TYPE(_TY_) \
cout << "typeof( " << STRINGIFY(_TY_) << " )= " << lib::meta::typeStr<_TY_>() <<endl;
#define SHOW_EXPR(_XX_) \
@ -64,19 +65,111 @@ typedef unsigned int uint;
using lib::Depend;
using lib::DependInject;
//////////////////////////////////////////////////////////////////////////Microbenchmark
#include "backend/thread-wrapper.hpp"
#include <chrono>
#include <vector>
namespace {
constexpr size_t NUM_MEASUREMENTS = 10000000;
constexpr double SCALE = 1e6; // Results are in µ sec
}
/** perform a multithreaded microbenchmark.
 *  This function fires up a number of threads
 *  and invokes the given test subject repeatedly.
 * @tparam nThreads number of threads to run in parallel
 * @param subject `void(void)` function to be timed
 * @return the averaged invocation time in _microseconds_
 * @remarks - the subject function will be _copied_ into each thread
 *          - so `nThreads` copies of this function will run in parallel
 *          - consider locking if this function accesses a shared closure.
 *          - if you pass a lambda, it is eligible for inlining followed
 *            by loop optimisation -- be sure to include some action, like
 *            e.g. accessing a volatile variable, to prevent the compiler
 *            from optimising it away entirely.
 */
template<size_t nThreads, class FUN>
double
microbenchmark(FUN const& subject)
{
  using backend::ThreadJoinable;
  using std::chrono::steady_clock;   // monotonic clock -- unlike system_clock it is
                                     // immune to wall-clock adjustments during the run
  using Dur = std::chrono::duration<double>;
  
  struct Thread
    : ThreadJoinable
    {
      Thread(FUN const& subject)
        : ThreadJoinable("Micro-Benchmark"
                        ,[subject, this]() // local copy of the test-subject-Functor
                                  {
                                    syncPoint(); // block until all threads are ready
                                    auto start = steady_clock::now();
                                    for (size_t i=0; i < NUM_MEASUREMENTS; ++i)
                                      subject();
                                    duration = steady_clock::now() - start;
                                  })
        { }
      
      /** measured time within thread */
      Dur duration{};
    };
  
  std::vector<Thread> threads;
  threads.reserve(nThreads);  // NOTE: mandatory -- the thread lambda captured `this`,
                              //       so the Thread objects must never be relocated
  for (size_t n=0; n<nThreads; ++n)          // create test threads
    threads.emplace_back (subject);
  
  for (auto& thread : threads)
    thread.sync();                           // start timing measurement
  
  Dur sumDuration{0.0};
  for (auto& thread : threads)
    {
      thread.join();                         // block on measurement end
      sumDuration += thread.duration;
    }
  // normalise to a single invocation, scaled to microseconds
  return sumDuration.count() / (nThreads * NUM_MEASUREMENTS) * SCALE;
}
//////////////////////////////////////////////////////////////////////////(End)Microbenchmark
#include "include/lifecycle.h"
#include "lib/test/testoption.hpp"
#include "lib/test/suite.hpp"
using lumiera::LifecycleHook;
using lumiera::ON_GLOBAL_INIT;
using lumiera::ON_GLOBAL_SHUTDOWN;
///////////////////////////////////////////////////////Usage
int
main (int, char**)
{
std::srand(std::time(nullptr));
LifecycleHook::trigger (ON_GLOBAL_INIT);
// DependInject<long>::useSingleton ([&] { return "long{rand() % 100}"; });
// DependInject<long>::Local<std::string> dummy ([&]{ return new long{rand() % 100}; });
cout << "rrrrrr.."<< Depend<long>{}() <<endl;
volatile int blackHole{0};
cout << "pling..."<<endl;
cout << "plong..."<< microbenchmark<8> ([&]()
{
//volatile int dummy =0;
//dummy == 0;
//++dummy;
blackHole == 0;
//++blackHole;
})
<< endl;
cout << "........"<< blackHole/8<<endl;
LifecycleHook::trigger (ON_GLOBAL_SHUTDOWN);
cout << "\n.gulp.\n";
return 0;

View file

@ -108,7 +108,6 @@ namespace backend {
* If this doesn't happen, you'll block forever.
*/
class Thread
: boost::noncopyable
{
protected:
@ -179,7 +178,14 @@ namespace backend {
LumieraThread thread_;
// Threads can be default constructed (inactive) and moved
Thread() : thread_(0) { }
Thread (Thread &&) = default;
// Threads must not be copied and assigned
Thread (Thread const&) = delete;
Thread& operator= (Thread &&) = delete;
Thread& operator= (Thread const&) = delete;
public:

View file

@ -1927,17 +1927,17 @@ As we don't have a Prolog interpreter on board yet, we utilize a mock store with
{{{default(Obj)}}} is a predicate expressing that the object {{{Obj}}} can be considered the default setup under the given conditions. Using the //default// can be considered as a shortcut for actually finding an exact and unique solution. The latter would require to specify all sorts of detailed properties up to the point where only one single object can satisfy all conditions. On the other hand, leaving some properties unspecified would yield a set of solutions (and the user code issuing the query had to provide means for selecting one solution from this set). Just falling back on the //default// means that the user code actually doesn't care for any additional properties (as long as the properties he //does// care for are satisfied). Nothing is said specifically on //how//&amp;nbsp; this default gets configured; actually there can be rules //somewhere,// and, additionally, anything encountered once while asking for a default can be re-used as default under similar circumstances.
&amp;rarr; [[implementing defaults|DefaultsImplementation]]</pre>
</div>
<div title="DependencyFactory" creator="Ichthyostega" modifier="Ichthyostega" created="201803110155" modified="201803181619" tags="def Concepts draft" changecount="10">
<div title="DependencyFactory" creator="Ichthyostega" modifier="Ichthyostega" created="201803110155" modified="201803232220" tags="def Concepts draft" changecount="23">
<pre>//Access point to dependencies by-name.//
In the Lumiera code base, we refrain from building or using a full-blown Dependency Injection Container. A lot of FUD has been spread regarding Dependency Injection and Singletons, to the point that a majority of developers confuses and conflates the ~Inversion-of-Control principle (which is essential) with the use of a ~DI-Container. Today, you can not even mention the word &quot;Singleton&quot; without everyone yelling out &quot;Evil! Evil!&quot; -- while most of these people just feel comfortable living in the annotation hell.
In the Lumiera code base, we refrain from building or using a full-blown Dependency Injection Container. A lot of FUD has been spread regarding Dependency Injection and Singletons, to the point that a majority of developers confuses and conflates the ~Inversion-of-Control principle (which is essential) with the use of a ~DI-Container. Today, you can not even mention the word &quot;Singleton&quot; without everyone yelling out &quot;Evil! Evil!&quot; -- while most of these people just feel comfortable living in the metadata hell.
Not Singletons as such are problematic -- rather, the coupling of the Singleton class itself with the instantiation and lifecycle mechanism is what creates the problems. In C++ these problems can be mitigated by use of a generic //Singleton Factory// -- which can be augmented into a DependencyFactory for those rare cases where we actually need more instance and lifecycle management beyond lazy initialisation. Client code indicates the dependence on some other service by planting an instance of that Dependency Factory (for Lumiera this is {{{lib::Depend&lt;TY&gt;}}}) and remain unaware if the instance is created lazily in singleton style (which is the default) or has been reconfigured to expose a service instance explicitly created by some subsystem lifecycle.
Not Singletons as such are problematic -- rather, the coupling of the Singleton class itself with the instantiation and lifecycle mechanism is what creates the problems. In C++ these problems can be mitigated by use of a generic //Singleton Factory// -- which can be augmented into a DependencyFactory for those rare cases where we actually need more instance and lifecycle management beyond lazy initialisation. Client code indicates the dependence on some other service by planting an instance of that Dependency Factory (for Lumiera this is {{{lib::Depend&lt;TY&gt;}}}) and remains unaware if the instance is created lazily in singleton style (which is the default) or has been reconfigured to expose a service instance explicitly created by some subsystem lifecycle. The //essence of a &quot;dependency&quot;// of this kind is that we ''access a service //by name//''. And this service name or service ID is in our case a //type name.//
!Requirements
Our DependencyFactory satisfies the following requirements
* client code is able to access some service //by-name// -- where the name is actually the //type name// of the service interface.
* client code remains agnostic with regard to the lifecycle or backing context of the service it relies on
* in the simplest (and most prominent case), //nothing// has to be done at all by anyone to manage that lifecycle. By default, the DependencyFactory creates a singleton instance lazily in static memory on demand and ensures thread-safe initialisation and access.
* in the simplest (and most prominent case), //nothing// has to be done at all by anyone to manage that lifecycle.&lt;br/&gt;By default, the DependencyFactory creates a singleton instance lazily (heap allocated) on demand and ensures thread-safe initialisation and access.
* we establish a policy to ''disallow any significant functionality during application shutdown''. After leaving {{{main()}}}, only trivial dtors are invoked and possibly a few resource handles are dropped. No filesystem writes, no clean-up and reorganisation, not even any logging is allowed. For this reason, we established a [[Subsystem]] concept with explicit shutdown hooks, which are invoked beforehand.
* the DependencyFactory can be re-configured for individual services (type names) to refer to an explicitly installed service instance. In those cases, access while the service is not available will raise an exception. There is a simple one-shot mechanism to reconfigure DependencyFactory and create a link to an actual service implementation, including automatic deregistration.
@ -1963,9 +1963,24 @@ Deliberately, we do not enforce global consistency statically (since that would
:the next access will create a (non singleton) {{{SubBlah}}} instance in heap memory and return a {{{Blah&amp;}}}
:the generated object again acts as lifecycle handle and smart-ptr to access the {{{SubBlah}}} instance like {{{mock-&gt;doItSpecial()}}}
:when this handle goes out of scope, the original configuration of the dependency factory is restored
;custom constructors
:both the subclass singleton configuration and the test mock support optionally accept a functor or lambda argument with signature {{{SubBlah*()}}}.
:the contract is for this construction functor to return a heap allocated object, which will be owned and managed by the DependencyFactory.
:especially this enables use of subclasses with non default ctor and / or binding to some additional hidden context.
:please note //that this closure will be invoked later, on-demand.//
We consider the usage pattern of dependencies a question of architecture rather -- such can not be solved by any mechanism on implementation level.
We consider the usage pattern of dependencies a question of architecture rather -- such can not be solved by any mechanism at implementation level.
For this reason, DependencyFactory prevents reconfiguration after use, but does nothing exceeding such basic sanity checks
!!!Performance considerations
We acknowledge that such a dependency or service will be accessed frequently and even from rather performance critical parts of the application. We have to optimise for low overhead on access, while initialisation happens only once and can be arbitrarily expensive. At which point precisely initialisation happens is a question of architecture -- lazy initialisation can be used to avoid expensive setup of rarely used services, or it can be employed to simplify the bootstrap of complex subsystems, or to break service dependency cycles. All of this builds on the assumption that the global application structure is fixed and finite and well-known -- we assume we are in full control about when and how parts of the application start and stop working.
Our requirements on (optional) reconfigurability have some impact on the implementation technique though, since we need access to the instance pointer for individual service types. This basically rules out //Meyers Singleton// -- and so the adequate implementation technique for our usage pattern is //Double Checked Locking.// In the past, there was much debate about DCL being broken -- which indeed was true when //assuming full portability and arbitrary target platform.// Since our focus is primarily on ~PC-with-Linux systems, this argument seems rather theoretical though, since the x86/64 platform is known to employ rather strong memory and cache coherency constraints. With the advent of ARM systems, the situation has changed however. Anyway, since C++11 there is a portable solution for writing a correct DCL implementation, based on {{{std::atomic}}}.
To give some idea of the rough proportions of performance impact, in 2018 we conducted some micro benchmarks (using an 8-core AMD 64bit processor running Debian/Jessie building with GCC 4.9)
The following table lists averaged results in relative numbers
| !Access Technique |&gt;| !development |&gt;| !optimised |
|~| single threaded|multithreaded | single threaded|multithreaded |
</pre>
</div>
<div title="DesignDecisions" modifier="Ichthyostega" created="200801062209" modified="201505310104" tags="decision design discuss Concepts" changecount="5">

View file

@ -26468,6 +26468,148 @@
</node>
</node>
</node>
<node CREATED="1521843763852" ID="ID_1208981523" MODIFIED="1521843769399" TEXT="Microbenchmarks">
<node CREATED="1521843772026" ID="ID_1334641753" MODIFIED="1521843776448" TEXT="selber schreiben">
<icon BUILTIN="ksmiletris"/>
</node>
<node CREATED="1521843783161" ID="ID_1151208744" MODIFIED="1521843819176" TEXT="mehrere Threads unterst&#xfc;tzen"/>
<node BACKGROUND_COLOR="#eee5c3" COLOR="#990000" CREATED="1521843822020" ID="ID_1862348515" MODIFIED="1521844094107" TEXT="Library-Funktion">
<icon BUILTIN="pencil"/>
<node CREATED="1521843838865" ID="ID_1649689781" MODIFIED="1521843944839" TEXT="bekommt eigentlichen Testcode als Lambda">
<icon BUILTIN="idea"/>
</node>
<node CREATED="1521843849712" ID="ID_579604510" MODIFIED="1521843940495" STYLE="fork" TEXT="verwendet std::chrono::duration&lt;double&gt;">
<icon BUILTIN="info"/>
<node CREATED="1521843873173" ID="ID_952548619" MODIFIED="1521843938941" TEXT="mi&#xdf;t micro-Ticks"/>
<node CREATED="1521843880155" ID="ID_1245721460" MODIFIED="1521843938941" TEXT="Ergebnis f&#xe4;llt in Sekunden"/>
</node>
<node CREATED="1521843959433" ID="ID_133306105" MODIFIED="1521844083428" TEXT="verwendet Lumiera&apos;s Threading-Framework">
<richcontent TYPE="NOTE"><html>
<head>
</head>
<body>
<p>
man h&#228;tte genausogut std::future und std::async verwenden k&#246;nnen.
</p>
<p>
Vorteil von unseren Framework:
</p>
<ul>
<li>
wir haben es schon, und wir werden es verwenden, wegen den Threadpools
</li>
<li>
man baut ein Objekt f&#252;r einen Thread. Das ist explizit und sauber
</li>
<li>
wir haben eine eingebaute Barriere und k&#246;nnen unseren Objekt-Monitor nutzen
</li>
</ul>
</body>
</html>
</richcontent>
<icon BUILTIN="yes"/>
</node>
<node COLOR="#338800" CREATED="1521843907144" ID="ID_848033940" MODIFIED="1521843935500" TEXT="Ergebnis normieren auf einzelnen Aufruf">
<icon BUILTIN="button_ok"/>
</node>
<node COLOR="#338800" CREATED="1521843926501" ID="ID_984510403" MODIFIED="1521843934725" TEXT="Ergebnis in Mikrosekunden">
<icon BUILTIN="button_ok"/>
</node>
<node COLOR="#338800" CREATED="1521844261439" ID="ID_1538240853" MODIFIED="1521844284174" TEXT="Korrektheit der Zeitangaben verifiziert">
<richcontent TYPE="NOTE"><html>
<head>
</head>
<body>
<p>
habe einen usleep(1000) getimed
</p>
</body>
</html>
</richcontent>
<icon BUILTIN="button_ok"/>
</node>
</node>
<node CREATED="1521844101045" ID="ID_1065212487" MODIFIED="1521844108488" TEXT="Erfahrungen">
<node CREATED="1521844121043" ID="ID_1041692639" MODIFIED="1521844128201" TEXT="Optimizer">
<icon BUILTIN="messagebox_warning"/>
<node CREATED="1521844504174" ID="ID_318676926" MODIFIED="1521844520954" TEXT="Optimierung per -O3 ist sehr deutlich me&#xdf;bar">
<icon BUILTIN="idea"/>
</node>
<node CREATED="1521844130561" ID="ID_1541346658" MODIFIED="1521844139739" TEXT="&#xfc;bergebene Lambdas werden tats&#xe4;chlich ge-inlined"/>
<node CREATED="1521844157909" ID="ID_1833250915" MODIFIED="1521844656552" TEXT="wenn wir in der Loop messen, messen wir die Aufrufe von chrono::system_clock mit">
<richcontent TYPE="NOTE"><html>
<head>
</head>
<body>
<p>
daher messen wir die Loop als Ganzes.
</p>
<p>
Es gibt daher keine M&#246;glichkeit, den Loop-Overhead selber zu messen.
</p>
<p>
Er sollte sich aber bei einer Wiederholung im Millionenbereich gut amortisieren
</p>
<p>
</p>
<p>
Au&#223;erdem ist ja auch noch der Aufruf des Funktors mit im Spiel, wenngleich der auch typischerweise geinlined wird
</p>
</body>
</html>
</richcontent>
<icon BUILTIN="messagebox_warning"/>
<node CREATED="1521844236539" ID="ID_176799135" MODIFIED="1521844248411" TEXT="tats&#xe4;chlich verifiziert">
<icon BUILTIN="idea"/>
</node>
<node CREATED="1521844204015" ID="ID_188310893" MODIFIED="1521844251027" TEXT="Gr&#xf6;&#xdf;enordnung 10 ns">
<icon BUILTIN="info"/>
</node>
</node>
<node CREATED="1521844141248" ID="ID_328319936" MODIFIED="1521844156769" TEXT="wenn man nicht aufpa&#xdf;t, wird die ganze Loop wegoptimiert"/>
<node CREATED="1521844306601" ID="ID_126627605" MODIFIED="1521844338425" TEXT="Beste L&#xf6;sung">
<node CREATED="1521844339269" ID="ID_1885600326" MODIFIED="1521844361265">
<richcontent TYPE="NODE"><html>
<head>
</head>
<body>
<p>
volatile Variable <i>au&#223;en,</i>&#160;im Aufrufkontext
</p>
</body>
</html>
</richcontent>
</node>
<node CREATED="1521844366944" ID="ID_1258575129" MODIFIED="1521844373484" TEXT="Zugriff via Closure und Referenz"/>
<node CREATED="1521844374503" ID="ID_1548234021" MODIFIED="1521844386090" TEXT="diese Variable mit Konstante vergleichen"/>
<node CREATED="1521844391605" ID="ID_1603234514" MODIFIED="1521844411462" TEXT="Lokale volatile Variable: Initialisierung kostet +5ns"/>
<node CREATED="1521844412602" ID="ID_1758142813" MODIFIED="1521844418949" TEXT="Inkrementieren kostet +10ns"/>
<node CREATED="1521844419657" ID="ID_1892679887" MODIFIED="1521844482141" TEXT="konkurrentes Inkrementieren auf globale Volatile: Faktor 100 !!!!">
<richcontent TYPE="NOTE"><html>
<head>
</head>
<body>
<p>
...was sehr sch&#246;n beweist,
</p>
<p>
da&#223; x86_64 tats&#228;chlich cache-koh&#228;rent ist
</p>
</body>
</html>
</richcontent>
</node>
</node>
</node>
</node>
</node>
</node>
<node CREATED="1482524641484" ID="ID_1651495185" MODIFIED="1518487921096" TEXT="Architektur"/>
<node CREATED="1482524498822" ID="ID_431883229" MODIFIED="1518487921096" TEXT="Datenstrom"/>