From 150fdea7a0ad63897f8359e565a178794f34fa9b Mon Sep 17 00:00:00 2001
From: Ichthyostega <prg@ichthyostega.de>
Date: Sat, 15 Aug 2015 05:31:50 +0200
Subject: [PATCH] improve spread of the hash function used for EntryID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

basically this is the well known problem #587
Just it became more pressing with the Upgrade to Jessie and Boost 1.55
So I've pulled off the well known "Knuth trick" to spread the
input data more evenly within the hash domain.

And voilà: now we're able to use 100000 number suffixes without collision
---
 src/lib/idi/entry-id.hpp              | 22 +++++++++---
 src/lib/idi/genfunc.hpp               |  2 ++
 tests/library/entry-id-test.cpp       | 11 ++++--
 tests/library/hash-generator-test.cpp | 52 +++++++++++++++++++++++++--
 4 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/src/lib/idi/entry-id.hpp b/src/lib/idi/entry-id.hpp
index edca5c30f..4579fc438 100644
--- a/src/lib/idi/entry-id.hpp
+++ b/src/lib/idi/entry-id.hpp
@@ -77,6 +77,14 @@ namespace idi {
   using lib::HashVal;
   
   namespace {
+    /** lousy old tinkerer's trick:
+     *  hash values with poor distribution can be improved
+     *  by spreading the input with something close to the golden ratio.
+     *  Additionally, the scaling factor (for hashing) should be prime.
+     *  2^32 * (√5-1)/2 = 2654435769.49723
+     */
+    const size_t KNUTH_MAGIC = 2654435761;
+    
     
     /** build up a hash value, packaged as LUID.
      *  @param sym symbolic ID-string to be hashed
@@ -92,16 +100,22 @@ namespace idi {
      *        conjunction with LUID. How to create a LuidH instance, if not generating
      *        a new random value. How to make EntryID and asset::Ident interchangeable,  /////////TICKET #739
      *        which would require both to yield the same hash values....
-     *  @warning there is a weakness in boost::hash for strings of running numbers, causing
-     *        collisions already for a small set with less than 100000 entries.
-     *        To ameliorate the problem, we hash the symbol twice                        /////////TICKET #865
+     *  @warning there is a weakness in boost::hash for strings of running numbers,
+     *        causing collisions already for a small set with less than 100000 entries.
+     *        To ameliorate the problem, we hash in the trailing digits, and
+     *        spread them by the #KNUTH_MAGIC                                            /////////TICKET #865
      *  @warning this code isn't portable and breaks if sizeof(size_t) < sizeof(void*)
+     *  @see HashGenerator_test#verify_Knuth_workaround
      */
     inline LuidH
     buildHash (string const& sym, HashVal seed =0)
     {
+      size_t l = sym.length();
+      if (l > 1) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-1]);
+      if (l > 2) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-2]);
+      if (l > 3) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-3]);       ////////////////////////TICKET #865
+      
       boost::hash_combine(seed, sym);
-      boost::hash_combine(seed, sym);                                     ////////////////////////TICKET #865
       lumiera_uid tmpLUID;
       lumiera_uid_set_ptr (&tmpLUID, reinterpret_cast<void*> (seed));
       return reinterpret_cast<LuidH&> (tmpLUID);
diff --git a/src/lib/idi/genfunc.hpp b/src/lib/idi/genfunc.hpp
index 657f34d7c..45cc4f434 100644
--- a/src/lib/idi/genfunc.hpp
+++ b/src/lib/idi/genfunc.hpp
@@ -108,6 +108,8 @@ namespace idi {
   /** build a per-type unique identifier.
    * @return a type based prefix, followed by an instance number
    * @note we use the short prefix without namespace, not necessarily unique
+   * @todo consequently the generated IDs might clash for two distinct types,
+   *          which generate the same \c namePrefix(). Is this a problem?
    * @warning this operation is not exactly cheap; it acquires a lock
    *          for the counter and, after increasing and dropping the lock,
    *          it builds and uses a boost::format instance.
diff --git a/tests/library/entry-id-test.cpp b/tests/library/entry-id-test.cpp
index 670f1f861..354849640 100644
--- a/tests/library/entry-id-test.cpp
+++ b/tests/library/entry-id-test.cpp
@@ -258,20 +258,25 @@ namespace test{
       typedef std::unordered_map<DummyID, string, DummyID::UseEmbeddedHash> Hashtable;
       
       /** @test build a hashtable, using EntryID as key,
-       *        thereby using the embedded hash-ID */
+       *        thereby using the embedded hash-ID
+       *  @note there is a known weakness of the boost::hash
+       *        when used on IDs with a running number suffix. /////TICKET #587
+       *        We use a trick to spread the numbers better.
+       *  @see  HashGenerator_test#verify_Knuth_workaround
+       */
       void
       buildHashtable ()
         {
           Hashtable tab;
           
-          for (uint i=0; i<10000; ++i)         //////////////////TICKET #865 hash collisions for 100000 entries
+          for (uint i=0; i<100000; ++i)
             {
               DummyID dummy;
               tab[dummy] = string(dummy);
             }
           
           CHECK (and_all (tab, verifyEntry));
-          CHECK (10000 == tab.size());
+          CHECK (100000 == tab.size());
         }
       
       
diff --git a/tests/library/hash-generator-test.cpp b/tests/library/hash-generator-test.cpp
index 1d35d9e8b..930c60824 100644
--- a/tests/library/hash-generator-test.cpp
+++ b/tests/library/hash-generator-test.cpp
@@ -58,9 +58,14 @@ namespace test{
       run (Arg)
         {
           demonstrate_boost_hash_weakness();
+          verify_Knuth_workaround();
         }
       
       
+      typedef boost::hash<string> BoostStringHasher;
+      typedef std::map<size_t, string> StringsTable;
+      
+      
       /** @test demonstrate a serious weakness of boost::hash for strings.
        * When hashing just the plain string representation of integers,
        * we get collisions already with small numbers below 100000.
@@ -73,9 +78,6 @@ namespace test{
       void
       demonstrate_boost_hash_weakness ()
         {
-          typedef boost::hash<string> BoostStringHasher;
-          typedef std::map<size_t, string> StringsTable;
-          
           BoostStringHasher hashFunction;
           StringsTable hashValues;
           string prefix = "Entry.";
@@ -103,6 +105,50 @@ namespace test{
           CHECK (0 < collisions, "boost::hash for strings is expected to produce collisions");
         }
       
+      
+      
+      /** @test verify a well-known pragmatic trick to help with unevenly spaced hash values.
+       * The boost::hash function is known to perform poorly on strings with common prefix
+       * plus running number. The mentioned trick (attributed to Donald Knuth) is spread the
+       * input numbers by something below the full domain, best close to the golden ratio;
+       * bonus points if this number is also a prime. An additional factor of 2 does not hurt
+       * (so in case of 64bit platform).
+       * 
+       * In our case, it is sufficient to apply this trick to the trailing two digits;
+       * without this trick, we get the first collisions after about 20000 running numbers.
+       * @see BareEntryID
+       */
+      void
+      verify_Knuth_workaround()
+        {
+          StringsTable hashValues;
+          string prefix = "Entry.";
+          const size_t seed = rand();
+          
+          const size_t KNUTH_MAGIC = 2654435761;
+          
+          uint collisions(0);
+          for (uint i=0; i<100000; ++i)
+            {
+              string candidate = prefix + lexical_cast<string> (i);
+              size_t l = candidate.length();
+              size_t hashVal = seed;
+              
+              boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-1]);
+              boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-2]);
+              boost::hash_combine(hashVal, candidate);
+              
+              if (contains (hashValues, hashVal))
+                {
+                  ++collisions;
+                  string other = hashValues[hashVal];
+                  cout << "Hash collision between " << i << " and " << other <<endl;
+                }
+              hashValues[hashVal] = candidate;
+            }
+          CHECK (!collisions, "the Knuth trick failed to spread our hash values evenly enough, what a shame...");
+        }
+      
     };