diff --git a/src/lib/idi/entry-id.hpp b/src/lib/idi/entry-id.hpp index edca5c30f..4579fc438 100644 --- a/src/lib/idi/entry-id.hpp +++ b/src/lib/idi/entry-id.hpp @@ -77,6 +77,14 @@ namespace idi { using lib::HashVal; namespace { + /** lousy old tinkerer's trick: + * hash values with poor distribution can be improved + * by spreading the input with something close to the golden ratio. + * Additionally, the scaling factor (for hashing) should be prime. + * 2^32 * (√5-1)/2 = 2654435769.49723 + */ + const size_t KNUTH_MAGIC = 2654435761; + /** build up a hash value, packaged as LUID. * @param sym symbolic ID-string to be hashed @@ -92,16 +100,22 @@ namespace idi { * conjunction with LUID. How to create a LuidH instance, if not generating * a new random value. How to make EntryID and asset::Ident interchangeable, /////////TICKET #739 * which would require both to yield the same hash values.... - * @warning there is a weakness in boost::hash for strings of running numbers, causing - * collisions already for a small set with less than 100000 entries. - * To ameliorate the problem, we hash the symbol twice /////////TICKET #865 + * @warning there is a weakness in boost::hash for strings of running numbers, + * causing collisions already for a small set with less than 100000 entries. + * To ameliorate the problem, we hash in the trailing digits, and + * spread them by the #KNUTH_MAGIC /////////TICKET #865 * @warning this code isn't portable and breaks if sizeof(size_t) < sizeof(void*) + * @see HashGenerator_test#verify_Knuth_workaround */ inline LuidH buildHash (string const& sym, HashVal seed =0) { + size_t l = sym.length(); + if (l > 1) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-1]); + if (l > 2) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-2]); + if (l > 3) boost::hash_combine(seed, KNUTH_MAGIC * sym[l-3]); ////////////////////////TICKET #865 + boost::hash_combine(seed, sym); - boost::hash_combine(seed, sym); ////////////////////////TICKET #865 lumiera_uid tmpLUID; lumiera_uid_set_ptr (&tmpLUID, reinterpret_cast (seed)); return reinterpret_cast (tmpLUID); diff --git a/src/lib/idi/genfunc.hpp b/src/lib/idi/genfunc.hpp index 657f34d7c..45cc4f434 100644 --- a/src/lib/idi/genfunc.hpp +++ b/src/lib/idi/genfunc.hpp @@ -108,6 +108,8 @@ namespace idi { /** build a per-type unique identifier. * @return a type based prefix, followed by an instance number * @note we use the short prefix without namespace, not necessarily unique + * @todo consequently the generated IDs might clash for two distinct types, + * which generate the same \c namePrefix(). Is this a problem? * @warning this operation is not exactly cheap; it acquires a lock * for the counter and, after increasing and dropping the lock, * it builds and uses a boost::format instance. diff --git a/tests/library/entry-id-test.cpp b/tests/library/entry-id-test.cpp index 670f1f861..354849640 100644 --- a/tests/library/entry-id-test.cpp +++ b/tests/library/entry-id-test.cpp @@ -258,20 +258,25 @@ namespace test{ typedef std::unordered_map Hashtable; /** @test build a hashtable, using EntryID as key, - * thereby using the embedded hash-ID */ + * thereby using the embedded hash-ID + * @note there is a known weakness of the boost::hash + * when used on IDs with a running number suffix. /////TICKET #587 + * We use a trick to spread the numbers better. + * @see HashGenerator_test#verify_Knuth_workaround + */ void buildHashtable () { Hashtable tab; - for (uint i=0; i<10000; ++i) //////////////////TICKET #865 hash collisions for 100000 entries + for (uint i=0; i<100000; ++i) { DummyID dummy; tab[dummy] = string(dummy); } CHECK (and_all (tab, verifyEntry)); - CHECK (10000 == tab.size()); + CHECK (100000 == tab.size()); } diff --git a/tests/library/hash-generator-test.cpp b/tests/library/hash-generator-test.cpp index 1d35d9e8b..930c60824 100644 --- a/tests/library/hash-generator-test.cpp +++ b/tests/library/hash-generator-test.cpp @@ -58,9 +58,14 @@ namespace test{ run (Arg) { demonstrate_boost_hash_weakness(); + verify_Knuth_workaround(); } + typedef boost::hash BoostStringHasher; + typedef std::map StringsTable; + + /** @test demonstrate a serious weakness of boost::hash for strings. * When hashing just the plain string representation of integers, * we get collisions already with small numbers below 100000. @@ -73,9 +78,6 @@ namespace test{ void demonstrate_boost_hash_weakness () { - typedef boost::hash BoostStringHasher; - typedef std::map StringsTable; - BoostStringHasher hashFunction; StringsTable hashValues; string prefix = "Entry."; @@ -103,6 +105,50 @@ namespace test{ CHECK (0 < collisions, "boost::hash for strings is expected to produce collisions"); } + + + /** @test verify a well-known pragmatic trick to help with unevenly spaced hash values. + * The boost::hash function is known to perform poorly on strings with common prefix + * plus running number. The mentioned trick (attributed to Donald Knuth) is spread the + * input numbers by something below the full domain, best close to the golden ratio; + * bonus points if this number is also a prime. An additional factor of 2 does not hurt + * (so in case of 64bit platform). + * + * In our case, it is sufficient to apply this trick to the trailing two digits; + * without this trick, we get the first collisions after about 20000 running numbers. + * @see BareEntryID + */ + void + verify_Knuth_workaround() + { + StringsTable hashValues; + string prefix = "Entry."; + const size_t seed = rand(); + + const size_t KNUTH_MAGIC = 2654435761; + + uint collisions(0); + for (uint i=0; i<100000; ++i) + { + string candidate = prefix + lexical_cast (i); + size_t l = candidate.length(); + size_t hashVal = seed; + + boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-1]); + boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-2]); + boost::hash_combine(hashVal, candidate); + + if (contains (hashValues, hashVal)) + { + ++collisions; + string other = hashValues[hashVal]; + cout << "Hash collision between " << i << " and " << other <