2011-12-03 03:18:03 +01:00
|
|
|
|
/*
|
|
|
|
|
|
HashGenerator(Test) - hash value generation details
|
|
|
|
|
|
|
Copyright: clarify and simplify the file headers
* Lumiera source code always was copyrighted by individual contributors
* there is no entity "Lumiera.org" which holds any copyrights
* Lumiera source code is provided under the GPL Version 2+
== Explanations ==
Lumiera as a whole is distributed under Copyleft, GNU General Public License Version 2 or above.
For this to become legally effective, the ''File COPYING in the root directory is sufficient.''
The licensing header in each file is not strictly necessary, yet considered good practice;
attaching a licence notice increases the likeliness that this information is retained
in case someone extracts individual code files. However, it is not by the presence of some
text, that legally binding licensing terms become effective; rather the fact matters that a
given piece of code was provably copyrighted and published under a license. Even reformatting
the code, renaming some variables or deleting parts of the code will not alter this legal
situation, but rather creates a derivative work, which is likewise covered by the GPL!
The most relevant information in the file header is the notice regarding the
time of the first individual copyright claim. By virtue of this initial copyright,
the first author is entitled to choose the terms of licensing. All further
modifications are permitted and covered by the License. The specific wording
or format of the copyright header is not legally relevant, as long as the
intention to publish under the GPL remains clear. The extended wording was
based on a recommendation by the FSF. It can be shortened, because the full terms
of the license are provided alongside the distribution, in the file COPYING.
2024-11-17 23:42:55 +01:00
|
|
|
|
Copyright (C)
|
|
|
|
|
|
2011, Hermann Vosseler <Ichthyostega@web.de>
|
2011-12-03 03:18:03 +01:00
|
|
|
|
|
Copyright: clarify and simplify the file headers
* Lumiera source code always was copyrighted by individual contributors
* there is no entity "Lumiera.org" which holds any copyrights
* Lumiera source code is provided under the GPL Version 2+
== Explanations ==
Lumiera as a whole is distributed under Copyleft, GNU General Public License Version 2 or above.
For this to become legally effective, the ''File COPYING in the root directory is sufficient.''
The licensing header in each file is not strictly necessary, yet considered good practice;
attaching a licence notice increases the likeliness that this information is retained
in case someone extracts individual code files. However, it is not by the presence of some
text, that legally binding licensing terms become effective; rather the fact matters that a
given piece of code was provably copyrighted and published under a license. Even reformatting
the code, renaming some variables or deleting parts of the code will not alter this legal
situation, but rather creates a derivative work, which is likewise covered by the GPL!
The most relevant information in the file header is the notice regarding the
time of the first individual copyright claim. By virtue of this initial copyright,
the first author is entitled to choose the terms of licensing. All further
modifications are permitted and covered by the License. The specific wording
or format of the copyright header is not legally relevant, as long as the
intention to publish under the GPL remains clear. The extended wording was
based on a recommendation by the FSF. It can be shortened, because the full terms
of the license are provided alongside the distribution, in the file COPYING.
2024-11-17 23:42:55 +01:00
|
|
|
|
**Lumiera** is free software; you can redistribute it and/or modify it
|
|
|
|
|
|
under the terms of the GNU General Public License as published by the
|
|
|
|
|
|
Free Software Foundation; either version 2 of the License, or (at your
|
|
|
|
|
|
option) any later version. See the file COPYING for further details.
|
2011-12-03 03:18:03 +01:00
|
|
|
|
|
Copyright: clarify and simplify the file headers
* Lumiera source code always was copyrighted by individual contributors
* there is no entity "Lumiera.org" which holds any copyrights
* Lumiera source code is provided under the GPL Version 2+
== Explanations ==
Lumiera as a whole is distributed under Copyleft, GNU General Public License Version 2 or above.
For this to become legally effective, the ''File COPYING in the root directory is sufficient.''
The licensing header in each file is not strictly necessary, yet considered good practice;
attaching a licence notice increases the likeliness that this information is retained
in case someone extracts individual code files. However, it is not by the presence of some
text, that legally binding licensing terms become effective; rather the fact matters that a
given piece of code was provably copyrighted and published under a license. Even reformatting
the code, renaming some variables or deleting parts of the code will not alter this legal
situation, but rather creates a derivative work, which is likewise covered by the GPL!
The most relevant information in the file header is the notice regarding the
time of the first individual copyright claim. By virtue of this initial copyright,
the first author is entitled to choose the terms of licensing. All further
modifications are permitted and covered by the License. The specific wording
or format of the copyright header is not legally relevant, as long as the
intention to publish under the GPL remains clear. The extended wording was
based on a recommendation by the FSF. It can be shortened, because the full terms
of the license are provided alongside the distribution, in the file COPYING.
2024-11-17 23:42:55 +01:00
|
|
|
|
* *****************************************************************/
|
2011-12-03 03:18:03 +01:00
|
|
|
|
|
2017-02-22 01:54:20 +01:00
|
|
|
|
/** @file hash-generator-test.cpp
|
2017-02-22 03:17:18 +01:00
|
|
|
|
** unit test \ref HashGenerator_test
|
2016-11-03 18:20:10 +01:00
|
|
|
|
*/
|
|
|
|
|
|
|
2011-12-03 03:18:03 +01:00
|
|
|
|
|
|
|
|
|
|
#include "lib/test/run.hpp"
|
|
|
|
|
|
#include "lib/util.hpp"
|
|
|
|
|
|
|
|
|
|
|
|
#include <boost/functional/hash.hpp>
|
|
|
|
|
|
#include <boost/lexical_cast.hpp>
|
|
|
|
|
|
#include <iostream>
|
|
|
|
|
|
#include <string>
|
|
|
|
|
|
#include <map>
|
|
|
|
|
|
|
|
|
|
|
|
using boost::lexical_cast;
|
|
|
|
|
|
using util::contains;
|
|
|
|
|
|
using std::string;
|
|
|
|
|
|
using std::cout;
|
|
|
|
|
|
using std::endl;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace lib {
|
|
|
|
|
|
namespace test{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2013-10-24 23:06:36 +02:00
|
|
|
|
/***********************************************************************//**
|
2011-12-03 03:18:03 +01:00
|
|
|
|
* @test cover various detail aspects regarding hash value generation
|
|
|
|
|
|
* - weakness of boost::hash
|
|
|
|
|
|
*
|
|
|
|
|
|
* @see HashIndexed_test
|
2025-06-07 23:59:57 +02:00
|
|
|
|
* @see EntryID_test
|
2011-12-03 03:18:03 +01:00
|
|
|
|
*/
|
|
|
|
|
|
class HashGenerator_test : public Test
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
virtual void
|
|
|
|
|
|
run (Arg)
|
|
|
|
|
|
{
|
2024-11-13 02:23:23 +01:00
|
|
|
|
seedRand();
|
2011-12-03 03:18:03 +01:00
|
|
|
|
demonstrate_boost_hash_weakness();
|
2015-08-15 05:31:50 +02:00
|
|
|
|
verify_Knuth_workaround();
|
2011-12-03 03:18:03 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-08-15 05:31:50 +02:00
|
|
|
|
typedef boost::hash<string> BoostStringHasher;
|
|
|
|
|
|
typedef std::map<size_t, string> StringsTable;
|
|
|
|
|
|
|
|
|
|
|
|
|
2011-12-03 03:18:03 +01:00
|
|
|
|
/** @test demonstrate a serious weakness of boost::hash for strings.
|
|
|
|
|
|
* When hashing just the plain string representation of integers,
|
|
|
|
|
|
* we get collisions already with small numbers below 100000.
|
|
|
|
|
|
* This is counter-intuitive, as the generated hash values
|
|
|
|
|
|
* are 17 digits long and could span much wider scale.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This problem is especially dangerous when storing objects keyed
|
|
|
|
|
|
* by a string-id, which is generated from running numbers.
|
2018-04-29 03:15:57 +02:00
|
|
|
|
* @remark as of 2018 the boost::hash function does not show this weakness anymore
|
2011-12-03 03:18:03 +01:00
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
demonstrate_boost_hash_weakness ()
|
|
|
|
|
|
{
|
|
|
|
|
|
BoostStringHasher hashFunction;
|
|
|
|
|
|
StringsTable hashValues;
|
|
|
|
|
|
string prefix = "Entry.";
|
|
|
|
|
|
uint collisions(0);
|
|
|
|
|
|
for (uint i=0; i<100000; ++i)
|
|
|
|
|
|
{
|
|
|
|
|
|
string candidate = prefix + lexical_cast<string> (i);
|
|
|
|
|
|
size_t hashVal = hashFunction(candidate);
|
|
|
|
|
|
|
|
|
|
|
|
if (contains (hashValues, hashVal))
|
|
|
|
|
|
{
|
|
|
|
|
|
++collisions;
|
|
|
|
|
|
string other = hashValues[hashVal];
|
|
|
|
|
|
cout << "Duplicate at "<< i << endl;
|
|
|
|
|
|
cout << "existing--->" << other << endl;
|
|
|
|
|
|
cout << "new-------->" << candidate << endl;
|
|
|
|
|
|
|
|
|
|
|
|
size_t exHash = hashFunction(other);
|
|
|
|
|
|
size_t newHash = hashFunction(candidate);
|
|
|
|
|
|
cout << "hash-ex---->" << exHash << endl;
|
|
|
|
|
|
cout << "hash_new--->" << newHash << endl;
|
|
|
|
|
|
}
|
|
|
|
|
|
hashValues[hashVal] = candidate;
|
|
|
|
|
|
}
|
2018-04-29 03:15:57 +02:00
|
|
|
|
if (0 < collisions)
|
|
|
|
|
|
cout << "boost::hash for strings produced "<<collisions<<" collisions. This is a known problem."<<endl;
|
|
|
|
|
|
else
|
|
|
|
|
|
cout << "SURPRISE. No collisions with the boost::hash function." <<endl;
|
2011-12-03 03:18:03 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-08-15 05:31:50 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** @test verify a well-known pragmatic trick to help with unevenly spaced hash values.
|
|
|
|
|
|
* The boost::hash function is known to perform poorly on strings with common prefix
|
|
|
|
|
|
* plus running number. The mentioned trick (attributed to Donald Knuth) is spread the
|
|
|
|
|
|
* input numbers by something below the full domain, best close to the golden ratio;
|
|
|
|
|
|
* bonus points if this number is also a prime. An additional factor of 2 does not hurt
|
|
|
|
|
|
* (so in case of 64bit platform).
|
|
|
|
|
|
*
|
2015-08-27 20:42:44 +02:00
|
|
|
|
* In our case, it is sufficient to apply this trick to the trailing four digits;
|
2015-08-15 05:31:50 +02:00
|
|
|
|
* without this trick, we get the first collisions after about 20000 running numbers.
|
2015-08-27 20:42:44 +02:00
|
|
|
|
* @note on x86_64, even just spreading the trailing two digits seem to be sufficient
|
|
|
|
|
|
* to remove any collisions from the first 100000 numbers.
|
2015-08-15 05:31:50 +02:00
|
|
|
|
* @see BareEntryID
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
verify_Knuth_workaround()
|
|
|
|
|
|
{
|
|
|
|
|
|
StringsTable hashValues;
|
|
|
|
|
|
string prefix = "Entry.";
|
2024-11-13 02:23:23 +01:00
|
|
|
|
const size_t seed = rani();
|
2015-08-15 05:31:50 +02:00
|
|
|
|
|
|
|
|
|
|
const size_t KNUTH_MAGIC = 2654435761;
|
|
|
|
|
|
|
|
|
|
|
|
uint collisions(0);
|
2015-08-27 20:42:44 +02:00
|
|
|
|
for (uint i=0; i<20000; ++i)
|
2015-08-15 05:31:50 +02:00
|
|
|
|
{
|
|
|
|
|
|
string candidate = prefix + lexical_cast<string> (i);
|
|
|
|
|
|
size_t l = candidate.length();
|
|
|
|
|
|
size_t hashVal = seed;
|
|
|
|
|
|
|
|
|
|
|
|
boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-1]);
|
|
|
|
|
|
boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-2]);
|
2015-08-27 20:42:44 +02:00
|
|
|
|
boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-3]);
|
|
|
|
|
|
boost::hash_combine(hashVal, KNUTH_MAGIC * candidate[l-4]);
|
2015-08-15 05:31:50 +02:00
|
|
|
|
boost::hash_combine(hashVal, candidate);
|
|
|
|
|
|
|
|
|
|
|
|
if (contains (hashValues, hashVal))
|
|
|
|
|
|
{
|
|
|
|
|
|
++collisions;
|
|
|
|
|
|
string other = hashValues[hashVal];
|
|
|
|
|
|
cout << "Hash collision between " << i << " and " << other <<endl;
|
|
|
|
|
|
}
|
|
|
|
|
|
hashValues[hashVal] = candidate;
|
|
|
|
|
|
}
|
|
|
|
|
|
CHECK (!collisions, "the Knuth trick failed to spread our hash values evenly enough, what a shame...");
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2011-12-03 03:18:03 +01:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Register this test class... */
|
|
|
|
|
|
LAUNCHER (HashGenerator_test, "unit common");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}} // namespace lib::test
|