| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120 |
- /*
- * Brute force collision tester for 64-bit hashes
- * Part of the xxHash project
- * Copyright (C) 2019-2020 Yann Collet
- *
- * GPL v2 License
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * You can contact the author at:
- * - xxHash homepage: https://www.xxhash.com
- * - xxHash source repository: https://github.com/Cyan4973/xxHash
- */
- /*
- * The collision tester will generate 24 billion hashes (by default),
- * and count how many collisions were produced by the 64-bit hash algorithm.
- * The optimal amount of collisions for 64-bit is ~18 collisions.
- * A good hash should be close to this figure.
- *
- * This program requires a lot of memory:
- * - Either store hash values directly => 192 GB
- * - Or use a filter:
- * - 32 GB (by default) for the filter itself
- * - + ~14 GB for the list of hashes (depending on the filter's outcome)
- * Due to these memory constraints, it requires a 64-bit system.
- */
- /* === Dependencies === */
- #include <stdint.h> /* uint64_t */
- #include <stdlib.h> /* malloc, free, qsort, exit */
- #include <string.h> /* memset */
- #include <stdio.h> /* printf, fflush */
- #undef NDEBUG /* ensure assert is _not_ disabled */
- #include <assert.h>
- #include "hashes.h" /* UniHash, hashfn, hashfnTable */
- #include "sort.hh" /* sort64 */
- typedef enum { ht32, ht64, ht128 } Htype_e;
- /* === Debug === */
- #define EXIT(...) { printf(__VA_ARGS__); printf("\n"); exit(1); }
- static void hexRaw(const void* buffer, size_t size)
- {
- const unsigned char* p = (const unsigned char*)buffer;
- for (size_t i=0; i<size; i++) {
- printf("%02X", p[i]);
- }
- }
- void printHash(const void* table, size_t n, Htype_e htype)
- {
- if ((htype == ht64) || (htype == ht32)){
- uint64_t const h64 = ((const uint64_t*)table)[n];
- hexRaw(&h64, sizeof(h64));
- } else {
- assert(htype == ht128);
- XXH128_hash_t const h128 = ((const XXH128_hash_t*)table)[n];
- hexRaw(&h128, sizeof(h128));
- }
- }
- /* === Generate Random unique Samples to hash === */
- /*
- * These functions will generate and update a sample to hash.
- * initSample() will fill a buffer with random bytes,
- * updateSample() will modify one slab in the input buffer.
- * updateSample() guarantees it will produce unique samples,
- * but it needs to know the total number of samples.
- */
- static const uint64_t prime64_1 = 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
- static const uint64_t prime64_2 = 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
- static const uint64_t prime64_3 = 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
- static uint64_t avalanche64(uint64_t h64)
- {
- h64 ^= h64 >> 33;
- h64 *= prime64_2;
- h64 ^= h64 >> 29;
- h64 *= prime64_3;
- h64 ^= h64 >> 32;
- return h64;
- }
- static unsigned char randomByte(size_t n)
- {
- uint64_t n64 = avalanche64(n+1);
- n64 *= prime64_1;
- return (unsigned char)(n64 >> 56);
- }
- typedef enum { sf_slab5, sf_sparse } sf_genMode;
- #ifdef SLAB5
- /*
- * Slab5 sample generation.
- * This algorithm generates unique inputs flipping on average 16 bits per candidate.
- * It is generally much more friendly for most hash algorithms, especially
- * weaker ones, as it shuffles more the input.
- * The algorithm also avoids overfitting the per4 or per8 ingestion patterns.
- */
- #define SLAB_SIZE 5
- typedef struct {
- void* buffer;
- size_t size;
- sf_genMode mode;
- size_t prngSeed;
- uint64_t hnb;
- } sampleFactory;
- static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
- {
- uint64_t const minNbSlabs = ((htotal-1) >> 32) + 1;
- uint64_t const minSize = minNbSlabs * SLAB_SIZE;
- if (sf->size < minSize)
- EXIT("sample size must be >= %i bytes for this amount of hashes",
- (int)minSize);
- unsigned char* const p = (unsigned char*)sf->buffer;
- for (size_t n=0; n < sf->size; n++)
- p[n] = randomByte(n);
- sf->hnb = 0;
- }
- static sampleFactory*
- create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
- {
- sampleFactory* const sf = malloc(sizeof(sampleFactory));
- if (!sf) EXIT("not enough memory");
- void* const buffer = malloc(size);
- if (!buffer) EXIT("not enough memory");
- sf->buffer = buffer;
- sf->size = size;
- sf->mode = sf_slab5;
- sf->prngSeed = seed;
- init_sampleFactory(sf, htotal);
- return sf;
- }
- static void free_sampleFactory(sampleFactory* sf)
- {
- if (!sf) return;
- free(sf->buffer);
- free(sf);
- }
- static inline void update_sampleFactory(sampleFactory* sf)
- {
- size_t const nbSlabs = sf->size / SLAB_SIZE;
- size_t const SlabNb = sf->hnb % nbSlabs;
- sf->hnb++;
- char* const ptr = (char*)sf->buffer;
- size_t const start = (SlabNb * SLAB_SIZE) + 1;
- uint32_t val32;
- memcpy(&val32, ptr+start, sizeof(val32));
- static const uint32_t prime32_5 = 374761393U;
- val32 += prime32_5;
- memcpy(ptr+start, &val32, sizeof(val32));
- }
- #else
- /*
- * Sparse sample generation.
- * This is the default pattern generator.
- * It only flips one bit at a time (mostly).
- * Low hamming distance scenario is more difficult for weak hash algorithms.
- * Note that CRC is immune to this scenario, since they are specifically
- * designed to detect low hamming distances.
- * Prefer the Slab5 pattern generator for collisions on CRC algorithms.
- */
- #define SPARSE_LEVEL_MAX 15
- /* Nb of combinations of m bits in a register of n bits */
- static double Cnm(int n, int m)
- {
- assert(n > 0);
- assert(m > 0);
- assert(m <= m);
- double acc = 1;
- for (int i=0; i<m; i++) {
- acc *= n - i;
- acc /= 1 + i;
- }
- return acc;
- }
- static int enoughCombos(size_t size, uint64_t htotal)
- {
- if (size < 2) return 0; /* ensure no multiplication by negative */
- uint64_t acc = 0;
- uint64_t const srcBits = size * 8; assert(srcBits < INT_MAX);
- int nbBitsSet = 0;
- while (acc < htotal) {
- nbBitsSet++;
- if (nbBitsSet >= SPARSE_LEVEL_MAX) return 0;
- acc += (uint64_t)Cnm((int)srcBits, nbBitsSet);
- }
- return 1;
- }
- typedef struct {
- void* buffer;
- size_t size;
- sf_genMode mode;
- /* sparse */
- size_t bitIdx[SPARSE_LEVEL_MAX];
- int level;
- size_t maxBitIdx;
- /* slab5 */
- size_t nbSlabs;
- size_t current;
- size_t prngSeed;
- } sampleFactory;
- static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
- {
- if (!enoughCombos(sf->size, htotal)) {
- EXIT("sample size must be larger for this amount of hashes");
- }
- memset(sf->bitIdx, 0, sizeof(sf->bitIdx));
- sf->level = 0;
- unsigned char* const p = (unsigned char*)sf->buffer;
- for (size_t n=0; n<sf->size; n++)
- p[n] = randomByte(sf->prngSeed + n);
- }
- static sampleFactory*
- create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
- {
- sampleFactory* const sf = malloc(sizeof(sampleFactory));
- if (!sf) EXIT("not enough memory");
- void* const buffer = malloc(size);
- if (!buffer) EXIT("not enough memory");
- sf->buffer = buffer;
- sf->size = size;
- sf->mode = sf_sparse;
- sf->maxBitIdx = size * 8;
- sf->prngSeed = seed;
- init_sampleFactory(sf, htotal);
- return sf;
- }
- static void free_sampleFactory(sampleFactory* sf)
- {
- if (!sf) return;
- free(sf->buffer);
- free(sf);
- }
- static void flipbit(void* buffer, uint64_t bitID)
- {
- size_t const pos = bitID >> 3;
- unsigned char const mask = (unsigned char)(1 << (bitID & 7));
- unsigned char* const p = (unsigned char*)buffer;
- p[pos] ^= mask;
- }
- static int updateBit(void* buffer, size_t* bitIdx, int level, size_t max)
- {
- if (level==0) return 0; /* can't progress further */
- flipbit(buffer, bitIdx[level]); /* erase previous bits */
- if (bitIdx[level] < max-1) { /* simple case: go to next bit */
- bitIdx[level]++;
- flipbit(buffer, bitIdx[level]); /* set new bit */
- return 1;
- }
- /* reached last bit: need to update a bit from lower level */
- if (!updateBit(buffer, bitIdx, level-1, max-1)) return 0;
- bitIdx[level] = bitIdx[level-1] + 1;
- flipbit(buffer, bitIdx[level]); /* set new bit */
- return 1;
- }
- static inline void update_sampleFactory(sampleFactory* sf)
- {
- if (!updateBit(sf->buffer, sf->bitIdx, sf->level, sf->maxBitIdx)) {
- /* no more room => move to next level */
- sf->level++;
- assert(sf->level < SPARSE_LEVEL_MAX);
- /* set new bits */
- for (int i=1; i <= sf->level; i++) {
- sf->bitIdx[i] = (size_t)(i-1);
- flipbit(sf->buffer, sf->bitIdx[i]);
- }
- }
- }
- #endif /* pattern generator selection */
- /* === Candidate Filter === */
- typedef unsigned char Filter;
- Filter* create_Filter(int bflog)
- {
- assert(bflog < 64 && bflog > 1);
- size_t bfsize = (size_t)1 << bflog;
- Filter* bf = malloc(bfsize);
- assert(((void)"Filter creation failed", bf));
- memset(bf, 0, bfsize);
- return bf;
- }
- void free_Filter(Filter* bf)
- {
- free(bf);
- }
- #ifdef FILTER_1_PROBE
- /*
- * Attach hash to a slot
- * return: Nb of potential collision candidates detected
- * 0: position not yet occupied
- * 2: position previously occupied by a single candidate
- * 1: position already occupied by multiple candidates
- */
- inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
- {
- int const slotNb = hash & 3;
- int const shift = slotNb * 2 ;
- size_t const bfmask = ((size_t)1 << bflog) - 1;
- size_t const pos = (hash >> 2) & bfmask;
- int const existingCandidates = ((((unsigned char*)bf)[pos]) >> shift) & 3;
- static const int addCandidates[4] = { 0, 2, 1, 1 };
- static const int nextValue[4] = { 1, 2, 3, 3 };
- ((unsigned char*)bf)[pos] |= (unsigned char)(nextValue[existingCandidates] << shift);
- return addCandidates[existingCandidates];
- }
- /*
- * Check if provided 64-bit hash is a collision candidate
- * Requires the slot to be occupied by at least 2 candidates.
- * return >0 if hash is a collision candidate
- * 0 otherwise (slot unoccupied, or only one candidate)
- * note: unoccupied slots should not happen in this algorithm,
- * since all hashes are supposed to have been inserted at least once.
- */
- inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
- {
- int const slotNb = hash & 3;
- int const shift = slotNb * 2;
- size_t const bfmask = ((size_t)1 << bflog) - 1;
- size_t const pos = (hash >> 2) & bfmask;
- return (((const unsigned char*)bf)[pos]) >> (shift+1) & 1;
- }
- #else
- /*
- * 2-probes strategy,
- * more efficient at filtering candidates,
- * requires filter size to be > nb of hashes
- */
- #define MIN(a,b) ((a) < (b) ? (a) : (b))
- #define MAX(a,b) ((a) > (b) ? (a) : (b))
- /*
- * Attach hash to 2 slots
- * return: Nb of potential candidates detected
- * 0: position not yet occupied
- * 2: position previously occupied by a single candidate (at most)
- * 1: position already occupied by multiple candidates
- */
- static inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
- {
- hash = avalanche64(hash);
- unsigned const slot1 = hash & 255;
- hash >>= 8;
- unsigned const slot2 = hash & 255;
- hash >>= 8;
- size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
- size_t const cacheLineNb = hash & fclmask;
- size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
- unsigned const shift1 = (slot1 & 3) * 2;
- unsigned const ex1 = (bf[pos1] >> shift1) & 3;
- size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
- unsigned const shift2 = (slot2 & 3) * 2;
- unsigned const ex2 = (bf[pos2] >> shift2) & 3;
- unsigned const existing = MIN(ex1, ex2);
- static const int addCandidates[4] = { 0, 2, 1, 1 };
- static const unsigned nextValue[4] = { 1, 2, 3, 3 };
- bf[pos1] &= (Filter)(~(3 << shift1)); /* erase previous value */
- bf[pos1] |= (Filter)(MAX(ex1, nextValue[existing]) << shift1);
- bf[pos2] |= (Filter)(MAX(ex2, nextValue[existing]) << shift2);
- return addCandidates[existing];
- }
- /*
- * Check if provided 64-bit hash is a collision candidate
- * Requires the slot to be occupied by at least 2 candidates.
- * return >0 if hash is a collision candidate
- * 0 otherwise (slot unoccupied, or only one candidate)
- * note: unoccupied slots should not happen in this algorithm,
- * since all hashes are supposed to have been inserted at least once.
- */
- static inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
- {
- hash = avalanche64(hash);
- unsigned const slot1 = hash & 255;
- hash >>= 8;
- unsigned const slot2 = hash & 255;
- hash >>= 8;
- size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
- size_t const cacheLineNb = hash & fclmask;
- size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
- unsigned const shift1 = (slot1 & 3) * 2;
- unsigned const ex1 = (bf[pos1] >> shift1) & 3;
- size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
- unsigned const shift2 = (slot2 & 3) * 2;
- unsigned const ex2 = (bf[pos2] >> shift2) & 3;
- return (ex1 >= 2) && (ex2 >= 2);
- }
- #endif // FILTER_1_PROBE
- /* === Display === */
- #include <time.h> /* clock_t, clock, time_t, time, difftime */
- void update_indicator(uint64_t v, uint64_t total)
- {
- static clock_t start = 0;
- if (start==0) start = clock();
- clock_t const updateRate = CLOCKS_PER_SEC / 2;
- clock_t const clockSpan = (clock_t)(clock() - start);
- if (clockSpan > updateRate) {
- start = clock();
- assert(v <= total);
- assert(total > 0);
- double share = ((double)v / (double)total) * 100;
- printf("%6.2f%% (%llu) \r", share, (unsigned long long)v);
- fflush(NULL);
- }
- }
- /* note: not thread safe */
- const char* displayDelay(double delay_s)
- {
- static char delayString[50];
- memset(delayString, 0, sizeof(delayString));
- int const mn = ((int)delay_s / 60) % 60;
- int const h = (int)delay_s / 3600;
- int const sec = (int)delay_s % 60;
- char* p = delayString;
- if (h) sprintf(p, "%i h ", h);
- if (mn || h) {
- p = delayString + strlen(delayString);
- sprintf(p, "%i mn ", mn);
- }
- p = delayString + strlen(delayString);
- sprintf(p, "%is ", sec);
- return delayString;
- }
- /* === Math === */
- static double power(uint64_t base, int p)
- {
- double value = 1;
- assert(p>=0);
- for (int i=0; i<p; i++) {
- value *= (double)base;
- }
- return value;
- }
- static double estimateNbCollisions(uint64_t nbH, int nbBits)
- {
- return ((double)nbH * (double)(nbH-1)) / power(2, nbBits+1);
- }
- static int highestBitSet(uint64_t v)
- {
- assert(v!=0);
- int bitId = 0;
- while (v >>= 1) bitId++;
- return bitId;
- }
- /* === Filter and search collisions === */
- #undef NDEBUG /* ensure assert is not disabled */
- #include <assert.h>
- /* will recommend 24 billion samples for 64-bit hashes,
- * expecting 18 collisions for a good 64-bit hash */
- #define NB_BITS_MAX 64 /* can't store nor analyze hash wider than 64-bits for the time being */
- uint64_t select_nbh(int nbBits)
- {
- assert(nbBits > 0);
- if (nbBits > NB_BITS_MAX) nbBits = NB_BITS_MAX;
- double targetColls = (double)((128 + 17) - (nbBits * 2));
- uint64_t nbH = 24;
- while (estimateNbCollisions(nbH, nbBits) < targetColls) nbH *= 2;
- return nbH;
- }
- typedef struct {
- uint64_t nbCollisions;
- } searchCollisions_results;
- typedef struct {
- uint64_t nbH;
- uint64_t mask;
- uint64_t maskSelector;
- size_t sampleSize;
- uint64_t prngSeed;
- int filterLog; /* <0 = disable filter; 0 = auto-size; */
- int hashID;
- int display;
- int nbThreads;
- searchCollisions_results* resultPtr;
- } searchCollisions_parameters;
- #define DISPLAY(...) { if (display) printf(__VA_ARGS__); }
- static int isEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype)
- {
- if ((htype == ht64) || (htype == ht32)) {
- uint64_t const h1 = ((const uint64_t*)hTablePtr)[index1];
- uint64_t const h2 = ((const uint64_t*)hTablePtr)[index2];
- return (h1 == h2);
- } else {
- assert(htype == ht128);
- XXH128_hash_t const h1 = ((const XXH128_hash_t*)hTablePtr)[index1];
- XXH128_hash_t const h2 = ((const XXH128_hash_t*)hTablePtr)[index2];
- return XXH128_isEqual(h1, h2);
- }
- }
- static int isHighEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype, int rShift)
- {
- uint64_t h1, h2;
- if ((htype == ht64) || (htype == ht32)) {
- h1 = ((const uint64_t*)hTablePtr)[index1];
- h2 = ((const uint64_t*)hTablePtr)[index2];
- } else {
- assert(htype == ht128);
- h1 = ((const XXH128_hash_t*)hTablePtr)[index1].high64;
- h2 = ((const XXH128_hash_t*)hTablePtr)[index2].high64;
- assert(rShift >= 64);
- rShift -= 64;
- }
- assert(0 <= rShift && rShift < 64);
- return (h1 >> rShift) == (h2 >> rShift);
- }
- /* assumption: (htype*)hTablePtr[index] is valid */
- static void addHashCandidate(void* hTablePtr, UniHash h, Htype_e htype, size_t index)
- {
- if ((htype == ht64) || (htype == ht32)) {
- ((uint64_t*)hTablePtr)[index] = h.h64;
- } else {
- assert(htype == ht128);
- ((XXH128_hash_t*)hTablePtr)[index] = h.h128;
- }
- }
- static int getNbBits_fromHtype(Htype_e htype) {
- switch(htype) {
- case ht32: return 32;
- case ht64: return 64;
- case ht128:return 128;
- default: EXIT("hash size not supported");
- }
- }
- static Htype_e getHtype_fromHbits(int nbBits) {
- switch(nbBits) {
- case 32 : return ht32;
- case 64 : return ht64;
- case 128: return ht128;
- default: EXIT("hash size not supported");
- }
- }
- static size_t search_collisions(
- searchCollisions_parameters param)
- {
- uint64_t totalH = param.nbH;
- const uint64_t hMask = param.mask;
- const uint64_t hSelector = param.maskSelector;
- int bflog = param.filterLog;
- const int filter = (param.filterLog >= 0);
- const size_t sampleSize = param.sampleSize;
- const int hashID = param.hashID;
- const Htype_e htype = getHtype_fromHbits(hashfnTable[hashID].bits);
- const int display = param.display;
- /* init */
- sampleFactory* const sf = create_sampleFactory(sampleSize, totalH, param.prngSeed);
- if (!sf) EXIT("not enough memory");
- //const char* const hname = hashfnTable[hashID].name;
- hashfn const hfunction = hashfnTable[hashID].fn;
- int const hwidth = hashfnTable[hashID].bits;
- if (totalH == 0) totalH = select_nbh(hwidth);
- if (bflog == 0) bflog = highestBitSet(totalH) + 1; /* auto-size filter */
- uint64_t const bfsize = (1ULL << bflog);
- /* === filter hashes (optional) === */
- Filter* bf = NULL;
- uint64_t nbPresents = totalH;
- if (filter) {
- time_t const filterTBegin = time(NULL);
- DISPLAY(" Creating filter (%i GB) \n", (int)(bfsize >> 30));
- bf = create_Filter(bflog);
- if (!bf) EXIT("not enough memory for filter");
- DISPLAY(" Generate %llu hashes from samples of %u bytes \n",
- (unsigned long long)totalH, (unsigned)sampleSize);
- nbPresents = 0;
- for (uint64_t n=0; n < totalH; n++) {
- if (display && ((n&0xFFFFF) == 1) )
- update_indicator(n, totalH);
- update_sampleFactory(sf);
- UniHash const h = hfunction(sf->buffer, sampleSize);
- if ((h.h64 & hMask) != hSelector) continue;
- nbPresents += (uint64_t)Filter_insert(bf, bflog, h.h64);
- }
- if (nbPresents==0) {
- DISPLAY(" Analysis completed: No collision detected \n");
- if (param.resultPtr) param.resultPtr->nbCollisions = 0;
- free_Filter(bf);
- free_sampleFactory(sf);
- return 0;
- }
- { double const filterDelay = difftime(time(NULL), filterTBegin);
- DISPLAY(" Generation and filter completed in %s, detected up to %llu candidates \n",
- displayDelay(filterDelay), (unsigned long long) nbPresents);
- } }
- /* === store hash candidates: duplicates will be present here === */
- time_t const storeTBegin = time(NULL);
- size_t const hashByteSize = (htype == ht128) ? 16 : 8;
- size_t const tableSize = (nbPresents+1) * hashByteSize;
- assert(tableSize > nbPresents); /* check tableSize calculation overflow */
- DISPLAY(" Storing hash candidates (%i MB) \n", (int)(tableSize >> 20));
- /* Generate and store hashes */
- void* const hashCandidates = malloc(tableSize);
- if (!hashCandidates) EXIT("not enough memory to store candidates");
- init_sampleFactory(sf, totalH);
- size_t nbCandidates = 0;
- for (uint64_t n=0; n < totalH; n++) {
- if (display && ((n&0xFFFFF) == 1) ) update_indicator(n, totalH);
- update_sampleFactory(sf);
- UniHash const h = hfunction(sf->buffer, sampleSize);
- if ((h.h64 & hMask) != hSelector) continue;
- if (filter) {
- if (Filter_check(bf, bflog, h.h64)) {
- assert(nbCandidates < nbPresents);
- addHashCandidate(hashCandidates, h, htype, nbCandidates++);
- }
- } else {
- assert(nbCandidates < nbPresents);
- addHashCandidate(hashCandidates, h, htype, nbCandidates++);
- }
- }
- if (nbCandidates < nbPresents) {
- /* Try to mitigate gnuc_quicksort behavior, by reducing allocated memory,
- * since gnuc_quicksort uses a lot of additional memory for mergesort */
- void* const checkPtr = realloc(hashCandidates, nbCandidates * hashByteSize);
- assert(checkPtr != NULL);
- assert(checkPtr == hashCandidates); /* simplification: since we are reducing the size,
- * we hope to keep the same ptr position.
- * Otherwise, hashCandidates must be mutable. */
- DISPLAY(" List of hashes reduced to %u MB from %u MB (saved %u MB) \n",
- (unsigned)((nbCandidates * hashByteSize) >> 20),
- (unsigned)(tableSize >> 20),
- (unsigned)((tableSize - (nbCandidates * hashByteSize)) >> 20) );
- }
- double const storeTDelay = difftime(time(NULL), storeTBegin);
- DISPLAY(" Stored %llu hash candidates in %s \n",
- (unsigned long long) nbCandidates, displayDelay(storeTDelay));
- free_Filter(bf);
- free_sampleFactory(sf);
- /* === step 3: look for duplicates === */
- time_t const sortTBegin = time(NULL);
- DISPLAY(" Sorting candidates... ");
- fflush(NULL);
- if ((htype == ht64) || (htype == ht32)) {
- /*
- * Use C++'s std::sort, as it's faster than C stdlib's qsort, and
- * doesn't suffer from gnuc_libsort's memory expansion
- */
- sort64(hashCandidates, nbCandidates);
- } else {
- assert(htype == ht128);
- sort128(hashCandidates, nbCandidates); /* sort with custom comparator */
- }
- double const sortTDelay = difftime(time(NULL), sortTBegin);
- DISPLAY(" Completed in %s \n", displayDelay(sortTDelay));
- /* scan and count duplicates */
- time_t const countBegin = time(NULL);
- DISPLAY(" Looking for duplicates: ");
- fflush(NULL);
- size_t collisions = 0;
- for (size_t n=1; n<nbCandidates; n++) {
- if (isEqual(hashCandidates, n, n-1, htype)) {
- #if defined(COL_DISPLAY_DUPLICATES)
- printf("collision: ");
- printHash(hashCandidates, n, htype);
- printf(" / ");
- printHash(hashCandidates, n-1, htype);
- printf(" \n");
- #endif
- collisions++;
- } }
- if (!filter /* all candidates */ && display /*single thead*/ ) {
- /* check partial bitfields (high bits) */
- DISPLAY(" \n");
- int const hashBits = getNbBits_fromHtype(htype);
- double worstRatio = 0.;
- int worstNbHBits = 0;
- for (int nbHBits = 1; nbHBits < hashBits; nbHBits++) {
- uint64_t const nbSlots = (uint64_t)1 << nbHBits;
- double const expectedCollisions = estimateNbCollisions(nbCandidates, nbHBits);
- if ( (nbSlots > nbCandidates * 100) /* within range for meaningfull collision analysis results */
- && (expectedCollisions > 18.0) ) {
- int const rShift = hashBits - nbHBits;
- size_t HBits_collisions = 0;
- for (size_t n=1; n<nbCandidates; n++) {
- if (isHighEqual(hashCandidates, n, n-1, htype, rShift)) {
- HBits_collisions++;
- } }
- double const collisionRatio = (double)HBits_collisions / expectedCollisions;
- if (collisionRatio > 2.0) DISPLAY("WARNING !!! ===> ");
- DISPLAY(" high %i bits: %zu collision (%.1f expected): x%.2f \n",
- nbHBits, HBits_collisions, expectedCollisions, collisionRatio);
- if (collisionRatio > worstRatio) {
- worstNbHBits = nbHBits;
- worstRatio = collisionRatio;
- } } }
- DISPLAY("Worst collision ratio at %i high bits: x%.2f \n",
- worstNbHBits, worstRatio);
- }
- double const countDelay = difftime(time(NULL), countBegin);
- DISPLAY(" Completed in %s \n", displayDelay(countDelay));
- /* clean and exit */
- free (hashCandidates);
- #if 0 /* debug */
- for (size_t n=0; n<nbCandidates; n++)
- printf("0x%016llx \n", (unsigned long long)hashCandidates[n]);
- #endif
- if (param.resultPtr) param.resultPtr->nbCollisions = collisions;
- return collisions;
- }
- #if defined(__MACH__) || defined(__linux__)
- #include <sys/resource.h>
- static size_t getProcessMemUsage(int children)
- {
- struct rusage stats;
- if (getrusage(children ? RUSAGE_CHILDREN : RUSAGE_SELF, &stats) == 0)
- return (size_t)stats.ru_maxrss;
- return 0;
- }
- #else
- static size_t getProcessMemUsage(int ignore) { return 0; }
- #endif
- void time_collisions(searchCollisions_parameters param)
- {
- uint64_t totalH = param.nbH;
- int hashID = param.hashID;
- int display = param.display;
- /* init */
- assert(0 <= hashID && hashID < HASH_FN_TOTAL);
- //const char* const hname = hashfnTable[hashID].name;
- int const hwidth = hashfnTable[hashID].bits;
- if (totalH == 0) totalH = select_nbh(hwidth);
- double const targetColls = estimateNbCollisions(totalH, hwidth);
- /* Start the timer to measure start/end of hashing + collision detection. */
- time_t const programTBegin = time(NULL);
- /* Generate hashes, and count collisions */
- size_t const collisions = search_collisions(param);
- /* display results */
- double const programTDelay = difftime(time(NULL), programTBegin);
- size_t const programBytesSelf = getProcessMemUsage(0);
- size_t const programBytesChildren = getProcessMemUsage(1);
- DISPLAY("\n\n");
- DISPLAY("===> Found %llu collisions (x%.2f, %.1f expected) in %s\n",
- (unsigned long long)collisions,
- (double)collisions / targetColls,
- targetColls,
- displayDelay(programTDelay));
- if (programBytesSelf)
- DISPLAY("===> MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
- programBytesSelf>>20,
- programBytesChildren>>20);
- DISPLAY("------------------------------------------ \n");
- }
- // wrapper for pthread interface
- void MT_searchCollisions(void* payload)
- {
- search_collisions(*(searchCollisions_parameters*)payload);
- }
- /* === Command Line === */
- /*!
- * readU64FromChar():
- * Allows and interprets K, KB, KiB, M, MB and MiB suffix.
- * Will also modify `*stringPtr`, advancing it to the position where it stopped reading.
- */
- static uint64_t readU64FromChar(const char** stringPtr)
- {
- static uint64_t const max = (((uint64_t)(-1)) / 10) - 1;
- uint64_t result = 0;
- while ((**stringPtr >='0') && (**stringPtr <='9')) {
- assert(result < max);
- result *= 10;
- result += (unsigned)(**stringPtr - '0');
- (*stringPtr)++ ;
- }
- if ((**stringPtr=='K') || (**stringPtr=='M') || (**stringPtr=='G')) {
- uint64_t const maxK = ((uint64_t)(-1)) >> 10;
- assert(result < maxK);
- result <<= 10;
- if ((**stringPtr=='M') || (**stringPtr=='G')) {
- assert(result < maxK);
- result <<= 10;
- if (**stringPtr=='G') {
- assert(result < maxK);
- result <<= 10;
- }
- }
- (*stringPtr)++; /* skip `K` or `M` */
- if (**stringPtr=='i') (*stringPtr)++;
- if (**stringPtr=='B') (*stringPtr)++;
- }
- return result;
- }
- /**
- * longCommandWArg():
- * Checks if *stringPtr is the same as longCommand.
- * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- */
- static int longCommandWArg(const char** stringPtr, const char* longCommand)
- {
- assert(longCommand); assert(stringPtr); assert(*stringPtr);
- size_t const comSize = strlen(longCommand);
- int const result = !strncmp(*stringPtr, longCommand, comSize);
- if (result) *stringPtr += comSize;
- return result;
- }
- #include "pool.h"
- /*
- * As some hashes use different algorithms depending on input size,
- * it can be necessary to test multiple input sizes
- * to paint an accurate picture of collision performance
- */
- #define SAMPLE_SIZE_DEFAULT 256
- #define HASHFN_ID_DEFAULT 0
- void help(const char* exeName)
- {
- printf("usage: %s [hashName] [opt] \n\n", exeName);
- printf("list of hashNames:");
- printf("%s ", hashfnTable[0].name);
- for (int i=1; i < HASH_FN_TOTAL; i++) {
- printf(", %s ", hashfnTable[i].name);
- }
- printf(" \n");
- printf("Default hashName is %s\n", hashfnTable[HASHFN_ID_DEFAULT].name);
- printf(" \n");
- printf("Optional parameters: \n");
- printf(" --nbh=NB Select nb of hashes to generate (%llu by default) \n", (unsigned long long)select_nbh(64));
- printf(" --filter Activates the filter. Slower, but reduces memory usage for the same nb of hashes.\n");
- printf(" --threadlog=NB Use 2^NB threads.\n");
- printf(" --len=MB Set length of the input (%i bytes by default) \n", SAMPLE_SIZE_DEFAULT);
- }
- int bad_argument(const char* exeName)
- {
- printf("incorrect command: \n");
- help(exeName);
- return 1;
- }
- int main(int argc, const char** argv)
- {
- if (sizeof(size_t) < 8) return 1; // cannot work on systems without ability to allocate objects >= 4 GB
- assert(argc > 0);
- const char* const exeName = argv[0];
- uint64_t totalH = 0; /* auto, based on nbBits */
- int bflog = 0; /* auto */
- int filter = 0; /* disabled */
- size_t sampleSize = SAMPLE_SIZE_DEFAULT;
- int hashID = HASHFN_ID_DEFAULT;
- int threadlog = 0;
- uint64_t prngSeed = 0;
- int arg_nb;
- for (arg_nb = 1; arg_nb < argc; arg_nb++) {
- const char** arg = argv + arg_nb;
- if (!strcmp(*arg, "-h")) { help(exeName); return 0; }
- if (longCommandWArg(arg, "-T")) { threadlog = (int)readU64FromChar(arg); continue; }
- if (!strcmp(*arg, "--filter")) { filter=1; continue; }
- if (!strcmp(*arg, "--no-filter")) { filter=0; continue; }
- if (longCommandWArg(arg, "--seed")) { prngSeed = readU64FromChar(arg); continue; }
- if (longCommandWArg(arg, "--nbh=")) { totalH = readU64FromChar(arg); continue; }
- if (longCommandWArg(arg, "--filter=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
- if (longCommandWArg(arg, "--filterlog=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
- if (longCommandWArg(arg, "--size=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
- if (longCommandWArg(arg, "--len=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
- if (longCommandWArg(arg, "--threadlog=")) { threadlog = (int)readU64FromChar(arg); continue; }
- /* argument understood as hash name (must be correct) */
- int hnb;
- for (hnb=0; hnb < HASH_FN_TOTAL; hnb++) {
- if (!strcmp(*arg, hashfnTable[hnb].name)) { hashID = hnb; break; }
- }
- if (hnb == HASH_FN_TOTAL) return bad_argument(exeName);
- }
- /* init */
- const char* const hname = hashfnTable[hashID].name;
- int const hwidth = hashfnTable[hashID].bits;
- if (totalH == 0) totalH = select_nbh(hwidth);
- double const targetColls = estimateNbCollisions(totalH, hwidth);
- if (bflog == 0) bflog = highestBitSet(totalH) + 1; /* auto-size filter */
- if (!filter) bflog = -1; // disable filter
- if (sizeof(size_t) < 8)
- EXIT("This program has not been validated on architectures other than "
- "64bit \n");
- printf(" *** Collision tester for 64+ bit hashes *** \n\n");
- printf("Testing %s algorithm (%i-bit) \n", hname, hwidth);
- printf("This program will allocate a lot of memory,\n");
- printf("generate %llu %i-bit hashes from samples of %u bytes, \n",
- (unsigned long long)totalH, hwidth, (unsigned)sampleSize);
- printf("and attempt to produce %.0f collisions. \n\n", targetColls);
- int const nbThreads = 1 << threadlog;
- if (nbThreads <= 0) EXIT("Invalid --threadlog value.");
- if (nbThreads == 1) {
- searchCollisions_parameters params;
- params.nbH = totalH;
- params.mask = 0;
- params.maskSelector = 0;
- params.sampleSize = sampleSize;
- params.filterLog = bflog;
- params.hashID = hashID;
- params.display = 1;
- params.resultPtr = NULL;
- params.prngSeed = prngSeed;
- params.nbThreads = 1;
- time_collisions(params);
- } else { /* nbThreads > 1 */
- /* use multithreading */
- if (threadlog >= 30) EXIT("too many threads requested");
- if ((uint64_t)nbThreads > (totalH >> 16))
- EXIT("too many threads requested");
- if (bflog > 0 && threadlog > (bflog-10))
- EXIT("too many threads requested");
- printf("using %i threads ... \n", nbThreads);
- /* allocation */
- time_t const programTBegin = time(NULL);
- POOL_ctx* const pt = POOL_create((size_t)nbThreads, 1);
- if (!pt) EXIT("not enough memory for threads");
- searchCollisions_results* const MTresults = calloc (sizeof(searchCollisions_results), (size_t)nbThreads);
- if (!MTresults) EXIT("not enough memory");
- searchCollisions_parameters* const MTparams = calloc (sizeof(searchCollisions_parameters), (size_t)nbThreads);
- if (!MTparams) EXIT("not enough memory");
- /* distribute jobs */
- for (int tnb=0; tnb<nbThreads; tnb++) {
- MTparams[tnb].nbH = totalH;
- MTparams[tnb].mask = (uint64_t)nbThreads - 1;
- MTparams[tnb].sampleSize = sampleSize;
- MTparams[tnb].filterLog = bflog ? bflog - threadlog : 0;
- MTparams[tnb].hashID = hashID;
- MTparams[tnb].display = 0;
- MTparams[tnb].resultPtr = MTresults+tnb;
- MTparams[tnb].prngSeed = prngSeed;
- MTparams[tnb].maskSelector = (uint64_t)tnb;
- POOL_add(pt, MT_searchCollisions, MTparams + tnb);
- }
- POOL_free(pt); /* actually joins and free */
- /* Gather results */
- uint64_t nbCollisions=0;
- for (int tnb=0; tnb<nbThreads; tnb++) {
- nbCollisions += MTresults[tnb].nbCollisions;
- }
- double const programTDelay = difftime(time(NULL), programTBegin);
- size_t const programBytesSelf = getProcessMemUsage(0);
- size_t const programBytesChildren = getProcessMemUsage(1);
- printf("\n\n");
- printf("===> Found %llu collisions (x%.2f, %.1f expected) in %s\n",
- (unsigned long long)nbCollisions,
- (double)nbCollisions / targetColls,
- targetColls,
- displayDelay(programTDelay));
- if (programBytesSelf)
- printf("===> MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
- programBytesSelf>>20,
- programBytesChildren>>20);
- printf("------------------------------------------ \n");
- /* Clean up */
- free(MTparams);
- free(MTresults);
- }
- return 0;
- }
|