commit 39f907c919176a8fed990e10e95a7f7371aedf40
parent 90f2ff9fcf896060da4be75c70cd148f18be28e9
Author: z3bra <contactatz3bradotorg>
Date: Sun, 17 Feb 2019 13:04:03 +0100
Provide better comment for chunking decisions
Diffstat:
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/dedup.c b/dedup.c
@@ -74,6 +74,11 @@ char *argv0;
/*
* Static table for use in buzhash algorithm.
* 256 * 32 bits randomly generated unique integers
+ *
+ * To get better pseudo-random results, there is exactly the same number
+ * of 0 and 1 spread amongst these integers. It means that there is
+ * exactly 50% chance that a XOR operation would flip all the bits in
+ * the hash.
*/
uint32_t buz[] = {
0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f,
@@ -115,9 +120,9 @@ uint32_t
buzh_init(uint8_t *buf, size_t size)
{
size_t i;
- uint32_t fp = 0;
+ uint32_t fp;
- for (i = size - 1; i > 0; i--, buf++)
+ for (i = size - 1, fp = 0; i > 0; i--, buf++)
fp ^= ROTL(buz[*buf], i % 32);
return fp ^ buz[*buf];
@@ -136,11 +141,13 @@ chunk_blk(uint8_t *buf, size_t size)
uint32_t fp;
/*
- * Chunking blocks is decided using a rolling hash + binary pattern.
- * The buzhash algorithm is used to "fingerprint" a fixed size window.
- * Once the lower bits of this fingerprint are all zeros,
- * the block is chunked.
- * If the pattern can't be matched, then we return the buffer size.
+ * To achieve better deduplication, we chunk blocks based on a
+ * recurring pattern occuring on the data stream. A fixed window
+ * of WINSIZ bytes is slid over the data, and a rolling hash is
+ * computed for this window.
+ * When the rolling hash matches a given pattern (see HASHMSK),
+ * the block is chunked at the end of that window, thus making
+ * WINSIZ the smallest possible block size.
*/
fp = buzh_init(buf, WINSIZ);
for (i = 1; i < size - WINSIZ; i++) {