Provide better comment for chunking decisions - dedup

commit 39f907c919176a8fed990e10e95a7f7371aedf40
parent 90f2ff9fcf896060da4be75c70cd148f18be28e9
Author: z3bra <contactatz3bradotorg>
Date:   Sun, 17 Feb 2019 13:04:03 +0100

Provide better comment for chunking decisions

Diffstat:
M dedup.c  | 21 ++++++++++++++-------

1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/dedup.c b/dedup.c
@@ -74,6 +74,11 @@ char *argv0;
 /*
  * Static table for use in buzhash algorithm.
  * 256 * 32 bits randomly generated unique integers
+ *
+ * To get better pseudo-random results, there is exactly the same number
+ * of 0 and 1 spread amongst these integers. It means that there is
+ * exactly 50% chance that a XOR operation would flip all the bits in
+ * the hash.
  */
 uint32_t buz[] = {
 	0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f,
@@ -115,9 +120,9 @@ uint32_t
 buzh_init(uint8_t *buf, size_t size)
 {
 	size_t i;
-	uint32_t fp = 0;
+	uint32_t fp;
 
-	for (i = size - 1; i > 0; i--, buf++)
+	for (i = size - 1, fp = 0; i > 0; i--, buf++)
 		fp ^= ROTL(buz[*buf], i % 32);
 
 	return fp ^ buz[*buf];
@@ -136,11 +141,13 @@ chunk_blk(uint8_t *buf, size_t size)
 	uint32_t fp;
 
 	/*
-	 * Chunking blocks is decided using a rolling hash + binary pattern.
-	 * The buzhash algorithm is used to "fingerprint" a fixed size window.
-	 * Once the lower bits of this fingerprint are all zeros,
-	 * the block is chunked.
-	 * If the pattern can't be matched, then we return the buffer size.
+	 * To achieve better deduplication, we chunk blocks based on a
+	 * recurring pattern occuring on the data stream. A fixed window
+	 * of WINSIZ bytes is slid over the data, and a rolling hash is
+	 * computed for this window.
+	 * When the rolling hash matches a given pattern (see HASHMSK),
+	 * the block is chunked at the end of that window, thus making
+	 * WINSIZ the smallest possible block size.
 	 */
 	fp = buzh_init(buf, WINSIZ);
 	for (i = 1; i < size - WINSIZ; i++) {

	dedup deduplicating backup program
	git clone git://git.2f30.org/dedup
	Log \| Files \| Refs \| README \| LICENSE