dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 39f907c919176a8fed990e10e95a7f7371aedf40
parent 90f2ff9fcf896060da4be75c70cd148f18be28e9
Author: z3bra <contactatz3bradotorg>
Date:   Sun, 17 Feb 2019 13:04:03 +0100

Provide better comment for chunking decisions

Diffstat:
Mdedup.c | 21++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/dedup.c b/dedup.c @@ -74,6 +74,11 @@ char *argv0; /* * Static table for use in buzhash algorithm. * 256 * 32 bits randomly generated unique integers + * + * To get better pseudo-random results, there is exactly the same number + * of 0 and 1 spread amongst these integers. It means that there is + * exactly 50% chance that a XOR operation would flip all the bits in + * the hash. */ uint32_t buz[] = { 0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f, @@ -115,9 +120,9 @@ uint32_t buzh_init(uint8_t *buf, size_t size) { size_t i; - uint32_t fp = 0; + uint32_t fp; - for (i = size - 1; i > 0; i--, buf++) + for (i = size - 1, fp = 0; i > 0; i--, buf++) fp ^= ROTL(buz[*buf], i % 32); return fp ^ buz[*buf]; @@ -136,11 +141,13 @@ chunk_blk(uint8_t *buf, size_t size) uint32_t fp; /* - * Chunking blocks is decided using a rolling hash + binary pattern. - * The buzhash algorithm is used to "fingerprint" a fixed size window. - * Once the lower bits of this fingerprint are all zeros, - * the block is chunked. - * If the pattern can't be matched, then we return the buffer size. + * To achieve better deduplication, we chunk blocks based on a + * recurring pattern occuring on the data stream. A fixed window + * of WINSIZ bytes is slid over the data, and a rolling hash is + * computed for this window. + * When the rolling hash matches a given pattern (see HASHMSK), + * the block is chunked at the end of that window, thus making + * WINSIZ the smallest possible block size. */ fp = buzh_init(buf, WINSIZ); for (i = 1; i < size - WINSIZ; i++) {