dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit d60ace395a74a5efe067ee9cd5d85446c7facf43
parent d8bfc3a69ce4c3c35dfa8c0d5cef3ce10e424300
Author: sin <sin@2f30.org>
Date:   Tue, 26 Feb 2019 09:48:57 +0000

When matching a pattern, check whether the bottom bits of the hash are 0

This approach is more efficient and easier to understand than the previous
modulo test against a computed discriminator.

Diffstat:
M chunker.c | 10 +---------
M config.h  |  1 +
2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/chunker.c b/chunker.c @@ -14,7 +14,6 @@ struct chunker { size_t cap; size_t rpos; size_t wpos; - size_t discr; int fd; }; @@ -88,7 +87,7 @@ match_pattern(struct chunker *chunker, size_t chunk_size, uint32_t fp) return 1; if (chunk_size < BLKSIZE_MIN) return 0; - return (fp % chunker->discr) == chunker->discr - 1; + return (fp & HASHMASK_BITS) == 0; } static size_t @@ -123,12 +122,6 @@ get_chunk_size(struct chunker *chunker) return chunk_size; } -static size_t -calc_discr(size_t avg) -{ - return avg / (-1.42888852e-7 * avg + 1.33237515); -} - struct chunker * alloc_chunker(size_t cap, int fd) { @@ -145,7 +138,6 @@ alloc_chunker(size_t cap, int fd) chunker->rpos = 0; chunker->wpos = 0; chunker->fd = fd; - chunker->discr = calc_discr(BLKSIZE_AVG); return chunker; } diff --git a/config.h b/config.h @@ -1,4 +1,5 @@ #define BLKSIZE_AVG ((size_t)524288) #define BLKSIZE_MIN ((BLKSIZE_AVG) / 4) #define BLKSIZE_MAX ((BLKSIZE_AVG) * 4) +#define HASHMASK_BITS (BLKSIZE_AVG - 1) #define WINSIZE 32