dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 23e23d22beed84fe844c6d76f453667d9a6f95c6
parent e8031b23797e666b43f96906372d97e2da4f4d0a
Author: sin <sin@2f30.org>
Date:   Fri, 22 Feb 2019 13:15:25 +0000

Tweak params

The parameters were taken from casync.

Diffstat:
M chunker.c | 35 +++++++++++++++++++++++++++--------
M config.h | 5 +++--
M dedup.c | 18 +++++++++---------
3 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/chunker.c b/chunker.c
@@ -16,15 +16,33 @@ struct chunker {
 };
 
 static size_t
+calc_discr(size_t avg)
+{
+	return avg / (-1.42888852e-7 * avg + 1.33237515);
+}
+
+static int
+match_pattern(size_t chunk_size, uint32_t fp)
+{
+	size_t discr = calc_discr(BLKSIZE_AVG);
+
+	if (chunk_size >= BLKSIZE_MAX)
+		return 1;
+	if (chunk_size < BLKSIZE_MIN)
+		return 0;
+	return (fp % discr) == discr - 1;
+}
+
+static size_t
 get_chunk_size(struct chunker *chunker)
 {
 	uint8_t *bp;
 	uint32_t fp;
-	size_t i;
+	size_t i, chunk_size;
 
-	/* buzhash should be at least WINSIZE */
-	if (chunker->wpos - chunker->rpos < WINSIZE)
-		return chunker->wpos - chunker->rpos;
+	chunk_size = chunker->wpos - chunker->rpos;
+	if (chunk_size < WINSIZE)
+		return chunk_size;
 
 	bp = chunker->buf;
 
@@ -39,13 +57,14 @@ get_chunk_size(struct chunker *chunker)
 	 */
 	fp = buzh_init(bp, WINSIZE);
 	for (i = chunker->rpos; i < chunker->wpos - WINSIZE; i++) {
+		chunk_size = i + WINSIZE;
 		if (i > 0)
-			fp = buzh_update(fp, bp[i - 1], bp[WINSIZE + i - 1],
+			fp = buzh_update(fp, bp[i - 1], bp[chunk_size - 1],
 			                 WINSIZE);
-		if ((fp & HASHMSK) == 0)
-			return i + WINSIZE;
+		if (match_pattern(chunk_size, fp) == 1)
+			return chunk_size;
 	}
-	return chunker->wpos - chunker->rpos;
+	return chunk_size;
 }
 
 struct chunker *
diff --git a/config.h b/config.h
@@ -1,3 +1,4 @@
-#define BLKSIZE 131072
+#define BLKSIZE_AVG ((size_t)131072)
+#define BLKSIZE_MIN ((BLKSIZE_AVG) / 4)
+#define BLKSIZE_MAX ((BLKSIZE_AVG) * 4)
 #define WINSIZE 32
-#define HASHMSK ((1ul << 15) - 1)
diff --git a/dedup.c b/dedup.c
@@ -325,9 +325,9 @@ dedup_chunk(struct snapshot *snap, uint8_t *chunkp, size_t chunk_size)
 	struct blk_desc blk_desc;
 	size_t n;
 
-	comp_buf = alloc_buf(comp_size(BLKSIZE));
+	comp_buf = alloc_buf(comp_size(BLKSIZE_MAX));
 
-	n = comp(chunkp, comp_buf, chunk_size, comp_size(BLKSIZE));
+	n = comp(chunkp, comp_buf, chunk_size, comp_size(BLKSIZE_MAX));
 	hash_blk(comp_buf, n, md);
 
 	snaphdr.st.orig_size += chunk_size;
@@ -374,7 +374,7 @@ dedup(int fd, char *msg)
 	ssize_t n;
 
 	snap = alloc_snap();
-	chunker = alloc_chunker(BLKSIZE, fd);
+	chunker = alloc_chunker(BLKSIZE_MAX, fd);
 
 	SHA256_Init(&ctx);
 	while ((n = fill_chunker(chunker)) > 0) {
@@ -417,14 +417,14 @@ extract(struct snapshot *snap, void *arg)
 	if (memcmp(snap->md, args->md, sizeof(snap->md)) != 0)
 		return WALK_CONTINUE;
 
-	buf[0] = alloc_buf(BLKSIZE);
-	buf[1] = alloc_buf(comp_size(BLKSIZE));
+	buf[0] = alloc_buf(BLKSIZE_MAX);
+	buf[1] = alloc_buf(comp_size(BLKSIZE_MAX));
 	for (i = 0; i < snap->nr_blk_descs; i++) {
 		size_t blksize;
 
 		read_blk(buf[1], &snap->blk_desc[i]);
 		blksize = decomp(buf[1], buf[0], snap->blk_desc[i].size,
-		                 BLKSIZE);
+		                 BLKSIZE_MAX);
 		xwrite(args->fd, buf[0], blksize);
 	}
 	free_buf(buf[1]);
@@ -440,7 +440,7 @@ check(struct snapshot *snap, void *arg)
 	SHA256_CTX ctx;
 	uint64_t i;
 
-	buf = alloc_buf(comp_size(BLKSIZE));
+	buf = alloc_buf(comp_size(BLKSIZE_MAX));
 	/*
 	 * Calculate hash for each block and compare
 	 * against snapshot entry block descriptor
@@ -491,7 +491,7 @@ rebuild_cache(struct snapshot *snap, void *arg)
 	SHA256_CTX ctx;
 	uint64_t i;
 
-	buf = alloc_buf(comp_size(BLKSIZE));
+	buf = alloc_buf(comp_size(BLKSIZE_MAX));
 	for (i = 0; i < snap->nr_blk_descs; i++) {
 		struct cache_entry *ent;
 
@@ -590,7 +590,7 @@ init(void)
 		            VER_MAJ, VER_MIN, maj, min);
 	} else {
 		snaphdr.flags = (VER_MAJ << 8) | VER_MIN;
-		snaphdr.st.min_blk_size = comp_size(BLKSIZE);
+		snaphdr.st.min_blk_size = comp_size(BLKSIZE_MAX);
 		xwrite(ifd, &snaphdr, sizeof(snaphdr));
 	}
 