dedup

deduplicating backup program
git clone git://git.2f30.org/dedup
Log | Files | Refs | README | LICENSE

commit 9bee83eb425e6424e1cb6c9e5fbe6dced9d005ce
parent c0597760335a6dfcfeb37a023f6362567079ba7f
Author: sin <sin@2f30.org>
Date:   Sat, 16 Feb 2019 23:49:36 +0000

Hook in rolling hash

Diffstat:
M dedup.c | 69 +++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/dedup.c b/dedup.c
@@ -16,8 +16,8 @@
 #define STOREF ".store"
 #define CACHEF ".cache"
 
-#define BLKSIZ 65536
-#define WINSIZ 4095
+#define BLKSIZ (65536*4)
+#define WINSIZ 127
 #define MDSIZ SHA256_DIGEST_LENGTH
 
 #define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
@@ -140,7 +140,7 @@ chunk_blk(uint8_t *buf, size_t size)
 	 */
 	fp = buzh_init(buf, WINSIZ);
 	for (i = 1; i < size - WINSIZ; i++) {
-		fp = buzh_update(fp, buf[i - 1], buf[i + WINSIZ], WINSIZ);
+		fp = buzh_update(fp, buf[i - 1], buf[i + WINSIZ - 1], WINSIZ);
 		if ((fp & 0x00001fff) == 0)
 			return i + WINSIZ;
 	}
@@ -366,10 +366,9 @@ lookup_blk(uint8_t *md, struct bdescr *bdescr)
 void
 dedup(int fd)
 {
-	uint8_t md[MDSIZ];
 	uint8_t *buf;
-	SHA256_CTX ctx;
 	struct ent *ent;
+	SHA256_CTX ctx;
 	ssize_t n;
 
 	buf = alloc_buf(BLKSIZ);
@@ -377,36 +376,50 @@ dedup(int fd)
 	SHA256_Init(&ctx);
 
 	while ((n = xread(fd, buf, BLKSIZ)) > 0) {
-		struct bdescr bdescr;
+		uint8_t *bp = buf;
 
-		hash_blk(buf, n, md);
+		while (n > 0) {
+			uint8_t md[MDSIZ];
+			struct bdescr bdescr;
+			size_t blksiz;
 
-		/* Calculate file hash one block at a time */
-		SHA256_Update(&ctx, buf, n);
+			if (n > WINSIZ)
+				blksiz = chunk_blk(bp, n);
+			else
+				blksiz = n;
 
-		ent = grow_ent(ent, ent->nblks + 1);
-		if (lookup_blk(md, &bdescr) < 0) {
-			struct bdescr bdescr;
-			struct cent *cent;
+			hash_blk(bp, blksiz, md);
 
-			/* Block not found, create new block descriptor */
-			memcpy(bdescr.md, md, sizeof(bdescr));
-			bdescr.offset = store_size();
-			bdescr.size = n;
+			/* Calculate file hash one block at a time */
+			SHA256_Update(&ctx, bp, blksiz);
 
-			/* Update index entry */
-			ent->bdescr[ent->nblks++] = bdescr;
+			ent = grow_ent(ent, ent->nblks + 1);
+			if (lookup_blk(md, &bdescr) < 0) {
+				struct bdescr bdescr;
+				struct cent *cent;
 
-			/* Store block */
-			append_blk(buf, n);
+				/* Block not found, create new block descriptor */
+				memcpy(bdescr.md, md, sizeof(bdescr));
+				bdescr.offset = store_size();
+				bdescr.size = blksiz;
 
-			/* Create a cache entry for this block */
-			cent = alloc_cent();
-			cent->bdescr = bdescr;
-			add_cent(cent);
-		} else {
-			/* Found block with the same hash, update index entry */
-			ent->bdescr[ent->nblks++] = bdescr;
+				/* Update index entry */
+				ent->bdescr[ent->nblks++] = bdescr;
+
+				/* Store block */
+				append_blk(bp, blksiz);
+
+				/* Create a cache entry for this block */
+				cent = alloc_cent();
+				cent->bdescr = bdescr;
+				add_cent(cent);
+			} else {
+				/* Found block with the same hash, update index entry */
+				ent->bdescr[ent->nblks++] = bdescr;
+			}
+
+			bp += blksiz;
+			n -= blksiz;
 		}
 	}
 