dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 3ce9db0f144f9d5a39a9180ed343d9498d064778
parent 4ff76106cb1e2c6f99dfd163744224fc57d5fa65
Author: sin <sin@2f30.org>
Date:   Sun, 17 Feb 2019 21:28:00 +0000

Add primitive chunk compression support

Diffstat:
MMakefile | 2+-
Mdedup.c | 103++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
2 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile @@ -8,7 +8,7 @@ DISTFILES = $(SRC) LICENSE Makefile README arg.h dedup.1 tree.h CFLAGS = -g -Wall CPPFLAGS = -I/usr/local/include -D_FILE_OFFSET_BITS=64 -LDLIBS = -lcrypto +LDLIBS = -lcrypto -llz4 all: $(BIN) diff --git a/dedup.c b/dedup.c @@ -7,6 +7,7 @@ #include <string.h> #include <unistd.h> +#include <lz4.h> #include <openssl/sha.h> #include "arg.h" @@ -16,11 +17,11 @@ #define STOREF ".store" #define CACHEF ".cache" -#define BLKSIZ (1024 * 512) -#define WINSIZ 1024 +#define BLKSIZE (1 * 1024 * 1024) +#define WINSIZE 4095 #define HASHMSK ((1ul << 21) - 1) -#define MSGSIZ 256 -#define MDSIZ SHA256_DIGEST_LENGTH +#define MSGSIZE 256 +#define MDSIZE SHA256_DIGEST_LENGTH #define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y)))) @@ -38,7 +39,7 @@ struct enthdr { /* block descriptor */ struct bdescr { - uint8_t md[MDSIZ]; + uint8_t md[MDSIZE]; uint64_t offset; uint64_t size; }; @@ -46,8 +47,8 @@ struct bdescr { /* index file entry */ struct ent { uint64_t size; - uint8_t msg[MSGSIZ]; - uint8_t md[MDSIZ]; /* hash of file */ + uint8_t msg[MSGSIZE]; + uint8_t md[MDSIZE]; /* hash of file */ uint64_t nblks; struct bdescr bdescr[]; }; @@ -140,29 +141,47 @@ chunk_blk(uint8_t *buf, size_t size) size_t i; uint32_t fp; - /* buzhash should be at least WINSIZ */ - if (size < WINSIZ) + /* buzhash should be at least WINSIZE */ + if (size < WINSIZE) return size; /* * To achieve better deduplication, we chunk blocks based on a * recurring pattern occuring on the data stream. A fixed window - * of WINSIZ bytes is slid over the data, and a rolling hash is + * of WINSIZE bytes is slid over the data, and a rolling hash is * computed for this window. * When the rolling hash matches a given pattern (see HASHMSK), * the block is chunked at the end of that window, thus making - * WINSIZ the smallest possible block size. + * WINSIZE the smallest possible block size. */ - fp = buzh_init(buf, WINSIZ); - for (i = 0; i < size - WINSIZ; i++) { + fp = buzh_init(buf, WINSIZE); + for (i = 0; i < size - WINSIZE; i++) { if (i > 0) - fp = buzh_update(fp, buf[i - 1], buf[WINSIZ + i - 1], WINSIZ); + fp = buzh_update(fp, buf[i - 1], buf[WINSIZE + i - 1], WINSIZE); if ((fp & HASHMSK) == 0) - return i + WINSIZ; + return i + WINSIZE; } return size; } +size_t +comp_size(size_t size) +{ + return LZ4_compressBound(size); +} + +size_t +comp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize) +{ + return LZ4_compress_default((char *)in, (char *)out, insize, outsize); +} + +size_t +decomp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize) +{ + return LZ4_decompress_safe((char *)in, (char *)out, insize, outsize); +} + void print_md(const uint8_t *md, size_t size) { @@ -375,33 +394,35 @@ lookup_blk(uint8_t *md) void dedup(int fd, char *msg) { - uint8_t *buf; + uint8_t *buf, *cbuf; struct ent *ent; SHA256_CTX ctx; ssize_t n; - buf = alloc_buf(BLKSIZ); + buf = alloc_buf(BLKSIZE); + cbuf = alloc_buf(comp_size(BLKSIZE)); ent = alloc_ent(); SHA256_Init(&ctx); - while ((n = xread(fd, buf, BLKSIZ)) > 0) { + while ((n = xread(fd, buf, BLKSIZE)) > 0) { uint8_t *bp = buf; while (n > 0) { - uint8_t md[MDSIZ]; + uint8_t md[MDSIZE]; struct bdescr bdescr; - size_t blksiz; + size_t blksize, csize; - blksiz = chunk_blk(bp, n); + blksize = chunk_blk(bp, n); + csize = comp(bp, cbuf, blksize, comp_size(BLKSIZE)); memcpy(bdescr.md, md, sizeof(bdescr)); bdescr.offset = enthdr.store_size; - bdescr.size = blksiz; + bdescr.size = csize; - hash_blk(bp, bdescr.size, bdescr.md); + hash_blk(cbuf, bdescr.size, bdescr.md); /* Calculate file hash one block at a time */ - SHA256_Update(&ctx, bp, bdescr.size); + SHA256_Update(&ctx, cbuf, bdescr.size); ent = grow_ent(ent, ent->nblks + 1); @@ -412,7 +433,7 @@ dedup(int fd, char *msg) ent->bdescr[ent->nblks++] = bdescr; /* Store block */ - append_blk(bp, &bdescr); + append_blk(cbuf, &bdescr); /* Create a cache entry for this block */ cent = alloc_cent(); @@ -422,8 +443,8 @@ dedup(int fd, char *msg) ent->bdescr[ent->nblks++] = bdescr; } - bp += blksiz; - n -= blksiz; + bp += blksize; + n -= blksize; } } @@ -445,24 +466,30 @@ dedup(int fd, char *msg) } free(ent); + free(cbuf); free(buf); } int extract(struct ent *ent, void *arg) { - uint8_t *buf; + uint8_t *buf, *cbuf; struct extract_args *args = arg; uint64_t i; if (memcmp(ent->md, args->md, sizeof(ent->md)) != 0) return WALK_CONTINUE; - buf = alloc_buf(BLKSIZ); + buf = alloc_buf(BLKSIZE); + cbuf = alloc_buf(comp_size(BLKSIZE)); for (i = 0; i < ent->nblks; i++) { - read_blk(buf, &ent->bdescr[i]); - xwrite(args->fd, buf, ent->bdescr[i].size); + size_t blksize; + + read_blk(cbuf, &ent->bdescr[i]); + blksize = decomp(cbuf, buf, ent->bdescr[i].size, BLKSIZE); + xwrite(args->fd, buf, blksize); } + free(cbuf); free(buf); return WALK_STOP; } @@ -470,12 +497,12 @@ extract(struct ent *ent, void *arg) int check(struct ent *ent, void *arg) { - uint8_t md[MDSIZ]; + uint8_t md[MDSIZE]; uint8_t *buf; SHA256_CTX ctx; uint64_t i; - buf = alloc_buf(BLKSIZ); + buf = alloc_buf(BLKSIZE); /* * Calculate hash for each block and compare * with index entry block descriptor @@ -521,12 +548,12 @@ list(struct ent *ent, void *arg) int rebuild_cache(struct ent *ent, void *arg) { - uint8_t md[MDSIZ]; + uint8_t md[MDSIZE]; uint8_t *buf; SHA256_CTX ctx; uint64_t i; - buf = alloc_buf(BLKSIZ); + buf = alloc_buf(BLKSIZE); for (i = 0; i < ent->nblks; i++) { struct cent *cent; @@ -575,7 +602,7 @@ init_cache(void) uint64_t nents, i; uint64_t min, max, avg; - min = BLKSIZ; + min = BLKSIZE; max = 0; avg = 0; @@ -598,7 +625,7 @@ init_cache(void) avg /= nents; if (verbose) { - fprintf(stderr, "min/avg/max blksize: %llu/%llu/%llu\n", + fprintf(stderr, "min/avg/max block size: %llu/%llu/%llu\n", (unsigned long long)min, (unsigned long long)avg, (unsigned long long)max); @@ -661,7 +688,7 @@ usage(void) int main(int argc, char *argv[]) { - uint8_t md[MDSIZ]; + uint8_t md[MDSIZE]; char *id = NULL, *root = NULL, *msg = NULL; int fd = -1, lflag = 0, cflag = 0;