dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 53741db3458c3a8fc633b7a5336d08062fa5e8a6
parent e1c69d4298f03af158a5a5637dad600cdd41f3dc
Author: sin <sin@2f30.org>
Date:   Wed, 20 Feb 2019 14:06:23 +0000

Rename stuff

Diffstat:
M dedup.c | 327 +++++++++++++++++++++++++++++++++++++++----------------------------------------
1 file changed, 161 insertions(+), 166 deletions(-)

diff --git a/dedup.c b/dedup.c @@ -14,7 +14,7 @@ #include "arg.h" #include "tree.h" -#define INDEXF ".index" +#define SNAPSF ".snapshots" #define STOREF ".store" #define CACHEF ".cache" @@ -41,41 +41,37 @@ struct stats { uint64_t dedup_size; uint64_t min_blk_size; uint64_t max_blk_size; - uint64_t nblks; + uint64_t nr_blks; uint64_t cache_hits; uint64_t cache_misses; uint64_t reserved[4]; }; -/* index file header */ -struct enthdr { +struct snapshot_hdr { uint64_t flags; - uint64_t nents; + uint64_t nr_snapshots; uint64_t store_size; uint64_t reserved[4]; struct stats st; }; -/* block descriptor */ -struct bdescr { +struct blk_desc { uint8_t md[MDSIZE]; uint64_t offset; uint64_t size; }; -/* index file entry */ -struct ent { +struct snapshot { uint64_t size; uint8_t msg[MSGSIZE]; uint8_t md[MDSIZE]; /* hash of file */ - uint64_t nblks; - struct bdescr bdescr[]; + uint64_t nr_blk_descs; + struct blk_desc blk_desc[]; }; -/* cache entry */ -struct cent { - struct bdescr bdescr; - RB_ENTRY(cent) e; +struct cache_entry { + struct blk_desc blk_desc; + RB_ENTRY(cache_entry) e; }; struct extract_args { @@ -83,8 +79,8 @@ struct extract_args { int fd; }; -RB_HEAD(cache, cent) cache_head; -struct enthdr enthdr; +RB_HEAD(cache, cache_entry) cache_head; +struct snapshot_hdr snaphdr; int ifd; int sfd; int cfd; @@ -177,7 +173,8 @@ chunk_blk(uint8_t *buf, size_t size) fp = buzh_init(buf, WINSIZE); for (i = 0; i < size - WINSIZE; i++) { if (i > 0) - fp = buzh_update(fp, buf[i - 1], buf[WINSIZE + i - 1], WINSIZE); + fp = buzh_update(fp, buf[i - 1], buf[WINSIZE + i - 1], + WINSIZE); if ((fp & HASHMSK) == 0) return i + WINSIZE; } @@ -224,7 +221,7 @@ print_md(FILE *fp, uint8_t *md, size_t size) void print_stats(struct stats *st) { - if (st->nblks == 0) + if (st->nr_blks == 0) return; fprintf(stderr, "original size: %llu bytes\n", @@ -235,10 +232,10 @@ print_stats(struct stats *st) (unsigned long long)st->dedup_size); fprintf(stderr, "min/avg/max block size: %llu/%llu/%llu\n", (unsigned 
long long)st->min_blk_size, - (unsigned long long)st->dedup_size / st->nblks, + (unsigned long long)st->dedup_size / st->nr_blks, (unsigned long long)st->max_blk_size); fprintf(stderr, "number of blocks: %llu\n", - (unsigned long long)st->nblks); + (unsigned long long)st->nr_blks); fprintf(stderr, "total cache hits: %llu\n", (unsigned long long)st->cache_hits); fprintf(stderr, "total cache misses: %llu\n", @@ -306,24 +303,24 @@ xwrite(int fd, const void *buf, size_t nbytes) } int -cent_cmp(struct cent *e1, struct cent *e2) +cache_entry_cmp(struct cache_entry *e1, struct cache_entry *e2) { int r; - r = memcmp(e1->bdescr.md, e2->bdescr.md, sizeof(e1->bdescr.md)); + r = memcmp(e1->blk_desc.md, e2->blk_desc.md, sizeof(e1->blk_desc.md)); if (r > 0) return 1; else if (r < 0) return -1; return 0; } -RB_PROTOTYPE(cache, cent, e, cent_cmp); -RB_GENERATE(cache, cent, e, cent_cmp); +RB_PROTOTYPE(cache, cache_entry, e, cache_entry_cmp); +RB_GENERATE(cache, cache_entry, e, cache_entry_cmp); -struct cent * -alloc_cent(void) +struct cache_entry * +alloc_cache_entry(void) { - struct cent *ent; + struct cache_entry *ent; ent = calloc(1, sizeof(*ent)); if (ent == NULL) @@ -332,82 +329,82 @@ alloc_cent(void) } void -add_cent(struct cent *cent) +add_cache_entry(struct cache_entry *ent) { - RB_INSERT(cache, &cache_head, cent); + RB_INSERT(cache, &cache_head, ent); } void flush_cache(void) { - struct cent *cent; + struct cache_entry *ent; if (!cache_dirty) return; xlseek(cfd, 0, SEEK_SET); - RB_FOREACH(cent, cache, &cache_head) - xwrite(cfd, &cent->bdescr, sizeof(cent->bdescr)); + RB_FOREACH(ent, cache, &cache_head) + xwrite(cfd, &ent->blk_desc, sizeof(ent->blk_desc)); } void free_cache(void) { - struct cent *cent, *tmp; + struct cache_entry *ent, *tmp; - RB_FOREACH_SAFE(cent, cache, &cache_head, tmp) { - RB_REMOVE(cache, &cache_head, cent); - free(cent); + RB_FOREACH_SAFE(ent, cache, &cache_head, tmp) { + RB_REMOVE(cache, &cache_head, ent); + free(ent); } } uint64_t -cache_nents(void) 
+cache_nr_entries(void) { struct stat sb; if (fstat(cfd, &sb) < 0) err(1, "fstat"); - return sb.st_size / sizeof(struct bdescr); + return sb.st_size / sizeof(struct blk_desc); } void -append_ent(struct ent *ent) +append_snap(struct snapshot *snap) { - /* Update index header */ - enthdr.nents++; + /* Update snapshot header */ + snaphdr.nr_snapshots++; xlseek(ifd, 0, SEEK_SET); - xwrite(ifd, &enthdr, sizeof(enthdr)); + xwrite(ifd, &snaphdr, sizeof(snaphdr)); - /* Append entry */ + /* Append snapshot */ xlseek(ifd, 0, SEEK_END); - ent->size = sizeof(*ent); - ent->size += ent->nblks * sizeof(ent->bdescr[0]); - xwrite(ifd, ent, ent->size); + snap->size = sizeof(*snap); + snap->size += snap->nr_blk_descs * sizeof(snap->blk_desc[0]); + xwrite(ifd, snap, snap->size); } -struct ent * -alloc_ent(void) +struct snapshot * +alloc_snap(void) { - struct ent *ent; + struct snapshot *snap; - ent = calloc(1, sizeof(*ent)); - if (ent == NULL) + snap = calloc(1, sizeof(*snap)); + if (snap == NULL) err(1, "calloc"); - return ent; + return snap; } -struct ent * -grow_ent(struct ent *ent, uint64_t nblks) +struct snapshot * +grow_snap(struct snapshot *snap, uint64_t nr_blk_descs) { size_t size; - size = sizeof(*ent); - size += nblks * sizeof(ent->bdescr[0]); - ent = realloc(ent, size); - if (ent == NULL) + size = sizeof(*snap); + size += nr_blk_descs * sizeof(snap->blk_desc[0]); + snap = realloc(snap, size); + if (snap == NULL) err(1, "realloc"); - return ent; + return snap; } uint8_t * @@ -432,30 +429,30 @@ hash_blk(uint8_t *buf, size_t size, uint8_t *md) } void -read_blk(uint8_t *buf, struct bdescr *bdescr) +read_blk(uint8_t *buf, struct blk_desc *blk_desc) { - xlseek(sfd, bdescr->offset, SEEK_SET); - if (xread(sfd, buf, bdescr->size) == 0) + xlseek(sfd, blk_desc->offset, SEEK_SET); + if (xread(sfd, buf, blk_desc->size) == 0) errx(1, "read: unexpected EOF"); } void -append_blk(uint8_t *buf, struct bdescr *bdescr) +append_blk(uint8_t *buf, struct blk_desc *blk_desc) { - xlseek(sfd, 
enthdr.store_size, SEEK_SET); - xwrite(sfd, buf, bdescr->size); - enthdr.store_size += bdescr->size; + xlseek(sfd, snaphdr.store_size, SEEK_SET); + xwrite(sfd, buf, blk_desc->size); + snaphdr.store_size += blk_desc->size; } int -lookup_bdescr(uint8_t *md, struct bdescr *bdescr) +lookup_blk_desc(uint8_t *md, struct blk_desc *blk_desc) { - struct cent *ent, key; + struct cache_entry *ent, key; - memcpy(key.bdescr.md, md, sizeof(key.bdescr.md)); + memcpy(key.blk_desc.md, md, sizeof(key.blk_desc.md)); ent = RB_FIND(cache, &cache_head, &key); if (ent != NULL) { - *bdescr = ent->bdescr; + *blk_desc = ent->blk_desc; return 0; } return -1; @@ -465,115 +462,113 @@ void dedup(int fd, char *msg) { uint8_t *buf[2]; - struct ent *ent; + struct snapshot *snap; SHA256_CTX ctx; ssize_t n, bufsize; buf[0] = alloc_buf(BLKSIZE); buf[1] = alloc_buf(comp_size(BLKSIZE)); - ent = alloc_ent(); + snap = alloc_snap(); bufsize = 0; SHA256_Init(&ctx); - while ((n = xread(fd, buf[0] + bufsize, BLKSIZE - bufsize)) > 0 || bufsize > 0) { + while ((n = xread(fd, buf[0] + bufsize, BLKSIZE - bufsize)) > 0 || + bufsize > 0) { + uint8_t md[MDSIZE]; - struct bdescr bdescr; + struct blk_desc blk_desc; size_t blksize, csize; uint8_t *inp = buf[0]; /* input buf */ uint8_t *outp = buf[1]; /* compressed buf */ if (n > 0) { bufsize += n; - enthdr.st.orig_size += n; + snaphdr.st.orig_size += n; } blksize = chunk_blk(inp, bufsize); csize = comp(inp, outp, blksize, comp_size(BLKSIZE)); - enthdr.st.comp_size += csize; + snaphdr.st.comp_size += csize; hash_blk(outp, csize, md); /* Calculate file hash one block at a time */ SHA256_Update(&ctx, inp, blksize); - ent = grow_ent(ent, ent->nblks + 1); + snap = grow_snap(snap, snap->nr_blk_descs + 1); - if (lookup_bdescr(md, &bdescr) < 0) { - struct cent *cent; + if (lookup_blk_desc(md, &blk_desc) < 0) { + struct cache_entry *ent; - memcpy(bdescr.md, md, sizeof(bdescr.md)); - bdescr.offset = enthdr.store_size; - bdescr.size = csize; + memcpy(blk_desc.md, md, 
sizeof(blk_desc.md)); + blk_desc.offset = snaphdr.store_size; + blk_desc.size = csize; - /* Update index entry */ - ent->bdescr[ent->nblks++] = bdescr; + snap->blk_desc[snap->nr_blk_descs++] = blk_desc; - /* Store block */ - append_blk(outp, &bdescr); + append_blk(outp, &blk_desc); - /* Create a cache entry for this block */ - cent = alloc_cent(); - cent->bdescr = bdescr; - add_cent(cent); + ent = alloc_cache_entry(); + ent->blk_desc = blk_desc; + add_cache_entry(ent); cache_dirty = 1; - enthdr.st.dedup_size += bdescr.size; - enthdr.st.nblks++; - enthdr.st.cache_misses++; + snaphdr.st.dedup_size += blk_desc.size; + snaphdr.st.nr_blks++; + snaphdr.st.cache_misses++; - if (bdescr.size > enthdr.st.max_blk_size) - enthdr.st.max_blk_size = bdescr.size; - if (bdescr.size < enthdr.st.min_blk_size) - enthdr.st.min_blk_size = bdescr.size; + if (blk_desc.size > snaphdr.st.max_blk_size) + snaphdr.st.max_blk_size = blk_desc.size; + if (blk_desc.size < snaphdr.st.min_blk_size) + snaphdr.st.min_blk_size = blk_desc.size; } else { - ent->bdescr[ent->nblks++] = bdescr; - enthdr.st.cache_hits++; + snap->blk_desc[snap->nr_blk_descs++] = blk_desc; + snaphdr.st.cache_hits++; } memmove(inp, inp + blksize, bufsize - blksize); bufsize -= blksize; } - if (ent->nblks > 0) { - /* Calculate hash and add this entry to the index */ - SHA256_Final(ent->md, &ctx); + if (snap->nr_blk_descs > 0) { + SHA256_Final(snap->md, &ctx); if (msg != NULL) { size_t size; size = strlen(msg) + 1; - if (size > sizeof(ent->msg)) - size = sizeof(ent->msg); - memcpy(ent->msg, msg, size); - ent->msg[size - 1] = '\0'; + if (size > sizeof(snap->msg)) + size = sizeof(snap->msg); + memcpy(snap->msg, msg, size); + snap->msg[size - 1] = '\0'; } - append_ent(ent); + append_snap(snap); } - free(ent); + free(snap); free(buf[1]); free(buf[0]); } int -extract(struct ent *ent, void *arg) +extract(struct snapshot *snap, void *arg) { uint8_t *buf[2]; struct extract_args *args = arg; uint64_t i; - if (memcmp(ent->md, args->md, 
sizeof(ent->md)) != 0) + if (memcmp(snap->md, args->md, sizeof(snap->md)) != 0) return WALK_CONTINUE; buf[0] = alloc_buf(BLKSIZE); buf[1] = alloc_buf(comp_size(BLKSIZE)); - for (i = 0; i < ent->nblks; i++) { + for (i = 0; i < snap->nr_blk_descs; i++) { size_t blksize; - read_blk(buf[1], &ent->bdescr[i]); - blksize = decomp(buf[1], buf[0], ent->bdescr[i].size, BLKSIZE); + read_blk(buf[1], &snap->blk_desc[i]); + blksize = decomp(buf[1], buf[0], snap->blk_desc[i].size, BLKSIZE); xwrite(args->fd, buf[0], blksize); } free(buf[1]); @@ -582,7 +577,7 @@ extract(struct ent *ent, void *arg) } int -check(struct ent *ent, void *arg) +check(struct snapshot *snap, void *arg) { uint8_t md[MDSIZE]; uint8_t *buf; @@ -592,48 +587,48 @@ check(struct ent *ent, void *arg) buf = alloc_buf(comp_size(BLKSIZE)); /* * Calculate hash for each block and compare - * with index entry block descriptor + * against snapshot entry block descriptor */ - for (i = 0; i < ent->nblks; i++) { - read_blk(buf, &ent->bdescr[i]); + for (i = 0; i < snap->nr_blk_descs; i++) { + read_blk(buf, &snap->blk_desc[i]); SHA256_Init(&ctx); - SHA256_Update(&ctx, buf, ent->bdescr[i].size); + SHA256_Update(&ctx, buf, snap->blk_desc[i].size); SHA256_Final(md, &ctx); - if (memcmp(ent->bdescr[i].md, md, - sizeof(ent->bdescr[i]).md) == 0) + if (memcmp(snap->blk_desc[i].md, md, + sizeof(snap->blk_desc[i]).md) == 0) continue; fprintf(stderr, "Block hash mismatch\n"); fprintf(stderr, " Expected hash: "); - print_md(stderr, ent->md, sizeof(ent->md)); + print_md(stderr, snap->md, sizeof(snap->md)); fputc('\n', stderr); fprintf(stderr, " Actual hash: "); print_md(stderr, md, sizeof(md)); fputc('\n', stderr); fprintf(stderr, " Offset: %llu\n", - (unsigned long long)ent->bdescr[i].offset); + (unsigned long long)snap->blk_desc[i].offset); fprintf(stderr, " Size: %llu\n", - (unsigned long long)ent->bdescr[i].size); + (unsigned long long)snap->blk_desc[i].size); } free(buf); return WALK_CONTINUE; } int -list(struct ent *ent, void *arg) 
+list(struct snapshot *snap, void *arg) { - print_md(stdout, ent->md, sizeof(ent->md)); - if (ent->msg[0] != '\0') - printf("\t%s\n", ent->msg); + print_md(stdout, snap->md, sizeof(snap->md)); + if (snap->msg[0] != '\0') + printf("\t%s\n", snap->msg); else putchar('\n'); return WALK_CONTINUE; } int -rebuild_cache(struct ent *ent, void *arg) +rebuild_cache(struct snapshot *snap, void *arg) { uint8_t md[MDSIZE]; uint8_t *buf; @@ -641,47 +636,47 @@ rebuild_cache(struct ent *ent, void *arg) uint64_t i; buf = alloc_buf(comp_size(BLKSIZE)); - for (i = 0; i < ent->nblks; i++) { - struct cent *cent; + for (i = 0; i < snap->nr_blk_descs; i++) { + struct cache_entry *ent; - read_blk(buf, &ent->bdescr[i]); + read_blk(buf, &snap->blk_desc[i]); SHA256_Init(&ctx); - SHA256_Update(&ctx, buf, ent->bdescr[i].size); + SHA256_Update(&ctx, buf, snap->blk_desc[i].size); SHA256_Final(md, &ctx); - cent = alloc_cent(); - memcpy(cent->bdescr.md, md, sizeof(cent->bdescr.md)); - cent->bdescr = ent->bdescr[i]; - add_cent(cent); + ent = alloc_cache_entry(); + memcpy(ent->blk_desc.md, md, sizeof(ent->blk_desc.md)); + ent->blk_desc = snap->blk_desc[i]; + add_cache_entry(ent); cache_dirty = 1; } free(buf); return WALK_CONTINUE; } -/* Walk through all index entries and call fn() on each one */ +/* Walk through all snapshots and call fn() on each one */ void -walk(int (*fn)(struct ent *, void *), void *arg) +walk(int (*fn)(struct snapshot *, void *), void *arg) { - struct ent *ent; + struct snapshot *snap; uint64_t i; - ent = alloc_ent(); - xlseek(ifd, sizeof(enthdr), SEEK_SET); - for (i = 0; i < enthdr.nents; i++) { - if (xread(ifd, ent, sizeof(*ent)) == 0) + snap = alloc_snap(); + xlseek(ifd, sizeof(snaphdr), SEEK_SET); + for (i = 0; i < snaphdr.nr_snapshots; i++) { + if (xread(ifd, snap, sizeof(*snap)) == 0) errx(1, "read: unexpected EOF"); - ent = grow_ent(ent, ent->nblks); - if (xread(ifd, ent->bdescr, - ent->nblks * sizeof(ent->bdescr[0])) == 0) + snap = grow_snap(snap, snap->nr_blk_descs); + 
if (xread(ifd, snap->blk_desc, + snap->nr_blk_descs * sizeof(snap->blk_desc[0])) == 0) errx(1, "read: unexpected EOF"); - if ((*fn)(ent, arg) == WALK_STOP) + if ((*fn)(snap, arg) == WALK_STOP) break; } - free(ent); + free(snap); } void @@ -689,15 +684,15 @@ init_cache(void) { uint64_t nents, i; - nents = cache_nents(); + nents = cache_nr_entries(); xlseek(cfd, 0, SEEK_SET); for (i = 0; i < nents; i++) { - struct cent *cent; + struct cache_entry *ent; - cent = alloc_cent(); - if (xread(cfd, &cent->bdescr, sizeof(cent->bdescr)) == 0) + ent = alloc_cache_entry(); + if (xread(cfd, &ent->blk_desc, sizeof(ent->blk_desc)) == 0) errx(1, "read: unexpected EOF"); - add_cent(cent); + add_cache_entry(ent); } } @@ -706,9 +701,9 @@ init(void) { struct stat sb; - ifd = open(INDEXF, O_RDWR | O_CREAT, 0600); + ifd = open(SNAPSF, O_RDWR | O_CREAT, 0600); if (ifd < 0) - err(1, "open %s", INDEXF); + err(1, "open %s", SNAPSF); sfd = open(STOREF, O_RDWR | O_CREAT, 0600); if (sfd < 0) @@ -724,24 +719,24 @@ init(void) errx(1, "busy lock"); if (fstat(ifd, &sb) < 0) - err(1, "fstat %s", INDEXF); + err(1, "fstat %s", SNAPSF); if (sb.st_size != 0) { uint8_t maj, min; - xread(ifd, &enthdr, sizeof(enthdr)); - min = enthdr.flags & 0xff; - maj = (enthdr.flags >> 8) & 0xff; + xread(ifd, &snaphdr, sizeof(snaphdr)); + min = snaphdr.flags & 0xff; + maj = (snaphdr.flags >> 8) & 0xff; if (maj != VER_MAJ || min != VER_MIN) - errx(1, "expected index format version %u.%u but got %u.%u", + errx(1, "expected snapshot format version %u.%u but got %u.%u", VER_MAJ, VER_MIN, maj, min); } else { - enthdr.flags = (VER_MAJ << 8) | VER_MIN; - xwrite(ifd, &enthdr, sizeof(enthdr)); - enthdr.st.min_blk_size = comp_size(BLKSIZE); + snaphdr.flags = (VER_MAJ << 8) | VER_MIN; + xwrite(ifd, &snaphdr, sizeof(snaphdr)); + snaphdr.st.min_blk_size = comp_size(BLKSIZE); } - if (cache_nents() != 0) + if (cache_nr_entries() != 0) init_cache(); else walk(rebuild_cache, NULL); @@ -751,7 +746,7 @@ void term(void) { if (verbose) - 
print_stats(&enthdr.st); + print_stats(&snaphdr.st); flush_cache(); free_cache();