dedup

deduplicating backup program
git clone git://git.2f30.org/dedup
Log | Files | Refs | README | LICENSE

commit 5ae463d1c2cb6c77d735d53ee7e00c3a00b70090
parent 4a9c691eb97725f224362ee65dc088f0260bc8b6
Author: sin <sin@2f30.org>
Date:   Wed, 21 Mar 2018 13:45:58 +0000

Rework cache code

Diffstat:
M dedup.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 64 insertions(+), 35 deletions(-)

diff --git a/dedup.c b/dedup.c
@@ -35,17 +35,18 @@ struct blk {
 	unsigned char data[BLKSIZ];
 } __attribute__((packed));
 
-struct cent {
+struct cache_data {
 	unsigned char md[SHA256_DIGEST_LENGTH];
 	uint64_t blkidx;
-} __attribute__((packed));
+};
 
-struct hash_ent {
-	struct cent cent;
-	RB_ENTRY(hash_ent) e;
+struct cache_ent {
+	struct cache_data data;
+	int dirty;
+	RB_ENTRY(cache_ent) e;
 };
 
-RB_HEAD(hash_tree, hash_ent) hash_tree_head;
+RB_HEAD(cache, cache_ent) cache_head;
 struct enthdr enthdr;
 int ifd;
 int sfd;
@@ -122,33 +123,53 @@ xwrite(int fd, const void *buf, size_t nbytes)
 }
 
 int
-hash_ent_cmp(struct hash_ent *e1, struct hash_ent *e2)
+cache_ent_cmp(struct cache_ent *e1, struct cache_ent *e2)
 {
 	int r;
 
-	r = memcmp(e1->cent.md, e2->cent.md, sizeof(e1->cent.md));
+	r = memcmp(e1->data.md, e2->data.md, sizeof(e1->data.md));
 	if (r > 0)
 		return 1;
 	else if (r < 0)
 		return -1;
 	return 0;
 }
-RB_PROTOTYPE(hash_tree, hash_ent, e, hash_ent_cmp);
-RB_GENERATE(hash_tree, hash_ent, e, hash_ent_cmp);
+RB_PROTOTYPE(cache, cache_ent, e, cache_ent_cmp);
+RB_GENERATE(cache, cache_ent, e, cache_ent_cmp);
 
-struct hash_ent *
-hash_ent_add(unsigned char *md, uint64_t blkidx)
+struct cache_ent *
+alloc_cache_ent(unsigned char *md, uint64_t blkidx)
 {
-	struct hash_ent *hash_ent;
+	struct cache_ent *ent;
 
-	hash_ent = malloc(sizeof(*hash_ent));
-	if (hash_ent == NULL)
+	ent = calloc(1, sizeof(*ent));
+	if (ent == NULL)
 		err(1, "malloc");
+	memcpy(&ent->data.md, md, sizeof(ent->data.md));
+	ent->data.blkidx = blkidx;
+	return ent;
+}
+
+void
+add_cache_ent(struct cache_ent *ent)
+{
+	RB_INSERT(cache, &cache_head, ent);
+}
 
-	memcpy(&hash_ent->cent.md, md, sizeof(hash_ent->cent.md));
-	hash_ent->cent.blkidx = blkidx;
-	RB_INSERT(hash_tree, &hash_tree_head, hash_ent);
-	return hash_ent;
+void
+flush_cache(void)
+{
+	struct cache_ent *ent;
+
+	if (verbose)
+		fprintf(stderr, "flushing cache...\n");
+	RB_FOREACH(ent, cache, &cache_head) {
+		if (!ent->dirty)
+			continue;
+		lseek(cfd, ent->data.blkidx * sizeof(ent->data), SEEK_SET);
+		xwrite(cfd, &ent->data, sizeof(ent->data));
+		ent->dirty = 0;
+	}
 }
 
 void
@@ -200,7 +221,7 @@ storefile_nblks(void)
 uint64_t
 cachefile_nblks(void)
 {
-	return lseek(cfd, 0, SEEK_END) / sizeof(struct cent);
+	return lseek(cfd, 0, SEEK_END) / sizeof(struct cache_data);
 }
 
 void
@@ -231,12 +252,12 @@ append_blk(struct blk *blk)
 int
 lookup_blk(struct blk *blk, uint64_t *blkidx)
 {
-	struct hash_ent *hash_ent, key;
+	struct cache_ent *ent, key;
 
-	memcpy(key.cent.md, blk->md, sizeof(key.cent.md));
-	hash_ent = RB_FIND(hash_tree, &hash_tree_head, &key);
-	if (hash_ent != NULL) {
-		*blkidx = hash_ent->cent.blkidx;
+	memcpy(key.data.md, blk->md, sizeof(key.data.md));
+	ent = RB_FIND(cache, &cache_head, &key);
+	if (ent != NULL) {
+		*blkidx = ent->data.blkidx;
 		return 0;
 	}
 	return -1;
@@ -264,13 +285,15 @@ dedup(int fd)
 		ent = grow_ent(ent, ent->nblks + 1);
 
 		if (lookup_blk(&blk, &blkidx) == -1) {
-			struct hash_ent *hash_ent;
+			struct cache_ent *cache_ent;
 			uint64_t nblks = storefile_nblks();
 
+			/* Create a cache entry for this block */
+			cache_ent = alloc_cache_ent(blk.md, nblks);
+			add_cache_ent(cache_ent);
+			cache_ent->dirty = 1;
+
 			ent->blks[ent->nblks++] = nblks;
-			hash_ent = hash_ent_add(blk.md, nblks);
-			lseek(cfd, 0, SEEK_END);
-			xwrite(cfd, &hash_ent->cent, sizeof(hash_ent->cent));
 			append_blk(&blk);
 		} else {
 			ent->blks[ent->nblks++] = blkidx;
@@ -281,6 +304,7 @@ dedup(int fd)
 	SHA256_Final(ent->md, &ctx);
 	append_ent(ent);
 	free(ent);
+	flush_cache();
 }
 
 void
@@ -339,13 +363,15 @@ rebuild_cache(void)
 	nblks = storefile_nblks();
 	lseek(cfd, 0, SEEK_SET);
 	for (i = 0; i < nblks; i++) {
-		struct hash_ent *hash_ent;
+		struct cache_ent *ent;
 		struct blk blk;
 
 		read_blk(&blk, i);
-		hash_ent = hash_ent_add(blk.md, i);
-		xwrite(cfd, &hash_ent->cent, sizeof(hash_ent->cent));
+		ent = alloc_cache_ent(blk.md, i);
+		add_cache_ent(ent);
+		ent->dirty = 1;
 	}
+	flush_cache();
 }
 
 void
@@ -359,11 +385,13 @@ init_cache(void)
 	nblks = cachefile_nblks();
 	lseek(cfd, 0, SEEK_SET);
 	for (i = 0; i < nblks; i++) {
-		struct cent cent;
+		struct blk blk;
+		struct cache_ent *ent;
 
-		if (xread(cfd, &cent, sizeof(cent)) == 0)
+		ent = alloc_cache_ent(blk.md, i);
+		if (xread(cfd, &ent->data, sizeof(ent->data)) == 0)
 			errx(1, "unexpected EOF");
-		hash_ent_add(cent.md, cent.blkidx);
+		add_cache_ent(ent);
 	}
 }
 
@@ -401,6 +429,7 @@ term(void)
 	fsync(ifd);
 	fsync(sfd);
 	fsync(cfd);
+
 	close(ifd);
 	close(sfd);
 	close(cfd);