dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit df98cd4221d5e54c88d5ce172690c6cc7cb08235
parent 13ea526261fd0d7821dbf271bd9ccd509ae4e6fd
Author: sin <sin@2f30.org>
Date:   Tue,  5 Mar 2019 22:39:08 +0000

Change cache consistency algorithm

Instead of doing complicated comparisons between snapshot blocks and
cache entries, just store the hash of all cache entries in the
snapshot header.  When the cache is loaded, the hash is calculated and
compared against the one in the snapshot header.  If they are
different, a cache rebuild is triggered.

Diffstat:
Mcache.c | 5+++--
Mdedup.c | 127++++++++++++++++++++++---------------------------------------------------------
Mdedup.h | 8+++++---
Mtypes.c | 14++++++++++----
4 files changed, 53 insertions(+), 101 deletions(-)

diff --git a/cache.c b/cache.c @@ -102,10 +102,11 @@ lookup_cache_entry(struct cache *cache, struct cache_entry *ent) } void -walk_cache(struct cache *cache, int (*fn)(struct cache_entry *)) +walk_cache(struct cache *cache, + int (*fn)(struct cache_entry *, void *), void *arg) { struct cache_node *node; RB_FOREACH(node, cache_head, &cache->nodes) - (*fn)(&node->ent); + (*fn)(&node->ent, arg); } diff --git a/dedup.c b/dedup.c @@ -37,7 +37,6 @@ static struct cache *cache; static int ifd; static int sfd; static int cfd; -static int cache_dirty; static unsigned long long cache_hits; static unsigned long long cache_misses; @@ -253,7 +252,6 @@ dedup_chunk(struct snapshot *snap, uint8_t *chunkp, size_t chunk_size) cache_entry.offset = blk_desc.offset; cache_entry.size = blk_desc.size; - cache_dirty = 1; add_cache_entry(cache, &cache_entry); cache_misses++; @@ -396,40 +394,10 @@ check_snap(struct snapshot *snap, void *arg) return WALK_CONTINUE; } -/* - * For each block descriptor within each snapshot, do a lookup - * of the block descriptor hash in the cache. If the lookup fails - * the cache is corrupted. The caller will rebuild the cache in - * that case. - */ -static int -check_cache_entry(struct snapshot *snap, void *arg) -{ - int *ret = arg; - uint64_t i; - - for (i = 0; i < snap->nr_blk_descs; i++) { - struct blk_desc *blk_desc; - struct cache_entry cache_entry; - - blk_desc = &snap->blk_desc[i]; - memcpy(&cache_entry.md, blk_desc->md, sizeof(cache_entry.md)); - if (lookup_cache_entry(cache, &cache_entry) < 0 || - cache_entry.offset != blk_desc->offset || - cache_entry.size != blk_desc->size) { - *ret = -1; - return WALK_STOP; - } - } - return WALK_CONTINUE; -} - static int reload_cache(struct snapshot *snap, void *arg) { - uint8_t md[MDSIZE]; uint8_t *buf; - SHA256_CTX ctx; uint64_t i; buf = alloc_buf(compr_size(BLKSIZE_MAX)); @@ -438,19 +406,12 @@ reload_cache(struct snapshot *snap, void *arg) struct blk_desc *blk_desc; blk_desc = &snap->blk_desc[i]; - read_blk(buf, blk_desc); - - SHA256_Init(&ctx); - SHA256_Update(&ctx, buf, blk_desc->size); - SHA256_Final(md, &ctx); - memcpy(cache_entry.md, blk_desc->md, sizeof(cache_entry.md)); cache_entry.offset = blk_desc->offset; cache_entry.size = blk_desc->size; add_cache_entry(cache, &cache_entry); } free(buf); - cache_dirty = 1; return WALK_CONTINUE; } @@ -500,77 +461,59 @@ match_ver(uint64_t v) VER_MAJ, VER_MIN, maj, min); } -static uint64_t -cache_nr_entries(void) +static void +hash_cache_entry_update(struct cache_entry *cache_entry, SHA256_CTX *ctx) { - struct stat sb; + uint8_t buf[CACHE_ENTRY_SIZE]; + char fmt[BUFSIZ]; + int n; - if (fstat(cfd, &sb) < 0) - err(1, "fstat"); - return sb.st_size / CACHE_ENTRY_SIZE; + snprintf(fmt, sizeof(fmt), "'%dqq", MDSIZE); + n = pack(buf, fmt, cache_entry->md, cache_entry->offset, + cache_entry->size); + SHA256_Update(ctx, buf, n); } static void -check_cache(void) +load_cache(void) { - int ret; + uint8_t md[MDSIZE]; + struct stat sb; + SHA256_CTX ctx; + uint64_t nr_entries, i; - if (verbose > 0) - fprintf(stderr, "Checking cache\n"); + if (fstat(cfd, &sb) < 0) + err(1, "fstat"); + nr_entries = sb.st_size / CACHE_ENTRY_SIZE; - if (cache_nr_entries() != snap_hdr.st.nr_blks) { - ret = -1; - } else { - xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET); - ret = 0; - walk_snap(check_cache_entry, &ret); + xlseek(cfd, 0, SEEK_SET); + SHA256_Init(&ctx); + for (i = 0; i < nr_entries; i++) { + struct cache_entry cache_entry; + + read_cache_entry(cfd, &cache_entry); + hash_cache_entry_update(&cache_entry, &ctx); + add_cache_entry(cache, &cache_entry); } + SHA256_Final(md, &ctx); - if (ret != 0) { + if (memcmp(snap_hdr.cache_md, md, sizeof(snap_hdr.cache_md)) != 0) { if (verbose > 0) fprintf(stderr, "Rebuilding cache\n"); - free_cache(cache); cache = alloc_cache(); - if (ftruncate(cfd, 0) < 0) err(1, "ftruncate"); - xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET); xlseek(cfd, 0, SEEK_SET); walk_snap(reload_cache, NULL); } } -static void -load_cache(void) -{ - uint64_t nr_entries; - uint64_t i; - - xlseek(cfd, 0, SEEK_SET); - nr_entries = cache_nr_entries(); - if (nr_entries != snap_hdr.st.nr_blks) { - if (verbose > 0) - fprintf(stderr, "Rebuilding cache\n"); - if (ftruncate(cfd, 0) < 0) - err(1, "ftruncate"); - xlseek(ifd, SNAP_HDR_SIZE, SEEK_SET); - walk_snap(reload_cache, NULL); - return; - } - - for (i = 0; i < nr_entries; i++) { - struct cache_entry cache_entry; - - read_cache_entry(cfd, &cache_entry); - add_cache_entry(cache, &cache_entry); - } -} - static int -flush_cache(struct cache_entry *cache_entry) +flush_cache(struct cache_entry *cache_entry, void *arg) { + hash_cache_entry_update(cache_entry, arg); write_cache_entry(cfd, cache_entry); return 0; } @@ -578,10 +521,12 @@ flush_cache(struct cache_entry *cache_entry) static void save_cache(void) { - if (cache_dirty) { - xlseek(cfd, 0, SEEK_SET); - walk_cache(cache, flush_cache); - } + SHA256_CTX ctx; + + SHA256_Init(&ctx); + xlseek(cfd, 0, SEEK_SET); + walk_cache(cache, flush_cache, &ctx); + SHA256_Final(snap_hdr.cache_md, &ctx); } static void @@ -784,8 +729,6 @@ main(int argc, char *argv[]) if (ret != 0) errx(1, "%s or %s is corrupted", SNAPSF, STOREF); - check_cache(); - term(); return 0; } diff --git a/dedup.h b/dedup.h @@ -6,7 +6,7 @@ * using the helpers from types.c. Any modification made to * the structs below will need to be reflected here and in types.c. */ -#define SNAP_HDR_SIZE 104 +#define SNAP_HDR_SIZE 136 #define BLK_HDR_SIZE 16 #define BLK_DESC_SIZE 48 #define SNAPSHOT_SIZE 304 @@ -16,7 +16,7 @@ #define MDSIZE 32 /* file format version */ -#define VER_MIN 2 +#define VER_MIN 3 #define VER_MAJ 0 #define VER_MIN_MASK 0xff @@ -43,6 +43,7 @@ struct snapshot_hdr { uint64_t flags; /* bottom 16 bits are maj/min version */ uint64_t size; /* size of snapshots file */ uint64_t nr_snapshots; + uint8_t cache_md[MDSIZE]; struct stats st; }; @@ -82,7 +83,8 @@ struct cache *alloc_cache(void); void free_cache(struct cache *cache); void add_cache_entry(struct cache *cache, struct cache_entry *ent); int lookup_cache_entry(struct cache *cache, struct cache_entry *ent); -void walk_cache(struct cache *cache, int (*fn)(struct cache_entry *)); +void walk_cache(struct cache *cache, + int (*fn)(struct cache_entry *, void *), void *arg); /* chunker.c */ struct chunker *alloc_chunker(int fd, size_t cap); diff --git a/types.c b/types.c @@ -10,15 +10,18 @@ void read_snap_hdr(int fd, struct snapshot_hdr *hdr) { uint8_t buf[SNAP_HDR_SIZE]; + char fmt[BUFSIZ]; int n; if (xread(fd, buf, sizeof(buf)) == 0) errx(1, "read_snap_hdr: unexpected EOF"); - n = unpack(buf, "qqq", + snprintf(fmt, sizeof(fmt), "qqq'%d", MDSIZE); + n = unpack(buf, fmt, &hdr->flags, &hdr->size, - &hdr->nr_snapshots); + &hdr->nr_snapshots, + hdr->cache_md); n += unpack(&buf[n], "qqqqqq", &hdr->st.orig_size, @@ -41,12 +44,15 @@ void write_snap_hdr(int fd, struct snapshot_hdr *hdr) { uint8_t buf[SNAP_HDR_SIZE]; + char fmt[BUFSIZ]; int n; - n = pack(buf, "qqq", + snprintf(fmt, sizeof(fmt), "qqq'%d", MDSIZE); + n = pack(buf, fmt, hdr->flags, hdr->size, - hdr->nr_snapshots); + hdr->nr_snapshots, + hdr->cache_md); n += pack(&buf[n], "qqqqqq", hdr->st.orig_size,