dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 9adb9c6c1a308e2845d822987948afbf12fc943b
parent 19420eb96fb72d9d5f7940678d1b957aa26f1fce
Author: sin <sin@2f30.org>
Date:   Fri, 22 Feb 2019 23:24:03 +0000

Move rbtree cache to cache.c

Diffstat:
MMakefile | 5+++--
Acache.c | 94+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdedup.c | 222++++++++++++++++++++++---------------------------------------------------------
Mdedup.h | 7+++++++
Atypes.h | 44++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 210 insertions(+), 162 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,8 +2,8 @@ VERSION = 0.4 PREFIX = /usr/local MANPREFIX = $(PREFIX)/man BIN = dedup -SRC = $(BIN).c chunker.c hash.c pack.c unpack.c utils.c -OBJ = $(BIN).o chunker.o hash.o pack.o unpack.o utils.o +SRC = $(BIN).c cache.c chunker.c hash.c pack.c unpack.c utils.c +OBJ = $(BIN).o cache.o chunker.o hash.o pack.o unpack.o utils.o DISTFILES = \ $(SRC) \ LICENSE \ @@ -14,6 +14,7 @@ DISTFILES = \ $(BIN).1 \ dedup.h \ tree.h \ + types.h \ CFLAGS = -g -Wall CPPFLAGS = -I/usr/local/include -D_FILE_OFFSET_BITS=64 diff --git a/cache.c b/cache.c @@ -0,0 +1,94 @@ +#include <sys/types.h> +#include <sys/stat.h> + +#include <err.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "dedup.h" +#include "tree.h" + +struct cache_node { + struct cache_entry ent; + RB_ENTRY(cache_node) e; +}; + +static RB_HEAD(cache, cache_node) cache_head; + +static int +cache_node_cmp(struct cache_node *e1, struct cache_node *e2) +{ + int r; + + r = memcmp(e1->ent.md, e2->ent.md, sizeof(e1->ent.md)); + if (r > 0) + return 1; + else if (r < 0) + return -1; + return 0; +} +static RB_PROTOTYPE(cache, cache_node, e, cache_node_cmp); +static RB_GENERATE(cache, cache_node, e, cache_node_cmp); + +static struct cache_node * +alloc_cache_node(struct cache_entry *ent) +{ + struct cache_node *node; + + node = calloc(1, sizeof(*node)); + if (node == NULL) + err(1, "calloc"); + node->ent = *ent; + return node; +} + +static void +free_cache_node(struct cache_node *node) +{ + free(node); +} + +void +add_cache_entry(struct cache_entry *ent) +{ + struct cache_node *node; + + node = alloc_cache_node(ent); + RB_INSERT(cache, &cache_head, node); +} + +int +lookup_cache_entry(struct cache_entry *ent) +{ + struct cache_node *node, key; + + key.ent = *ent; + node = RB_FIND(cache, &cache_head, &key); + if (node != NULL) { + *ent = node->ent; + return 0; + } + return -1; +} + +void +walk_cache(int (*fn)(struct cache_entry *)) +{ + struct cache_node *node; + + RB_FOREACH(node, cache, &cache_head) + (*fn)(&node->ent); +} + +void +free_cache(void) +{ + struct cache_node *node, *tmp; + + RB_FOREACH_SAFE(node, cache, &cache_head, tmp) { + RB_REMOVE(cache, &cache_head, node); + free_cache_node(node); + } +} diff --git a/dedup.c b/dedup.c @@ -1,3 +1,4 @@ +#include <sys/types.h> #include <sys/stat.h> #include <sys/file.h> @@ -14,67 +15,21 @@ #include "arg.h" #include "dedup.h" -#include "tree.h" #define SNAPSF ".snapshots" #define STOREF ".store" #define CACHEF ".cache" -#define MSGSIZE 256 -#define MDSIZE SHA256_DIGEST_LENGTH - -/* file format version */ -#define VER_MIN 1 -#define VER_MAJ 0 - enum { WALK_CONTINUE, WALK_STOP }; -struct stats { - uint64_t orig_size; - uint64_t comp_size; - uint64_t dedup_size; - uint64_t min_blk_size; - uint64_t max_blk_size; - uint64_t nr_blks; - uint64_t reserved[6]; -}; - -struct snapshot_hdr { - uint64_t flags; - uint64_t nr_snapshots; - uint64_t store_size; - uint64_t reserved[4]; - struct stats st; -}; - -struct blk_desc { - uint8_t md[MDSIZE]; - uint64_t offset; - uint64_t size; -}; - -struct snapshot { - uint64_t size; - uint8_t msg[MSGSIZE]; - uint8_t md[MDSIZE]; /* hash of file */ - uint64_t nr_blk_descs; - struct blk_desc blk_desc[]; -}; - -struct cache_entry { - struct blk_desc blk_desc; - RB_ENTRY(cache_entry) e; -}; - struct extract_args { uint8_t *md; int fd; }; -static RB_HEAD(cache, cache_entry) cache_head; static struct snapshot_hdr snaphdr; static int ifd; static int sfd; @@ -144,78 +99,6 @@ print_stats(struct stats *st) fprintf(stderr, "cache misses: %llu\n", cache_misses); } -static int -cache_entry_cmp(struct cache_entry *e1, struct cache_entry *e2) -{ - int r; - - r = memcmp(e1->blk_desc.md, e2->blk_desc.md, sizeof(e1->blk_desc.md)); - if (r > 0) - return 1; - else if (r < 0) - return -1; - return 0; -} -static RB_PROTOTYPE(cache, cache_entry, e, cache_entry_cmp); -static RB_GENERATE(cache, cache_entry, e, cache_entry_cmp); - -static struct cache_entry * -alloc_cache_entry(void) -{ - struct cache_entry *ent; - - ent = calloc(1, sizeof(*ent)); - if (ent == NULL) - err(1, "calloc"); - return ent; -} - -static void -free_cache_entry(struct cache_entry *ent) -{ - free(ent); -} - -static void -add_cache_entry(struct cache_entry *ent) -{ - RB_INSERT(cache, &cache_head, ent); -} - -static void -flush_cache(void) -{ - struct cache_entry *ent; - - if (!cache_dirty) - return; - - xlseek(cfd, 0, SEEK_SET); - RB_FOREACH(ent, cache, &cache_head) - xwrite(cfd, &ent->blk_desc, sizeof(ent->blk_desc)); -} - -static void -free_cache(void) -{ - struct cache_entry *ent, *tmp; - - RB_FOREACH_SAFE(ent, cache, &cache_head, tmp) { - RB_REMOVE(cache, &cache_head, ent); - free_cache_entry(ent); - } -} - -static uint64_t -cache_nr_entries(void) -{ - struct stat sb; - - if (fstat(cfd, &sb) < 0) - err(1, "fstat"); - return sb.st_size / sizeof(struct blk_desc); -} - static void append_snap(struct snapshot *snap) { @@ -304,26 +187,12 @@ append_blk(uint8_t *buf, struct blk_desc *blk_desc) snaphdr.store_size += blk_desc->size; } -static int -lookup_blk_desc(uint8_t *md, struct blk_desc *blk_desc) -{ - struct cache_entry *ent, key; - - memcpy(key.blk_desc.md, md, sizeof(key.blk_desc.md)); - ent = RB_FIND(cache, &cache_head, &key); - if (ent != NULL) { - *blk_desc = ent->blk_desc; - return 0; - } - return -1; -} - static void dedup_chunk(struct snapshot *snap, uint8_t *chunkp, size_t chunk_size) { uint8_t md[MDSIZE]; + struct cache_entry cache_entry; uint8_t *comp_buf; - struct blk_desc blk_desc; size_t n; comp_buf = alloc_buf(comp_size(BLKSIZE_MAX)); @@ -334,21 +203,21 @@ dedup_chunk(struct snapshot *snap, uint8_t *chunkp, size_t chunk_size) snaphdr.st.orig_size += chunk_size; snaphdr.st.comp_size += n; - if (lookup_blk_desc(md, &blk_desc) < 0) { - struct cache_entry *ent; + memcpy(cache_entry.md, md, sizeof(cache_entry.md)); + if (lookup_cache_entry(&cache_entry) < 0) { + struct blk_desc blk_desc; - memcpy(blk_desc.md, md, sizeof(blk_desc.md)); + memcpy(&blk_desc.md, md, sizeof(blk_desc.md)); blk_desc.offset = snaphdr.store_size; blk_desc.size = n; snap->blk_desc[snap->nr_blk_descs++] = blk_desc; - append_blk(comp_buf, &blk_desc); - ent = alloc_cache_entry(); - ent->blk_desc = blk_desc; - add_cache_entry(ent); + cache_entry.offset = blk_desc.offset; + cache_entry.size = blk_desc.size; cache_dirty = 1; + add_cache_entry(&cache_entry); cache_misses++; snaphdr.st.dedup_size += blk_desc.size; @@ -359,6 +228,11 @@ dedup_chunk(struct snapshot *snap, uint8_t *chunkp, size_t chunk_size) if (blk_desc.size < snaphdr.st.min_blk_size) snaphdr.st.min_blk_size = blk_desc.size; } else { + struct blk_desc blk_desc; + + memcpy(&blk_desc.md, cache_entry.md, sizeof(blk_desc.md)); + blk_desc.offset = cache_entry.offset; + blk_desc.size = cache_entry.size; snap->blk_desc[snap->nr_blk_descs++] = blk_desc; cache_hits++; } @@ -494,19 +368,21 @@ rebuild_cache(struct snapshot *snap, void *arg) buf = alloc_buf(comp_size(BLKSIZE_MAX)); for (i = 0; i < snap->nr_blk_descs; i++) { - struct cache_entry *ent; + struct cache_entry cache_entry; + struct blk_desc *blk_desc; - read_blk(buf, &snap->blk_desc[i]); + blk_desc = &snap->blk_desc[i]; + read_blk(buf, blk_desc); SHA256_Init(&ctx); - SHA256_Update(&ctx, buf, snap->blk_desc[i].size); + SHA256_Update(&ctx, buf, blk_desc->size); SHA256_Final(md, &ctx); - ent = alloc_cache_entry(); - memcpy(ent->blk_desc.md, md, sizeof(ent->blk_desc.md)); - ent->blk_desc = snap->blk_desc[i]; - add_cache_entry(ent); + memcpy(cache_entry.md, blk_desc->md, sizeof(cache_entry.md)); + cache_entry.offset = blk_desc->offset; + cache_entry.size = blk_desc->size; cache_dirty = 1; + add_cache_entry(&cache_entry); } free(buf); return WALK_CONTINUE; @@ -514,7 +390,7 @@ rebuild_cache(struct snapshot *snap, void *arg) /* Walk through all snapshots and call fn() on each one */ static void -walk(int (*fn)(struct snapshot *, void *), void *arg) +walk_snap(int (*fn)(struct snapshot *, void *), void *arg) { uint64_t i; @@ -539,19 +415,37 @@ walk(int (*fn)(struct snapshot *, void *), void *arg) } } +static int +flush_cache(struct cache_entry *cache_entry) +{ + xwrite(cfd, cache_entry, sizeof(*cache_entry)); + return 0; +} + +static uint64_t +cache_nr_entries(void) +{ + struct stat sb; + + if (fstat(cfd, &sb) < 0) + err(1, "fstat"); + return sb.st_size / sizeof(struct cache_entry); +} + static void -init_cache(void) +load_cache(void) { + uint64_t nr_entries; uint64_t i; xlseek(cfd, 0, SEEK_SET); - for (i = 0; i < cache_nr_entries(); i++) { - struct cache_entry *ent; + nr_entries = cache_nr_entries(); + for (i = 0; i < nr_entries; i++) { + struct cache_entry cache_entry; - ent = alloc_cache_entry(); - if (xread(cfd, &ent->blk_desc, sizeof(ent->blk_desc)) == 0) + if (xread(cfd, &cache_entry, sizeof(cache_entry)) == 0) errx(1, "read: unexpected EOF"); - add_cache_entry(ent); + add_cache_entry(&cache_entry); } } @@ -596,9 +490,9 @@ init(void) } if (cache_nr_entries() != 0) - init_cache(); + load_cache(); else - walk(rebuild_cache, NULL); + walk_snap(rebuild_cache, NULL); } static void @@ -606,7 +500,11 @@ term(void) { if (verbose) print_stats(&snaphdr.st); - flush_cache(); + + if (cache_dirty) { + xlseek(cfd, 0, SEEK_SET); + walk_cache(flush_cache); + } free_cache(); fsync(ifd); @@ -683,20 +581,24 @@ main(int argc, char *argv[]) init(); if (cflag) { - walk(check, NULL); + walk_snap(check, NULL); term(); return 0; } if (lflag) { - walk(list, NULL); + walk_snap(list, NULL); term(); return 0; } if (id) { + struct extract_args args; + str2bin(id, md); - walk(extract, &(struct extract_args){ .md = md, .fd = fd }); + args.md = md; + args.fd = fd; + walk_snap(extract, &args); } else { dedup(fd, msg); } diff --git a/dedup.h b/dedup.h @@ -1,7 +1,14 @@ #include "config.h" +#include "types.h" struct chunker; +/* cache.c */ +void add_cache_entry(struct cache_entry *ent); +int lookup_cache_entry(struct cache_entry *ent); +void walk_cache(int (*fn)(struct cache_entry *)); +void free_cache(void); + /* chunker.c */ struct chunker *alloc_chunker(size_t cap, int fd); void free_chunker(struct chunker *chunker); diff --git a/types.h b/types.h @@ -0,0 +1,44 @@ +#define MSGSIZE 256 +#define MDSIZE 32 + +/* snashot file format version */ +#define VER_MIN 1 +#define VER_MAJ 0 + +struct stats { + uint64_t orig_size; + uint64_t comp_size; + uint64_t dedup_size; + uint64_t min_blk_size; + uint64_t max_blk_size; + uint64_t nr_blks; + uint64_t reserved[6]; +}; + +struct snapshot_hdr { + uint64_t flags; + uint64_t nr_snapshots; + uint64_t store_size; + uint64_t reserved[4]; + struct stats st; +}; + +struct blk_desc { + uint8_t md[MDSIZE]; + uint64_t offset; + uint64_t size; +}; + +struct snapshot { + uint64_t size; + uint8_t msg[MSGSIZE]; + uint8_t md[MDSIZE]; /* hash of snapshot */ + uint64_t nr_blk_descs; + struct blk_desc blk_desc[]; +}; + +struct cache_entry { + uint8_t md[MDSIZE]; + uint64_t offset; + uint64_t size; +};