dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 3205dbd75458fc84e08ca98ee1e1fe17b19f2693
parent 2d0701e96dd5242eefe456dca44a5c2b8ba67eb5
Author: sin <sin@2f30.org>
Date:   Thu, 25 Apr 2019 20:54:30 +0100

Implement dup-check(1)

Diffstat:
M Makefile | 12 ++++++++----
M TODO | 1 -
M bcompress.c | 11 +++++++++++
M block.c | 12 ++++++++++++
M block.h | 2 ++
M bstorage.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
M dotest | 4 ++++
A dup-check.1 | 25 +++++++++++++++++++++++++
A dup-check.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
9 files changed, 192 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 include config.mk
 
-BIN = dup-init dup-pack dup-unpack
-MAN = dup-init.1 dup-pack.1 dup-unpack.1
+BIN = dup-check dup-init dup-pack dup-unpack
+MAN = dup-check.1 dup-init.1 dup-pack.1 dup-unpack.1
 
 HDR = \
 	arg.h \
@@ -24,6 +24,7 @@ COMMOBJ = \
 	snap.o \
 	unpack.o \
 
+DCHECKOBJ = $(COMMOBJ) dup-check.o
 DINITOBJ = $(COMMOBJ) dup-init.o
 DPACKOBJ = $(COMMOBJ) dup-pack.o
 DUNPACKOBJ = $(COMMOBJ) dup-unpack.o
@@ -32,10 +33,10 @@ LDLIBS = -lsnappy
 
 all: $(BIN)
 
-$(DINITOBJ) $(DPACKOBJ) $(DUNPACKOBJ): $(HDR)
+$(DCHECKOBJ) $(DINITOBJ) $(DPACKOBJ) $(DUNPACKOBJ): $(HDR)
 
 clean:
-	rm -f $(DINITOBJ) $(DPACKOBJ) $(DUNPACKOBJ) $(BIN)
+	rm -f $(DCHECKOBJ) $(DINITOBJ) $(DPACKOBJ) $(DUNPACKOBJ) $(BIN)
 	rm -rf dedup-$(VERSION) dedup-$(VERSION).tar.gz
 
 install: all
@@ -58,6 +59,9 @@ dist: clean
 .c.o:
 	$(CC) $(CPPFLAGS) $(CFLAGS) -c $<
 
+dup-check: $(DCHECKOBJ)
+	$(CC) -o $@ $(DCHECKOBJ) $(LDFLAGS) $(LDLIBS)
+
 dup-init: $(DINITOBJ)
 	$(CC) -o $@ $(DINITOBJ) $(LDFLAGS) $(LDLIBS)
 
diff --git a/TODO b/TODO
@@ -1,5 +1,4 @@
 Use a ring buffer in the chunker (avoid memmove() call)
 Create a library archive out of the blake2b files and link with it
 pledge/unveil support
-Implement dup-check(1)
 Use flock() to avoid corruption
diff --git a/bcompress.c b/bcompress.c
@@ -25,6 +25,7 @@ static int bccreat(struct bctx *bctx, char *path, int mode, struct bparam *bpar)
 static int bcopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar);
 static int bcput(struct bctx *bctx, void *buf, size_t n, unsigned char *md);
 static int bcget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n);
+static int bccheck(struct bctx *bctx, unsigned char *md);
 static int bcsync(struct bctx *bctx);
 static int bcclose(struct bctx *bctx);
 
@@ -33,6 +34,7 @@ static struct bops bops = {
 	.open = bcopen,
 	.put = bcput,
 	.get = bcget,
+	.check = bccheck,
 	.sync = bcsync,
 	.close = bcclose,
 };
@@ -236,6 +238,15 @@ bcget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n)
 }
 
 static int
+bccheck(struct bctx *bctx, unsigned char *md)
+{
+	struct bops *bops = bstorageops();
+
+	return bops->check(bctx, md);
+
+}
+
+static int
 bcsync(struct bctx *bctx)
 {
 	struct bops *bops = bstorageops();
diff --git a/block.c b/block.c
@@ -78,6 +78,18 @@ bget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n)
 }
 
 int
+bcheck(struct bctx *bctx, unsigned char *md)
+{
+	struct bops *bops;
+
+	if (bctx == NULL || md == NULL)
+		return -1;
+
+	bops = bcompressops();
+	return bops->check(bctx, md);
+}
+
+int
 bsync(struct bctx *bctx)
 {
 	struct bops *bops;
diff --git a/block.h b/block.h
@@ -17,6 +17,7 @@ struct bops {
 	int (*open)(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar);
 	int (*put)(struct bctx *bctx, void *buf, size_t n, unsigned char *md);
 	int (*get)(struct bctx *bctx, unsigned char *md, void *buf, size_t *n);
+	int (*check)(struct bctx *bctx, unsigned char *md);
 	int (*sync)(struct bctx *bctx);
 	int (*close)(struct bctx *bctx);
 };
@@ -26,6 +27,7 @@ extern int bcreat(char *path, int mode, struct bparam *bpar, struct bctx **bctx)
 extern int bopen(char *path, int flags, int mode, struct bparam *bpar, struct bctx **bctx);
 extern int bput(struct bctx *bctx, void *buf, size_t n, unsigned char *md);
 extern int bget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n);
+extern int bcheck(struct bctx *bctx, unsigned char *md);
 extern int bsync(struct bctx *bctx);
 extern int bclose(struct bctx *bctx);
 struct bparam *bparamdef(void);
diff --git a/bstorage.c b/bstorage.c
@@ -51,6 +51,7 @@ static int bscreat(struct bctx *bctx, char *path, int mode, struct bparam *bpar)
 static int bsopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar);
 static int bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md);
 static int bsget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n);
+static int bscheck(struct bctx *bctx, unsigned char *md);
 static int bssync(struct bctx *bctx);
 static int bsclose(struct bctx *bctx);
 
@@ -59,6 +60,7 @@ static struct bops bops = {
 	.open = bsopen,
 	.put = bsput,
 	.get = bsget,
+	.check = bscheck,
 	.sync = bssync,
 	.close = bsclose,
 };
@@ -515,6 +517,53 @@ bsget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n)
 	return 0;
 }
 
+/*
+ * Lookup the block and rehash it.  Check that the
+ * resulting hash matches the given hash.
+ */
+static int
+bscheck(struct bctx *bctx, unsigned char *md)
+{
+	struct sctx *sctx;
+	struct bd key, *bd;
+	void *buf;
+
+	sctx = bctx->sctx;
+
+	/* Lookup block in the cache */
+	memcpy(key.md, md, MDSIZE);
+	bd = RB_FIND(bdcache, &sctx->bdcache, &key);
+	if (bd == NULL)
+		return -1;
+
+	buf = malloc(bd->size);
+	if (buf == NULL)
+		return -1;
+
+	if (lseek(sctx->fd, bd->offset, SEEK_SET) < 0) {
+		free(buf);
+		return -1;
+	}
+
+	if (xread(sctx->fd, buf, bd->size) != bd->size) {
+		free(buf);
+		return -1;
+	}
+
+	if (bhash(buf, bd->size, key.md) < 0) {
+		free(buf);
+		return -1;
+	}
+
+	if (memcmp(key.md, md, MDSIZE) != 0) {
+		free(buf);
+		return -1;
+	}
+
+	free(buf);
+	return 0;
+}
+
 /* Sync block header to storage */
 static int
 bssync(struct bctx *bctx)
diff --git a/dotest b/dotest
@@ -9,6 +9,8 @@ test0()
 	./dup-init -Z none "$repo"
 	./dup-pack -r "$repo" snap0 < "$data"
 	./dup-pack -r "$repo" snap1 < "$data"
+	./dup-check -r "$repo" snap0
+	./dup-check -r "$repo" snap1
 	du -sh "$repo"
 	sum0=`sha1sum "$data" | awk '{print $1}'`
 	sum1=`./dup-unpack -r "$repo" snap0 | sha1sum | awk '{print $1}'`
@@ -26,6 +28,8 @@ test1()
 	./dup-init -Z snappy "$repo"
 	./dup-pack -r "$repo" snap0 < "$data"
 	./dup-pack -r "$repo" snap1 < "$data"
+	./dup-check -r "$repo" snap0
+	./dup-check -r "$repo" snap1
 	du -sh "$repo"
 	sum0=`sha1sum "$data" | awk '{print $1}'`
 	sum1=`./dup-unpack -r "$repo" snap0 | sha1sum | awk '{print $1}'`
diff --git a/dup-check.1 b/dup-check.1
@@ -0,0 +1,25 @@
+.Dd April 25, 2019
+.Dt DUP-CHECK 1
+.Os
+.Sh NAME
+.Nm dup-check
+.Nd Check snapshot consistency
+.Sh SYNOPSIS
+.Nm dup-check
+.Op Fl v
+.Op Fl r Ar repo
+.Ar name
+.Sh DESCRIPTION
+.Nm
+checks that a snapshot is internally consistent.
+.Sh OPTIONS
+.Bl -tag -width "-r repo"
+.It Fl r Ar repo
+Repository directory.
+By default the current working directory is used.
+.It Fl v
+Enable verbose mode.
+.El
+.Sh AUTHORS
+.An Dimitris Papastamos Aq Mt sin@2f30.org ,
+.An z3bra Aq Mt contactatz3bradotorg .
diff --git a/dup-check.c b/dup-check.c
@@ -0,0 +1,81 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "arg.h"
+#include "block.h"
+#include "config.h"
+#include "snap.h"
+
+int verbose;
+char *argv0;
+
+static int
+check(struct sctx *sctx, struct bctx *bctx)
+{
+	unsigned char md[MDSIZE];
+	int sn;
+
+	while ((sn = sget(sctx, md)) == MDSIZE) {
+		if (bcheck(bctx, md) < 0)
+			return -1;
+	}
+	if (sn < 0)
+		return -1;
+	return 0;
+}
+
+static void
+usage(void)
+{
+	fprintf(stderr, "usage: %s [-v] [-r repo] name\n", argv0);
+	exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+	char path[PATH_MAX];
+	struct sctx *sctx;
+	struct bctx *bctx;
+	struct bparam bparam;
+	char *repo = ".";
+
+	ARGBEGIN {
+	case 'r':
+		repo = EARGF(usage());
+		break;
+	case 'v':
+		verbose++;
+		break;
+	default:
+		usage();
+	} ARGEND
+
+	if (argc != 1)
+		usage();
+
+	snprintf(path, sizeof(path), "%s/archive/%s", repo, argv[0]);
+	if (sopen(path, O_RDONLY, 0600, &sctx) < 0)
+		errx(1, "sopen: %s: failed", path);
+
+	snprintf(path, sizeof(path), "%s/storage", repo);
+	if (bopen(path, O_RDONLY, 0600, &bparam, &bctx) <0)
+		errx(1, "bopen: %s: failed", path);
+
+	if (check(sctx, bctx) < 0)
+		errx(1, "dedup: failed");
+
+	if (bclose(bctx) < 0)
+		errx(1, "bclose: failed");
+	if (sclose(sctx) < 0)
+		errx(1, "sclose: failed");
+
+	return 0;
+}