dedup

deduplicating backup program
git clone git://git.2f30.org/dedup
Log | Files | Refs | README | LICENSE

commit 891041b03c66d2c37f4aef65de717e0f3740d1d1
parent bb81bcb6bd4b4bf7dbe36d2a47087dec0c58f812
Author: sin <sin@2f30.org>
Date:   Wed,  1 May 2019 20:54:49 +0100

Some more comments in bstorage.c

Diffstat:
Mbstorage.c | 111+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
1 file changed, 75 insertions(+), 36 deletions(-)

diff --git a/bstorage.c b/bstorage.c @@ -2,10 +2,10 @@ * Storage layer implementation using a single backing file. * The file format is as follows: * - * [storage header] - * [storage descriptor 0] + * [block header] + * [block descriptor 0] * [data] - * [storage descriptor 1] + * [block descriptor 1] * [data] * ... */ @@ -26,31 +26,30 @@ #include "queue.h" #include "tree.h" +/* block header flags */ +#define BHDRMAGIC "DEDUPDIDUPDIDUP" +#define NBHDRMAGIC sizeof(BHDRMAGIC) #define VMIN 0 #define VMAJ 1 - #define VMINMASK 0xff #define VMAJSHIFT 8 #define VMAJMASK 0xff - #define HALGOSHIFT 19 #define HALGOMASK 0x7 #define CALGOSHIFT 16 #define CALGOMASK 0x7 - -#define BHDRMAGIC "DEDUPDIDUPDIDUP" -#define NBHDRMAGIC sizeof(BHDRMAGIC) -#define BD2BTYPE 0x100 -#define BD2STYPE 0x101 - -#define BHDRSIZE (NBHDRMAGIC + 8 + 8) -#define BDSIZE (8 + 8 + 8 + 8 + (MDSIZE)) - #define CNONETYPE 0 #define CSNAPPYTYPE 1 #define HBLAKE2BTYPE 0 #define HBLAKE2STYPE 1 +#define BHDRSIZE (NBHDRMAGIC + 8 + 8) + +/* block descriptor flags */ +#define BD2BTYPE 0x100 +#define BD2STYPE 0x101 +#define BDSIZE (8 + 8 + 8 + 8 + (MDSIZE)) +/* misc helpers */ extern ssize_t xread(int, void *, size_t); extern ssize_t xwrite(int, void *, size_t); extern int pack(unsigned char *, char *, ...); @@ -80,30 +79,30 @@ static struct bops bops = { /* Block header structure */ struct bhdr { - char magic[NBHDRMAGIC]; - uint64_t flags; - uint64_t nbd; + char magic[NBHDRMAGIC]; /* magic number for file(1) */ + uint64_t flags; /* version number, compression/hashing configuration */ + uint64_t nbd; /* number of block descriptors */ }; /* Block descriptor */ struct bd { - uint16_t type; - uint8_t reserved[6]; - uint64_t offset; /* offset of block */ - uint64_t size; /* size of block */ - uint64_t refcnt; - unsigned char md[MDSIZE]; - RB_ENTRY(bd) rbe; - SLIST_ENTRY(bd) sle; + uint16_t type; /* type of hashing algorithm used */ + uint8_t reserved[6]; /* should be set to 0 when writing */ + uint64_t offset; /* block offset */ + uint64_t size; /* block size */ + uint64_t refcnt; /* reference count of block, 0 if block is removed */ + unsigned char md[MDSIZE]; /* hash of block */ + RB_ENTRY(bd) rbe; /* bdcache link node */ + SLIST_ENTRY(bd) sle; /* gchead link node */ }; RB_HEAD(bdcache, bd); /* Storage layer context */ struct sctx { - struct bdcache bdcache; - SLIST_HEAD(gchead, bd) gchead; - struct bhdr bhdr; - int fd; + struct bdcache bdcache; /* cache of block descriptors */ + SLIST_HEAD(gchead, bd) gchead; /* list of all blocks with a zero refcount */ + struct bhdr bhdr; /* block header entry */ + int fd; /* underlying storage file descriptor */ int rdonly; /* when set to 1, the bssync() operation is a no-op */ int type; /* hash algorithm for new blocks */ }; @@ -231,7 +230,7 @@ packbd(int fd, struct bd *bd) return n; } -/* Insert block descriptor to cache */ +/* Load block descriptor from file */ static int loadbd(struct sctx *sctx) { @@ -257,6 +256,16 @@ loadbd(struct sctx *sctx) return -1; } + /* + * When refcount is 0 the block has been removed. + * In that case, the block descriptor is still present + * in the file as it is used to locate the next block + * descriptor which could be live. + * + * The garbage collection list links together all block + * descriptors that have a reference count of 0. + * This is needed to implement the gc operation. + */ if (bd->refcnt > 0) RB_INSERT(bdcache, &sctx->bdcache, bd); else @@ -295,7 +304,7 @@ initbdcache(struct sctx *sctx) return 0; } -/* Create storage */ +/* Create storage file */ static int bscreat(struct bctx *bctx, char *path, int mode, struct bparam *bpar) { @@ -346,6 +355,7 @@ bscreat(struct bctx *bctx, char *path, int mode, struct bparam *bpar) bhdr->nbd = 0; sctx->fd = fd; + /* Write the block header entry to the file */ if (packbhdr(fd, bhdr) < 0) { free(sctx); close(fd); @@ -354,7 +364,7 @@ bscreat(struct bctx *bctx, char *path, int mode, struct bparam *bpar) return 0; } -/* Open storage */ +/* Open storage file */ static int bsopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar) { @@ -387,6 +397,8 @@ bsopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar) RB_INIT(&sctx->bdcache); SLIST_INIT(&sctx->gchead); bhdr = &sctx->bhdr; + + /* Read block header entry from file */ if (unpackbhdr(fd, bhdr) < 0) { free(sctx); close(fd); @@ -442,6 +454,10 @@ bsopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar) sctx->fd = fd; sctx->rdonly = flags == O_RDONLY; + /* + * Initialize block descriptor cache + * and garbage collection list. + */ if (initbdcache(sctx) < 0) { free(sctx); close(fd); @@ -450,7 +466,7 @@ bsopen(struct bctx *bctx, char *path, int flags, int mode, struct bparam *bpar) return 0; } -/* Write a block */ +/* Write a block to the storage file */ static int bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) { @@ -460,7 +476,6 @@ bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) off_t offs; sctx = bctx->sctx; - switch (sctx->type) { case BD2BTYPE: if (b2bhash(buf, n, key.md) < 0) @@ -474,6 +489,11 @@ bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) return -1; } + /* + * If the block is already present in the cache + * just increment the reference count and write back + * the block descriptor associated for that block. + */ bd = RB_FIND(bdcache, &sctx->bdcache, &key); if (bd != NULL) { off_t bdoffs; @@ -482,7 +502,6 @@ bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0) return -1; - /* Block already present, increment the reference count */ bd->refcnt++; if (packbd(sctx->fd, bd) < 0) { bd->refcnt--; @@ -493,10 +512,12 @@ bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) return 0; } + /* New blocks are always appended to the storage file */ offs = lseek(sctx->fd, 0, SEEK_END); if (offs < 0) return -1; + /* Allocate a new block descriptor */ bd = calloc(1, sizeof(*bd)); if (bd == NULL) return -1; @@ -506,25 +527,34 @@ bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) bd->refcnt = 1; memcpy(bd->md, key.md, MDSIZE); + /* Write block descriptor to storage file */ if (packbd(sctx->fd, bd) < 0) { free(bd); return -1; } + /* Append block payload to block descriptor */ if (xwrite(sctx->fd, buf, n) != n) { + /* Shouldn't fail but if it does rewind storage file state */ ftruncate(sctx->fd, offs); free(bd); return -1; } + /* + * Update block entry header. + * The header will be written to the storage file + * when bsclose() or bssync() is called. + */ bhdr = &sctx->bhdr; bhdr->nbd++; + RB_INSERT(bdcache, &sctx->bdcache, bd); memcpy(md, bd->md, MDSIZE); return bd->size; } -/* Read a block */ +/* Read a block from the storage file */ static int bsget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n) { @@ -576,6 +606,7 @@ bsrm(struct bctx *bctx, unsigned char *md) return -1; } + /* This block is still referenced so just return */ if (bd->refcnt > 0) return 0; @@ -600,6 +631,14 @@ bsrm(struct bctx *bctx, unsigned char *md) return 0; } +/* + * Re-punch all holes in the storage file. + * This is needed when the storage file is copied from + * one system to another and back. The target system + * may not support hole punching so the holes will be + * filled with literal zeroes, negating the space saving + * effects. + */ static int bsgc(struct bctx *bctx) {