dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit c0597760335a6dfcfeb37a023f6362567079ba7f
parent 367c26efe780933f7fc2201384d8e1c0feb8a77a
Author: sin <sin@2f30.org>
Date:   Fri, 15 Feb 2019 16:54:09 +0000

Preparation for variable length dedup support

Diffstat:
M LICENSE | 3 ++-
M Makefile | 10 +++++-----
M TODO | 3 ++-
M dedup.1 | 1 +
M dedup.c | 586 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
D sha256.c | 261 -------------------------------------------------------------------------------
D sha256.h | 24 ------------------------
7 files changed, 331 insertions(+), 557 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -1,4 +1,5 @@ -© 2018 Dimitris Papastamos <sin@2f30.org> +© 2019 Dimitris Papastamos <sin@2f30.org> +© 2019 z3bra <contactatz3bradotorg> Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/Makefile b/Makefile @@ -1,18 +1,18 @@ VERSION = 0.0 PREFIX = /usr/local MANPREFIX = $(PREFIX)/man -SRC = dedup.c sha256.c -OBJ = dedup.o sha256.o +SRC = dedup.c +OBJ = dedup.o BIN = dedup DISTFILES = $(SRC) LICENSE Makefile README arg.h dedup.1 tree.h CFLAGS = -g -Wall -CPPFLAGS = -I/usr/local/include +CPPFLAGS = -I/usr/local/include -D_FILE_OFFSET_BITS=64 +LDLIBS = -lcrypto all: $(BIN) -dedup.o: arg.h sha256.h tree.h -sha256.o: sha256.h +dedup.o: arg.h tree.h clean: rm -f $(OBJ) $(BIN) $(BIN)-$(VERSION).tar.gz diff --git a/TODO b/TODO @@ -1,4 +1,5 @@ endianness agnostic version field in entry header -lseek64 support look into variable-length dedup +file locking +overflow checks diff --git a/dedup.1 b/dedup.1 @@ -31,3 +31,4 @@ files will be created. .El .Sh AUTHORS .An Dimitris Papastamos Aq Mt sin@2f30.org , +.An z3bra Aq Mt contactatz3bradotorg . 
diff --git a/dedup.c b/dedup.c @@ -7,47 +7,59 @@ #include <string.h> #include <unistd.h> +#include <openssl/sha.h> + #include "arg.h" -#include "sha256.h" #include "tree.h" #define INDEXF ".index" #define STOREF ".store" #define CACHEF ".cache" -#define BLKSIZ 4096 +#define BLKSIZ 65536 +#define WINSIZ 4095 +#define MDSIZ SHA256_DIGEST_LENGTH + +#define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y)))) + +enum { + WALK_CONTINUE, + WALK_STOP +}; +/* index file header */ struct enthdr { uint64_t flags; uint64_t nents; -} __attribute__((packed)); +}; + +/* block descriptor */ +struct bdescr { + uint8_t md[MDSIZ]; + uint64_t offset; + uint64_t size; +}; +/* index file entry */ struct ent { uint64_t size; - uint8_t reserved[7]; - uint8_t md[32]; + uint8_t md[MDSIZ]; /* hash of file */ uint64_t nblks; - uint64_t blks[]; -} __attribute__((packed)); + struct bdescr bdescr[]; +}; -struct blk { - uint8_t md[32]; - uint64_t size; - uint8_t data[BLKSIZ]; -} __attribute__((packed)); - -struct cache_data { - uint8_t md[32]; - uint64_t blkidx; -} __attribute__((packed)); - -struct cache_ent { - struct cache_data data; - int dirty; - RB_ENTRY(cache_ent) e; +/* cache entry */ +struct cent { + struct bdescr bdescr; + RB_ENTRY(cent) e; +}; + +struct extract_args { + uint8_t *md; + int fd; }; -RB_HEAD(cache, cache_ent) cache_head; +RB_HEAD(cache, cent) cache_head; struct enthdr enthdr; int ifd; int sfd; @@ -55,58 +67,101 @@ int cfd; int verbose; char *argv0; -void -dump_md(const uint8_t *md, size_t len) +/* + * Static table for use in buzhash algorithm. 
+ * 256 * 32 bits randomly generated unique integers + */ +uint32_t buz[] = { + 0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f, + 0xfe8b67d3,0x8155b59e,0x0c893666,0x1d757009,0x17394ee4,0x85d94c07,0xcacd52da,0x076c6f79, + 0xead0a798,0x6c7ccb4a,0x2639a1b8,0x3aa5ae32,0x3e6218d2,0xb290d980,0xa5149521,0x4b426119, + 0xd3230fc7,0x677c1cc4,0x2b64603c,0x01fe92a8,0xbe358296,0xa7e7fac7,0xf509bf41,0x04b017ad, + 0xf900344c,0x8e14e202,0xb2a6e9b4,0x3db3c311,0x960286a8,0xf6bf0468,0xed54ec94,0xf358070c, + 0x6a4795dd,0x3f7b925c,0x5e13a060,0xfaecbafe,0x03c8bb55,0x8a56ba88,0x633e3b49,0xe036bbbe, + 0x1ed3dbb5,0x76e8ad74,0x79d346ab,0x44b4ccc4,0x71eb22d3,0xa1aa3f24,0x50e05b81,0xa3b450d3, + 0x7f5caffb,0xa1990650,0x54c44800,0xda134b65,0x72362eea,0xbd12b8e6,0xf7c99fdc,0x020d48c7, + 0x9d9c3d46,0x32b75615,0xe61923cf,0xadc09d8f,0xab11376b,0xd66fe4cd,0xb3b086b6,0xb8345b9f, + 0x59029667,0xae0e937c,0xcbd4d4ba,0x720bb3fb,0x5f7d2ca3,0xec24ba15,0x6b40109b,0xf0a54587, + 0x3acf9420,0x466e981d,0xc66dc124,0x150ef7b4,0xc3ce718e,0x136774f5,0x46684ab4,0xb4b490f0, + 0x26508a8b,0xf12febc8,0x4b99171b,0xfc373c84,0x339b5677,0x41703ff3,0x7cadbbd7,0x15ea24e2, + 0x7a2f9783,0xed6a383a,0x649eb072,0x79970941,0x2abd28ad,0x4375e00c,0x9df084f7,0x6fdeec6c, + 0x6619ac6d,0x7d256f4d,0x9b8e658a,0x3d7627e9,0xd5a98d45,0x15f84223,0x9b6acef5,0xf876be67, + 0xe3ae7089,0x84e2b64a,0x6818a969,0x86e9ba4e,0xa24a5b57,0x61570cf1,0xa5f8fc91,0x879d8383, + 0x91b13866,0x75e87961,0x16db8138,0x5a2ff6b8,0x8f664e9b,0x894e1496,0x88235c5b,0xcdb3b580, + 0xa2e80109,0xb0f88a82,0xd12cd340,0x93fbc37d,0xf4d1eb82,0xce42f309,0x16ffd2c2,0xb4dfef2b, + 0xb8b1a33e,0x4708a5e6,0xba66dd88,0xa9ec0da6,0x6f8ee2c9,0xad8b9993,0x1d6a25a8,0x1f3d08ce, + 0x149c04e7,0x5cd1fa51,0xb84c89c7,0xeced6f8c,0xe328b30f,0x084fa836,0x6d1bb1b7,0x94c78ea5, + 0x14973034,0xf1a1bcef,0x48b798d2,0xded9ca9e,0x5fd965d0,0x92544eb1,0x5e80f189,0xcbbf5e15, + 0x4d8121f0,0x5dd3b92f,0xd9ea98fb,0x2dbf5644,0x0fbcb9b7,0x20a1db53,0x7c3fcc98,0x36744fbd, + 
0xced08954,0x8e7c5efe,0x3c5f6733,0x657477be,0x3630a02d,0x38bcbda0,0xb7702575,0x4a7f4bce, + 0x0e7660fe,0x4dcb91b5,0x4fd7ffd3,0x041821c1,0xa846a181,0xc8048e9e,0xd4b05072,0x986e0509, + 0xa00aaeeb,0x02e3526a,0x2fac4843,0xfa98e805,0x923ecd8d,0x395d9546,0x8674c3cd,0xae5a8a71, + 0x966dfe45,0x5c9ceba5,0x0830a1cf,0xa1750981,0x8f604480,0x28ea0c9a,0x0da12413,0x98b0b3c5, + 0xa21d473a,0x96ce4308,0xe9a1001b,0x8bbacb44,0x18bad3f4,0xe3121acb,0x46a9b45f,0x92cd9704, + 0xc1a7c619,0x3281e361,0x462e8c79,0x9e572f93,0x7239e5f0,0x67d8e6ba,0x13747ce3,0xf01ee64a, + 0xe7d0ae12,0xeea04088,0xe5b36767,0x17558eae,0x678ffbe6,0xe0bbc866,0x0c24adec,0xa9cbb869, + 0x3fd44ee1,0x9ca4ca06,0x04c0ef00,0x04589a21,0x9cf9c819,0x976f6ca1,0x8a30e66a,0x004d6f7e, + 0x384c8851,0x5bc97eb8,0xc6c49339,0x5aa386c7,0x74bdf8af,0x9b713750,0x4112f8c2,0x2895dae1, + 0xf576d905,0x9de98bce,0xb2b26bcd,0xd46707a0,0x147fbb46,0xa52c6e50,0xe43128fc,0x374ad964, + 0x8dfd4d53,0xc4d0c087,0x31dfb5ca,0xa44589b5,0x6b637e2e,0x663f6b45,0xd2d8baa0,0x1dac7e4c +}; + +/* Buzhash: https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial */ +uint32_t +buzh_init(uint8_t *buf, size_t size) { size_t i; + uint32_t fp = 0; - for (i = 0; i < len; i++) - fprintf(stderr, "%02x", md[i]); + for (i = size - 1; i > 0; i--, buf++) + fp ^= ROTL(buz[*buf], i % 32); + + return fp ^ buz[*buf]; } -void -dump_enthdr(struct enthdr *hdr) +uint32_t +buzh_update(uint32_t fp, uint8_t in, uint8_t out, size_t size) { - fprintf(stderr, "hdr->flags = %llx\n", - (unsigned long long)hdr->flags); - fprintf(stderr, "hdr->nents = %llx\n", - (unsigned long long)hdr->nents); + return ROTL(fp, 1) ^ ROTL(buz[out], size % 32) ^ buz[in]; } -void -dump_ent(struct ent *ent) +uint64_t +chunk_blk(uint8_t *buf, size_t size) { - uint64_t i; - - fprintf(stderr, "ent->size: %llu\n", (unsigned long long)ent->size); - fprintf(stderr, "ent->md: "); - dump_md(ent->md, sizeof(ent->md)); - fputc('\n', stderr); - if (verbose) { - fprintf(stderr, "ent->nblks: %llu\n", - (unsigned long 
long)ent->nblks); - for (i = 0; i < ent->nblks; i++) - fprintf(stderr, "ent->blks[%llu]: %llu\n", - (unsigned long long)i, - (unsigned long long)ent->blks[i]); + size_t i; + uint32_t fp; + + /* + * Chunking blocks is decided using a rolling hash + binary pattern. + * The buzhash algorithm is used to "fingerprint" a fixed size window. + * Once the lower 13 bits of this fingerprint are all zeros, + * the block is chunked. + * If the pattern can't be matched, then we return the buffer size. + */ + fp = buzh_init(buf, WINSIZ); + for (i = 1; i < size - WINSIZ; i++) { + fp = buzh_update(fp, buf[i - 1], buf[i + WINSIZ], WINSIZ); + if ((fp & 0x00001fff) == 0) + return i + WINSIZ; } + return size; } void -dump_blk(struct blk *blk) +print_md(const uint8_t *md, size_t size) { - fprintf(stderr, "blk->md: "); - dump_md(blk->md, sizeof(blk->md)); - putchar('\n'); - fprintf(stderr, "blk->size: %llu\n", (unsigned long long)blk->size); + size_t i; + + for (i = 0; i < size; i++) + fprintf(stderr, "%02x", md[i]); } void str2bin(char *s, uint8_t *d) { - size_t i, len = strlen(s) / 2; + size_t i, size = strlen(s) / 2; - for (i = 0; i < len; i++, s += 2) + for (i = 0; i < size; i++, s += 2) sscanf(s, "%2hhx", &d[i]); } @@ -151,64 +206,68 @@ xwrite(int fd, const void *buf, size_t nbytes) } int -cache_ent_cmp(struct cache_ent *e1, struct cache_ent *e2) +cent_cmp(struct cent *e1, struct cent *e2) { int r; - r = memcmp(e1->data.md, e2->data.md, sizeof(e1->data.md)); + r = memcmp(e1->bdescr.md, e2->bdescr.md, sizeof(e1->bdescr.md)); if (r > 0) return 1; else if (r < 0) return -1; return 0; } -RB_PROTOTYPE(cache, cache_ent, e, cache_ent_cmp); -RB_GENERATE(cache, cache_ent, e, cache_ent_cmp); +RB_PROTOTYPE(cache, cent, e, cent_cmp); +RB_GENERATE(cache, cent, e, cent_cmp); -struct cache_ent * -alloc_cache_ent(uint8_t *md, uint64_t blkidx) +struct cent * +alloc_cent(void) { - struct cache_ent *ent; + struct cent *ent; ent = calloc(1, sizeof(*ent)); if (ent == NULL) - err(1, "malloc"); - 
memcpy(&ent->data.md, md, sizeof(ent->data.md)); - ent->data.blkidx = blkidx; + err(1, "calloc"); return ent; } void -add_cache_ent(struct cache_ent *ent) +add_cent(struct cent *cent) { - RB_INSERT(cache, &cache_head, ent); + RB_INSERT(cache, &cache_head, cent); } void flush_cache(void) { - struct cache_ent *ent; + struct cent *cent; - RB_FOREACH(ent, cache, &cache_head) { - if (!ent->dirty) - continue; - lseek(cfd, ent->data.blkidx * sizeof(ent->data), SEEK_SET); - xwrite(cfd, &ent->data, sizeof(ent->data)); - ent->dirty = 0; - } + lseek(cfd, 0, SEEK_SET); + RB_FOREACH(cent, cache, &cache_head) + xwrite(cfd, &cent->bdescr, sizeof(cent->bdescr)); } void free_cache(void) { - struct cache_ent *ent, *tmp; + struct cent *cent, *tmp; - RB_FOREACH_SAFE(ent, cache, &cache_head, tmp) { - RB_REMOVE(cache, &cache_head, ent); - free(ent); + RB_FOREACH_SAFE(cent, cache, &cache_head, tmp) { + RB_REMOVE(cache, &cache_head, cent); + free(cent); } } +uint64_t +cache_nents(void) +{ + struct stat sb; + + if (fstat(cfd, &sb) < 0) + err(1, "fstat"); + return sb.st_size / sizeof(struct bdescr); +} + void append_ent(struct ent *ent) { @@ -220,7 +279,7 @@ append_ent(struct ent *ent) /* Append entry */ lseek(ifd, 0, SEEK_END); ent->size = sizeof(*ent); - ent->size += ent->nblks * sizeof(ent->blks[0]); + ent->size += ent->nblks * sizeof(ent->bdescr[0]); xwrite(ifd, ent, ent->size); } @@ -231,7 +290,7 @@ alloc_ent(void) ent = calloc(1, sizeof(*ent)); if (ent == NULL) - err(1, "malloc"); + err(1, "calloc"); return ent; } @@ -241,271 +300,265 @@ grow_ent(struct ent *ent, uint64_t nblks) size_t size; size = sizeof(*ent); - size += nblks * sizeof(ent->blks[0]); + size += nblks * sizeof(ent->bdescr[0]); ent = realloc(ent, size); if (ent == NULL) err(1, "realloc"); return ent; } -uint64_t -storefile_nblks(void) +uint8_t * +alloc_buf(size_t size) { - struct stat sb; + void *p; - if (fstat(sfd, &sb) < 0) - err(1, "fstat"); - return sb.st_size / sizeof(struct blk); + p = calloc(1, size); + if (p == 
NULL) + err(1, "calloc"); + return p; } -uint64_t -cachefile_nblks(void) +void +hash_blk(uint8_t *buf, size_t size, uint8_t *md) { - struct stat sb; + SHA256_CTX ctx; - if (fstat(cfd, &sb) < 0) - err(1, "fstat"); - return sb.st_size / sizeof(struct cache_data); + SHA256_Init(&ctx); + SHA256_Update(&ctx, buf, size); + SHA256_Final(md, &ctx); } void -hash_blk(struct blk *blk) +read_blk(uint8_t *buf, struct bdescr *bdescr) { - sha256_context ctx; - - sha256_starts(&ctx); - sha256_update(&ctx, blk->data, blk->size); - sha256_finish(&ctx, blk->md); + lseek(sfd, bdescr->offset, SEEK_SET); + if (xread(sfd, buf, bdescr->size) == 0) + errx(1, "read: unexpected EOF"); } void -read_blk(struct blk *blk, off_t blkidx) +append_blk(uint8_t *buf, size_t size) { - lseek(sfd, blkidx * sizeof(*blk), SEEK_SET); - if (xread(sfd, blk, sizeof(*blk)) == 0) - errx(1, "unexpected EOF"); + lseek(sfd, 0, SEEK_END); + xwrite(sfd, buf, size); } -void -append_blk(struct blk *blk) +off_t +store_size(void) { - lseek(sfd, 0, SEEK_END); - xwrite(sfd, blk, sizeof(*blk)); + return lseek(sfd, 0, SEEK_END); } int -lookup_blk(struct blk *blk, uint64_t *blkidx) +lookup_blk(uint8_t *md, struct bdescr *bdescr) { - struct cache_ent *ent, key; + struct cent *ent, key; - memcpy(key.data.md, blk->md, sizeof(key.data.md)); + memcpy(key.bdescr.md, md, sizeof(key.bdescr.md)); ent = RB_FIND(cache, &cache_head, &key); if (ent != NULL) { - *blkidx = ent->data.blkidx; + *bdescr = ent->bdescr; return 0; } return -1; } void -extract(char *id, int fd) -{ - uint8_t md[32]; - uint64_t nblks, i; - - str2bin(id, md); - nblks = storefile_nblks(); - lseek(ifd, sizeof(enthdr), SEEK_SET); - for (i = 0; i < enthdr.nents; i++) { - uint64_t j; - struct ent *ent; - - /* Load index entry */ - ent = alloc_ent(); - if (xread(ifd, ent, sizeof(*ent)) == 0) - errx(1, "unexpected EOF"); - - /* Check if we've located the right file */ - if (memcmp(ent->md, md, sizeof(ent->md)) != 0) { - free(ent); - /* Skip over index entry block table */ - 
lseek(ifd, ent->nblks * sizeof(ent->blks[0]), SEEK_CUR); - continue; - } - - /* Load index entry block table */ - ent = grow_ent(ent, ent->nblks); - if (xread(ifd, ent->blks, - ent->nblks * sizeof(ent->blks[0])) == 0) - errx(1, "unexpected EOF"); - - /* Blast file blocks to file descriptor */ - for (j = 0; j < ent->nblks; j++) { - struct blk blk; - - if (ent->blks[j] > nblks) - errx(1, "index is corrupted"); - read_blk(&blk, ent->blks[j]); - xwrite(fd, blk.data, blk.size); - } - free(ent); - break; - } - if (i == enthdr.nents) - errx(1, "%s: unknown hash %s", __func__, id); -} - -void dedup(int fd) { - sha256_context ctx; - struct blk blk; + uint8_t md[MDSIZ]; + uint8_t *buf; + SHA256_CTX ctx; struct ent *ent; ssize_t n; + buf = alloc_buf(BLKSIZ); ent = alloc_ent(); - sha256_starts(&ctx); - while ((n = xread(fd, blk.data, BLKSIZ)) > 0) { - uint64_t blkidx; - blk.size = n; - hash_blk(&blk); + SHA256_Init(&ctx); + while ((n = xread(fd, buf, BLKSIZ)) > 0) { + struct bdescr bdescr; + + hash_blk(buf, n, md); + + /* Calculate file hash one block at a time */ + SHA256_Update(&ctx, buf, n); - /* Rolling hash of input stream */ - sha256_update(&ctx, blk.data, blk.size); - /* Prepare for adding a new block index for this entry */ ent = grow_ent(ent, ent->nblks + 1); + if (lookup_blk(md, &bdescr) < 0) { + struct bdescr bdescr; + struct cent *cent; - if (lookup_blk(&blk, &blkidx) < 0) { - struct cache_ent *cache_ent; + /* Block not found, create new block descriptor */ + memcpy(bdescr.md, md, sizeof(bdescr)); + bdescr.offset = store_size(); + bdescr.size = n; - blkidx = storefile_nblks(); + /* Update index entry */ + ent->bdescr[ent->nblks++] = bdescr; - /* Create a cache entry for this block */ - cache_ent = alloc_cache_ent(blk.md, blkidx); - add_cache_ent(cache_ent); - cache_ent->dirty = 1; + /* Store block */ + append_blk(buf, n); - ent->blks[ent->nblks++] = blkidx; - append_blk(&blk); + /* Create a cache entry for this block */ + cent = alloc_cent(); + cent->bdescr = 
bdescr; + add_cent(cent); } else { - ent->blks[ent->nblks++] = blkidx; + /* Found block with the same hash, update index entry */ + ent->bdescr[ent->nblks++] = bdescr; } } if (ent->nblks > 0) { /* Calculate hash and add this entry to the index */ - sha256_finish(&ctx, ent->md); + SHA256_Final(ent->md, &ctx); append_ent(ent); } + free(ent); + free(buf); +} - flush_cache(); +int +extract(struct ent *ent, void *arg) +{ + uint8_t *buf; + struct extract_args *args = arg; + uint64_t i; + + if (memcmp(ent->md, args->md, sizeof(ent->md)) != 0) + return WALK_CONTINUE; + + buf = alloc_buf(BLKSIZ); + for (i = 0; i < ent->nblks; i++) { + read_blk(buf, &ent->bdescr[i]); + xwrite(args->fd, buf, ent->bdescr[i].size); + } + free(buf); + return WALK_STOP; } -void -check(void) +int +check(struct ent *ent, void *arg) { - uint64_t nblks, i, j; + uint8_t md[MDSIZ]; + uint8_t *buf; + SHA256_CTX ctx; + uint64_t i; - nblks = storefile_nblks(); - lseek(ifd, sizeof(enthdr), SEEK_SET); - for (i = 0; i < enthdr.nents; i++) { - uint8_t md[32]; - sha256_context ctx; - struct ent *ent; + buf = alloc_buf(BLKSIZ); + /* + * Calculate hash for each block and compare + * with index entry block descriptor + */ + for (i = 0; i < ent->nblks; i++) { + read_blk(buf, &ent->bdescr[i]); - ent = alloc_ent(); - if (xread(ifd, ent, sizeof(*ent)) == 0) - errx(1, "unexpected EOF"); - ent = grow_ent(ent, ent->nblks); - if (xread(ifd, ent->blks, - ent->nblks * sizeof(ent->blks[0])) == 0) - errx(1, "unexpected EOF"); - - sha256_starts(&ctx); - for (j = 0; j < ent->nblks; j++) { - struct blk blk; - - if (ent->blks[j] > nblks) - errx(1, "index is corrupted"); - read_blk(&blk, ent->blks[j]); - sha256_update(&ctx, blk.data, blk.size); - } - sha256_finish(&ctx, md); + SHA256_Init(&ctx); + SHA256_Update(&ctx, buf, ent->bdescr[i].size); + SHA256_Final(md, &ctx); - if (memcmp(ent->md, md, sizeof(ent->md)) != 0) - errx(1, "hash mismatch"); + if (memcmp(ent->bdescr[i].md, md, + sizeof(ent->bdescr[i]).md) == 0) + continue; - 
free(ent); + fprintf(stderr, "Block hash mismatch\n"); + fprintf(stderr, " Expected hash: "); + print_md(ent->md, sizeof(ent->md)); + fputc('\n', stderr); + fprintf(stderr, " Actual hash: "); + print_md(md, sizeof(md)); + fputc('\n', stderr); + fprintf(stderr, " Offset: %llu\n", + (unsigned long long)ent->bdescr[i].offset); + fprintf(stderr, " Size: %llu\n", + (unsigned long long)ent->bdescr[i].size); } + free(buf); + return WALK_CONTINUE; } -void -list(void) +int +list(struct ent *ent, void *arg) +{ + print_md(ent->md, sizeof(ent->md)); + putchar('\n'); + return WALK_CONTINUE; +} + +int +rebuild_cache(struct ent *ent, void *arg) { + uint8_t md[MDSIZ]; + uint8_t *buf; + SHA256_CTX ctx; uint64_t i; - lseek(ifd, sizeof(enthdr), SEEK_SET); - for (i = 0; i < enthdr.nents; i++) { - struct ent ent; - size_t i; - - if (xread(ifd, &ent, sizeof(ent)) == 0) - errx(1, "unexpected EOF"); - - for (i = 0; i < sizeof(ent.md); i++) - printf("%02x", ent.md[i]); - if (verbose) - printf(" %llu", (unsigned long long)ent.nblks * BLKSIZ); - putchar('\n'); - lseek(ifd, ent.nblks * sizeof(ent.blks[0]), SEEK_CUR); + buf = alloc_buf(BLKSIZ); + for (i = 0; i < ent->nblks; i++) { + struct cent *cent; + + read_blk(buf, &ent->bdescr[i]); + + SHA256_Init(&ctx); + SHA256_Update(&ctx, buf, ent->bdescr[i].size); + SHA256_Final(md, &ctx); + + cent = alloc_cent(); + memcpy(cent->bdescr.md, md, sizeof(cent->bdescr.md)); + cent->bdescr = ent->bdescr[i]; + add_cent(cent); } + free(buf); + return WALK_CONTINUE; } +/* Walk through all index entries and call fn() on each one */ void -rebuild_cache(void) +walk(int (*fn)(struct ent *, void *), void *arg) { - uint64_t nblks, i; + struct ent *ent; + uint64_t i; - if (verbose) - fprintf(stderr, "rebuilding cache..."); - nblks = storefile_nblks(); - lseek(cfd, 0, SEEK_SET); - for (i = 0; i < nblks; i++) { - struct cache_ent *ent; - struct blk blk; - - read_blk(&blk, i); - ent = alloc_cache_ent(blk.md, i); - add_cache_ent(ent); - ent->dirty = 1; + lseek(ifd, 
sizeof(enthdr), SEEK_SET); + for (i = 0; i < enthdr.nents; i++) { + ent = alloc_ent(); + if (xread(ifd, ent, sizeof(*ent)) == 0) + errx(1, "read: unexpected EOF"); + + ent = grow_ent(ent, ent->nblks); + if (xread(ifd, ent->bdescr, + ent->nblks * sizeof(ent->bdescr[0])) == 0) + errx(1, "read: unexpected EOF"); + + if ((*fn)(ent, arg) == WALK_STOP) + break; } - flush_cache(); - if (verbose) - fprintf(stderr, "done\n"); + free(ent); } void init_cache(void) { - uint64_t nblks, i; + uint64_t nents, i; - if (verbose) - fprintf(stderr, "initializing cache..."); - nblks = cachefile_nblks(); + nents = cache_nents(); lseek(cfd, 0, SEEK_SET); - for (i = 0; i < nblks; i++) { - struct blk blk; - struct cache_ent *ent; - - ent = alloc_cache_ent(blk.md, i); - if (xread(cfd, &ent->data, sizeof(ent->data)) == 0) - errx(1, "unexpected EOF"); - add_cache_ent(ent); + for (i = 0; i < nents; i++) { + struct cent *cent; + + cent = alloc_cent(); + if (xread(cfd, &cent->bdescr, sizeof(cent->bdescr)) == 0) + errx(1, "read: unexpected EOF"); + add_cent(cent); + + if (verbose) { + fprintf(stderr, "bdescr.offset: %llu bdescr.size: %llu\n", + (unsigned long long)cent->bdescr.offset, + (unsigned long long)cent->bdescr.size); + } } - if (verbose) - fprintf(stderr, "done\n"); } void @@ -530,15 +583,16 @@ init(void) if (sb.st_size != 0) xread(ifd, &enthdr, sizeof(enthdr)); - if (cachefile_nblks() != storefile_nblks()) - rebuild_cache(); - else + if (cache_nents() != 0) init_cache(); + else + walk(rebuild_cache, NULL); } void term(void) { + flush_cache(); free_cache(); fsync(ifd); @@ -560,6 +614,7 @@ usage(void) int main(int argc, char *argv[]) { + uint8_t md[MDSIZ]; char *id = NULL, *root = NULL; int fd = -1, lflag = 0, cflag = 0; @@ -611,19 +666,20 @@ main(int argc, char *argv[]) init(); if (cflag) { - check(); + walk(check, NULL); term(); return 0; } if (lflag) { - list(); + walk(list, NULL); term(); return 0; } if (id) { - extract(id, fd); + str2bin(id, md); + walk(extract, &(struct 
extract_args){ .md = md, .fd = fd }); } else { dedup(fd); } diff --git a/sha256.c b/sha256.c @@ -1,261 +0,0 @@ -/* - * FIPS-180-2 compliant SHA-256 implementation - * - * Copyright (C) 2001-2003 Christophe Devine - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <string.h> - -#include "sha256.h" - -#define GET_UINT32(n,b,i) \ -{ \ - (n) = ( (uint32) (b)[(i) ] << 24 ) \ - | ( (uint32) (b)[(i) + 1] << 16 ) \ - | ( (uint32) (b)[(i) + 2] << 8 ) \ - | ( (uint32) (b)[(i) + 3] ); \ -} - -#define PUT_UINT32(n,b,i) \ -{ \ - (b)[(i) ] = (uint8) ( (n) >> 24 ); \ - (b)[(i) + 1] = (uint8) ( (n) >> 16 ); \ - (b)[(i) + 2] = (uint8) ( (n) >> 8 ); \ - (b)[(i) + 3] = (uint8) ( (n) ); \ -} - -void sha256_starts( sha256_context *ctx ) -{ - ctx->total[0] = 0; - ctx->total[1] = 0; - - ctx->state[0] = 0x6A09E667; - ctx->state[1] = 0xBB67AE85; - ctx->state[2] = 0x3C6EF372; - ctx->state[3] = 0xA54FF53A; - ctx->state[4] = 0x510E527F; - ctx->state[5] = 0x9B05688C; - ctx->state[6] = 0x1F83D9AB; - ctx->state[7] = 0x5BE0CD19; -} - -void sha256_process( sha256_context *ctx, uint8 data[64] ) -{ - uint32 temp1, temp2, W[64]; - uint32 A, B, C, D, E, F, G, H; - - GET_UINT32( W[0], data, 0 ); - GET_UINT32( W[1], data, 4 ); - GET_UINT32( W[2], data, 8 ); - GET_UINT32( W[3], data, 12 ); - GET_UINT32( W[4], data, 16 ); - 
GET_UINT32( W[5], data, 20 ); - GET_UINT32( W[6], data, 24 ); - GET_UINT32( W[7], data, 28 ); - GET_UINT32( W[8], data, 32 ); - GET_UINT32( W[9], data, 36 ); - GET_UINT32( W[10], data, 40 ); - GET_UINT32( W[11], data, 44 ); - GET_UINT32( W[12], data, 48 ); - GET_UINT32( W[13], data, 52 ); - GET_UINT32( W[14], data, 56 ); - GET_UINT32( W[15], data, 60 ); - -#define SHR(x,n) ((x & 0xFFFFFFFF) >> n) -#define ROTR(x,n) (SHR(x,n) | (x << (32 - n))) - -#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)) -#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) - -#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) -#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) - -#define F0(x,y,z) ((x & y) | (z & (x | y))) -#define F1(x,y,z) (z ^ (x & (y ^ z))) - -#define R(t) \ -( \ - W[t] = S1(W[t - 2]) + W[t - 7] + \ - S0(W[t - 15]) + W[t - 16] \ -) - -#define P(a,b,c,d,e,f,g,h,x,K) \ -{ \ - temp1 = h + S3(e) + F1(e,f,g) + K + x; \ - temp2 = S2(a) + F0(a,b,c); \ - d += temp1; h = temp1 + temp2; \ -} - - A = ctx->state[0]; - B = ctx->state[1]; - C = ctx->state[2]; - D = ctx->state[3]; - E = ctx->state[4]; - F = ctx->state[5]; - G = ctx->state[6]; - H = ctx->state[7]; - - P( A, B, C, D, E, F, G, H, W[ 0], 0x428A2F98 ); - P( H, A, B, C, D, E, F, G, W[ 1], 0x71374491 ); - P( G, H, A, B, C, D, E, F, W[ 2], 0xB5C0FBCF ); - P( F, G, H, A, B, C, D, E, W[ 3], 0xE9B5DBA5 ); - P( E, F, G, H, A, B, C, D, W[ 4], 0x3956C25B ); - P( D, E, F, G, H, A, B, C, W[ 5], 0x59F111F1 ); - P( C, D, E, F, G, H, A, B, W[ 6], 0x923F82A4 ); - P( B, C, D, E, F, G, H, A, W[ 7], 0xAB1C5ED5 ); - P( A, B, C, D, E, F, G, H, W[ 8], 0xD807AA98 ); - P( H, A, B, C, D, E, F, G, W[ 9], 0x12835B01 ); - P( G, H, A, B, C, D, E, F, W[10], 0x243185BE ); - P( F, G, H, A, B, C, D, E, W[11], 0x550C7DC3 ); - P( E, F, G, H, A, B, C, D, W[12], 0x72BE5D74 ); - P( D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE ); - P( C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7 ); - P( B, C, D, E, F, G, H, A, W[15], 0xC19BF174 ); - P( A, B, C, D, E, F, G, H, 
R(16), 0xE49B69C1 ); - P( H, A, B, C, D, E, F, G, R(17), 0xEFBE4786 ); - P( G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6 ); - P( F, G, H, A, B, C, D, E, R(19), 0x240CA1CC ); - P( E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F ); - P( D, E, F, G, H, A, B, C, R(21), 0x4A7484AA ); - P( C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC ); - P( B, C, D, E, F, G, H, A, R(23), 0x76F988DA ); - P( A, B, C, D, E, F, G, H, R(24), 0x983E5152 ); - P( H, A, B, C, D, E, F, G, R(25), 0xA831C66D ); - P( G, H, A, B, C, D, E, F, R(26), 0xB00327C8 ); - P( F, G, H, A, B, C, D, E, R(27), 0xBF597FC7 ); - P( E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3 ); - P( D, E, F, G, H, A, B, C, R(29), 0xD5A79147 ); - P( C, D, E, F, G, H, A, B, R(30), 0x06CA6351 ); - P( B, C, D, E, F, G, H, A, R(31), 0x14292967 ); - P( A, B, C, D, E, F, G, H, R(32), 0x27B70A85 ); - P( H, A, B, C, D, E, F, G, R(33), 0x2E1B2138 ); - P( G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC ); - P( F, G, H, A, B, C, D, E, R(35), 0x53380D13 ); - P( E, F, G, H, A, B, C, D, R(36), 0x650A7354 ); - P( D, E, F, G, H, A, B, C, R(37), 0x766A0ABB ); - P( C, D, E, F, G, H, A, B, R(38), 0x81C2C92E ); - P( B, C, D, E, F, G, H, A, R(39), 0x92722C85 ); - P( A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1 ); - P( H, A, B, C, D, E, F, G, R(41), 0xA81A664B ); - P( G, H, A, B, C, D, E, F, R(42), 0xC24B8B70 ); - P( F, G, H, A, B, C, D, E, R(43), 0xC76C51A3 ); - P( E, F, G, H, A, B, C, D, R(44), 0xD192E819 ); - P( D, E, F, G, H, A, B, C, R(45), 0xD6990624 ); - P( C, D, E, F, G, H, A, B, R(46), 0xF40E3585 ); - P( B, C, D, E, F, G, H, A, R(47), 0x106AA070 ); - P( A, B, C, D, E, F, G, H, R(48), 0x19A4C116 ); - P( H, A, B, C, D, E, F, G, R(49), 0x1E376C08 ); - P( G, H, A, B, C, D, E, F, R(50), 0x2748774C ); - P( F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5 ); - P( E, F, G, H, A, B, C, D, R(52), 0x391C0CB3 ); - P( D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A ); - P( C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F ); - P( B, C, D, E, F, G, H, A, R(55), 0x682E6FF3 ); - P( A, B, C, D, E, F, G, H, 
R(56), 0x748F82EE ); - P( H, A, B, C, D, E, F, G, R(57), 0x78A5636F ); - P( G, H, A, B, C, D, E, F, R(58), 0x84C87814 ); - P( F, G, H, A, B, C, D, E, R(59), 0x8CC70208 ); - P( E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA ); - P( D, E, F, G, H, A, B, C, R(61), 0xA4506CEB ); - P( C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7 ); - P( B, C, D, E, F, G, H, A, R(63), 0xC67178F2 ); - - ctx->state[0] += A; - ctx->state[1] += B; - ctx->state[2] += C; - ctx->state[3] += D; - ctx->state[4] += E; - ctx->state[5] += F; - ctx->state[6] += G; - ctx->state[7] += H; -} - -void sha256_update( sha256_context *ctx, uint8 *input, uint32 length ) -{ - uint32 left, fill; - - if( ! length ) return; - - left = ctx->total[0] & 0x3F; - fill = 64 - left; - - ctx->total[0] += length; - ctx->total[0] &= 0xFFFFFFFF; - - if( ctx->total[0] < length ) - ctx->total[1]++; - - if( left && length >= fill ) - { - memcpy( (void *) (ctx->buffer + left), - (void *) input, fill ); - sha256_process( ctx, ctx->buffer ); - length -= fill; - input += fill; - left = 0; - } - - while( length >= 64 ) - { - sha256_process( ctx, input ); - length -= 64; - input += 64; - } - - if( length ) - { - memcpy( (void *) (ctx->buffer + left), - (void *) input, length ); - } -} - -static uint8 sha256_padding[64] = -{ - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -void sha256_finish( sha256_context *ctx, uint8 digest[32] ) -{ - uint32 last, padn; - uint32 high, low; - uint8 msglen[8]; - - high = ( ctx->total[0] >> 29 ) - | ( ctx->total[1] << 3 ); - low = ( ctx->total[0] << 3 ); - - PUT_UINT32( high, msglen, 0 ); - PUT_UINT32( low, msglen, 4 ); - - last = ctx->total[0] & 0x3F; - padn = ( last < 56 ) ? 
( 56 - last ) : ( 120 - last ); - - sha256_update( ctx, sha256_padding, padn ); - sha256_update( ctx, msglen, 8 ); - - PUT_UINT32( ctx->state[0], digest, 0 ); - PUT_UINT32( ctx->state[1], digest, 4 ); - PUT_UINT32( ctx->state[2], digest, 8 ); - PUT_UINT32( ctx->state[3], digest, 12 ); - PUT_UINT32( ctx->state[4], digest, 16 ); - PUT_UINT32( ctx->state[5], digest, 20 ); - PUT_UINT32( ctx->state[6], digest, 24 ); - PUT_UINT32( ctx->state[7], digest, 28 ); -} diff --git a/sha256.h b/sha256.h @@ -1,24 +0,0 @@ -#ifndef _SHA256_H -#define _SHA256_H - -#ifndef uint8 -#define uint8 unsigned char -#endif - -#ifndef uint32 -#define uint32 unsigned long int -#endif - -typedef struct -{ - uint32 total[2]; - uint32 state[8]; - uint8 buffer[64]; -} -sha256_context; - -void sha256_starts( sha256_context *ctx ); -void sha256_update( sha256_context *ctx, uint8 *input, uint32 length ); -void sha256_finish( sha256_context *ctx, uint8 digest[32] ); - -#endif /* sha256.h */