dedup

small dedup program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | LICENSE

commit 72a5c2c2269c959061bfb34a05f29875653f3e92
Author: sin <sin@2f30.org>
Date:   Tue, 20 Mar 2018 16:02:29 +0000

Initial commit

Diffstat:
LICENSE | 13+++++++++++++
arg.h | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
dedup.c | 320+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 398 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -0,0 +1,13 @@ +© 2018 Dimitris Papastamos <sin@2f30.org> + +Permission to use, copy, modify, and distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/arg.h b/arg.h @@ -0,0 +1,65 @@ +/* + * Copy me if you can. + * by 20h + */ + +#ifndef ARG_H__ +#define ARG_H__ + +extern char *argv0; + +/* use main(int argc, char *argv[]) */ +#define ARGBEGIN for (argv0 = *argv, argv++, argc--;\ + argv[0] && argv[0][0] == '-'\ + && argv[0][1];\ + argc--, argv++) {\ + char argc_;\ + char **argv_;\ + int brk_;\ + if (argv[0][1] == '-' && argv[0][2] == '\0') {\ + argv++;\ + argc--;\ + break;\ + }\ + for (brk_ = 0, argv[0]++, argv_ = argv;\ + argv[0][0] && !brk_;\ + argv[0]++) {\ + if (argv_ != argv)\ + break;\ + argc_ = argv[0][0];\ + switch (argc_) + +/* Handles obsolete -NUM syntax */ +#define ARGNUM case '0':\ + case '1':\ + case '2':\ + case '3':\ + case '4':\ + case '5':\ + case '6':\ + case '7':\ + case '8':\ + case '9' + +#define ARGEND }\ + } + +#define ARGC() argc_ + +#define ARGNUMF() (brk_ = 1, estrtonum(argv[0], 0, INT_MAX)) + +#define EARGF(x) ((argv[0][1] == '\0' && argv[1] == NULL)?\ + ((x), abort(), (char *)0) :\ + (brk_ = 1, (argv[0][1] != '\0')?\ + (&argv[0][1]) :\ + (argc--, argv++, argv[0]))) + +#define ARGF() ((argv[0][1] == '\0' && argv[1] == NULL)?\ + (char *)0 :\ + (brk_ = 1, (argv[0][1] != '\0')?\ + (&argv[0][1]) :\ + (argc--, argv++, argv[0]))) + +#define LNGARG() &argv[0][0] + +#endif diff --git a/dedup.c b/dedup.c @@ -0,0 +1,320 @@ +#include <sys/stat.h> +#include <err.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <openssl/sha.h> +#include "arg.h" + +#define BLKSIZ 32768 + +struct enthdr { + uint64_t flags; + uint64_t nents; +} __attribute__((packed)); + +struct ent { + uint64_t sz; + unsigned char md[SHA256_DIGEST_LENGTH]; + uint64_t nblks; + uint64_t blks[]; +} __attribute__((packed)); + +struct blk { + unsigned char md[SHA256_DIGEST_LENGTH]; + uint64_t sz; + unsigned char data[BLKSIZ]; +} __attribute__((packed)); + +struct enthdr enthdr; +int ifd; +int sfd; +int verbose; +char *argv0; + +void +dump_md(const unsigned char *md, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) + fprintf(stderr, "%02x", md[i]); +} + +void +dump_enthdr(struct enthdr *hdr) +{ + uint64_t i; + + fprintf(stderr, "hdr->flags = %llx\n", + (unsigned long long)hdr->flags); + fprintf(stderr, "hdr->nents = %llx\n", + (unsigned long long)hdr->nents); +} + +void +dump_ent(struct ent *ent) +{ + uint64_t i; + + fprintf(stderr, "ent->sz: %lld\n", (unsigned long long)ent->sz); + fprintf(stderr, "ent->md: "); + dump_md(ent->md, sizeof(ent->md)); + fputc('\n', stderr); + if (verbose) { + fprintf(stderr, "ent->nblks: %lld\n", + (unsigned long long)ent->nblks); + for (i = 0; i < ent->nblks; i++) + fprintf(stderr, "ent->blks[%lld]: %lld\n", + (unsigned long long)i, + (unsigned long long)ent->blks[i]); + } +} + +void +dump_blk(struct blk *blk) +{ + uint64_t i; + + fprintf(stderr, "blk->md: "); + dump_md(blk->md, sizeof(blk->md)); + putchar('\n'); + fprintf(stderr, "blk->sz: %lld\n", (unsigned long long)blk->sz); +} + +void +append_ent(struct ent *ent) +{ + enthdr.nents++; + lseek(ifd, 0, SEEK_SET); + write(ifd, &enthdr, sizeof(enthdr)); + + lseek(ifd, 0, SEEK_END); + ent->sz = sizeof(*ent); + ent->sz += ent->nblks * sizeof(ent->blks[0]); + write(ifd, ent, ent->sz); +} + +struct ent * +alloc_ent(void) +{ + struct ent *ent; + + ent = malloc(sizeof(*ent)); + if (ent == NULL) + err(1, "malloc"); + return ent; +} + +struct ent * +grow_ent(struct ent *ent, uint64_t nblks) +{ + size_t sz; + + sz = sizeof(*ent); + sz += nblks * sizeof(ent->blks[0]); + ent = realloc(ent, sz); + if (ent == NULL) + err(1, "realloc"); + return ent; +} + +void +hash_blk(struct blk *blk) +{ + SHA256_CTX ctx; + + SHA256_Init(&ctx); + SHA256_Update(&ctx, blk->data, blk->sz); + SHA256_Final(blk->md, &ctx); +} + +void +read_blk(struct blk *blk, off_t blkidx) +{ + lseek(sfd, blkidx * sizeof(*blk), SEEK_SET); + read(sfd, blk, sizeof(*blk)); +} + +void +append_blk(struct blk *blk) +{ + lseek(sfd, 0, SEEK_END); + write(sfd, blk, sizeof(*blk)); +} + +int +lookup_blk(struct blk *b1, uint64_t *blkidx) +{ + uint64_t nblks; + uint64_t i; + + nblks = lseek(sfd, 0, SEEK_END); + nblks /= sizeof(struct blk); + for (i = 0; i < nblks; i++) { + struct blk b2; + + read_blk(&b2, i); + if (memcmp(b1->md, b2.md, sizeof(b1->md)) == 0) { + *blkidx = i; + return 0; + } + } + return -1; +} + +void +dedup(int fd) +{ + struct blk blk; + struct ent *ent; + SHA256_CTX ctx; + ssize_t n; + + ent = alloc_ent(); + SHA256_Init(&ctx); + while ((n = read(fd, blk.data, BLKSIZ)) > 0) { + uint64_t blkidx; + + blk.sz = n; + hash_blk(&blk); + SHA256_Update(&ctx, blk.data, blk.sz); + ent = grow_ent(ent, ent->nblks + 1); + + if (lookup_blk(&blk, &blkidx) == -1) { + off_t offs; + + offs = lseek(sfd, 0, SEEK_END); + offs /= sizeof(blk); + ent->blks[ent->nblks++] = offs; + + append_blk(&blk); + } else { + ent->blks[ent->nblks++] = blkidx; + } + } + if (n < 0) + err(1, "read"); + + SHA256_Final(ent->md, &ctx); + append_ent(ent); +} + +void +str2id(unsigned char *idstr, uint8_t *id) +{ + size_t i, len = strlen(idstr) / 2; + char *p = idstr; + + for (i = 0; i < len; i++, p += 2) + sscanf(p, "%2hhx", &id[i]); +} + +void +extract(unsigned char *id, int fd) +{ + unsigned char md[SHA256_DIGEST_LENGTH]; + struct ent *ent; + uint64_t i; + + str2id(id, md); + lseek(ifd, sizeof(enthdr), SEEK_SET); + for (i = 0; i < enthdr.nents; i++) { + ent = alloc_ent(); + read(ifd, ent, sizeof(*ent)); + ent = grow_ent(ent, ent->nblks); + read(ifd, ent->blks, ent->nblks * sizeof(ent->blks[0])); + if (memcmp(ent->md, md, sizeof(ent->md)) == 0) { + uint64_t j; + + for (j = 0; j < ent->nblks; j++) { + struct blk blk; + + read_blk(&blk, ent->blks[j]); + write(1, blk.data, blk.sz); + } + break; + } + free(ent); + } +} + +void +init(void) +{ + struct stat sb; + + ifd = open("index", O_RDWR | O_CREAT, 0600); + if (ifd == -1) + err(1, "open index"); + + sfd = open("store", O_RDWR | O_CREAT, 0600); + if (sfd == -1) + err(1, "open store"); + + if (fstat(ifd, &sb) == -1) + err(1, "stat index"); + if (sb.st_size != 0) + read(ifd, &enthdr, sizeof(enthdr)); +} + +void +dump_index(void) +{ + struct ent *ent; + uint64_t i; + + dump_enthdr(&enthdr); + lseek(ifd, sizeof(enthdr), SEEK_SET); + for (i = 0; i < enthdr.nents; i++) { + ent = alloc_ent(); + read(ifd, ent, sizeof(*ent)); + ent = grow_ent(ent, ent->nblks); + read(ifd, ent->blks, ent->nblks * sizeof(ent->blks[0])); + dump_ent(ent); + free(ent); + } +} + +void +usage(void) +{ + fprintf(stderr, "usage: %s [-lv] [-e id]\n", argv0); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + unsigned char *id = NULL; + int lflag = 0; + + ARGBEGIN { + case 'e': + id = EARGF(usage()); + break; + case 'l': + lflag = 1; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + } ARGEND + + init(); + + if (lflag) { + dump_index(); + return 0; + } + + if (id) + extract(id, 0); + else + dedup(0); +}