commit 72a5c2c2269c959061bfb34a05f29875653f3e92
Author: sin <sin@2f30.org>
Date: Tue, 20 Mar 2018 16:02:29 +0000
Initial commit
Diffstat:
A | LICENSE | | | 13 | +++++++++++++ |
A | arg.h | | | 65 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | dedup.c | | | 320 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 398 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,13 @@
+© 2018 Dimitris Papastamos <sin@2f30.org>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/arg.h b/arg.h
@@ -0,0 +1,65 @@
+/*
+ * Copy me if you can.
+ * by 20h
+ */
+
+#ifndef ARG_H__
+#define ARG_H__
+
+extern char *argv0;
+
+/* use main(int argc, char *argv[]) */
+#define ARGBEGIN for (argv0 = *argv, argv++, argc--;\
+ argv[0] && argv[0][0] == '-'\
+ && argv[0][1];\
+ argc--, argv++) {\
+ char argc_;\
+ char **argv_;\
+ int brk_;\
+ if (argv[0][1] == '-' && argv[0][2] == '\0') {\
+ argv++;\
+ argc--;\
+ break;\
+ }\
+ for (brk_ = 0, argv[0]++, argv_ = argv;\
+ argv[0][0] && !brk_;\
+ argv[0]++) {\
+ if (argv_ != argv)\
+ break;\
+ argc_ = argv[0][0];\
+ switch (argc_)
+
+/* Handles obsolete -NUM syntax */
+#define ARGNUM case '0':\
+ case '1':\
+ case '2':\
+ case '3':\
+ case '4':\
+ case '5':\
+ case '6':\
+ case '7':\
+ case '8':\
+ case '9'
+
+#define ARGEND }\
+ }
+
+#define ARGC() argc_
+
+#define ARGNUMF() (brk_ = 1, estrtonum(argv[0], 0, INT_MAX))
+
+#define EARGF(x) ((argv[0][1] == '\0' && argv[1] == NULL)?\
+ ((x), abort(), (char *)0) :\
+ (brk_ = 1, (argv[0][1] != '\0')?\
+ (&argv[0][1]) :\
+ (argc--, argv++, argv[0])))
+
+#define ARGF() ((argv[0][1] == '\0' && argv[1] == NULL)?\
+ (char *)0 :\
+ (brk_ = 1, (argv[0][1] != '\0')?\
+ (&argv[0][1]) :\
+ (argc--, argv++, argv[0])))
+
+#define LNGARG() &argv[0][0]
+
+#endif
diff --git a/dedup.c b/dedup.c
@@ -0,0 +1,320 @@
+#include <sys/stat.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <openssl/sha.h>
+#include "arg.h"
+
+#define BLKSIZ 32768
+
+struct enthdr {
+ uint64_t flags;
+ uint64_t nents;
+} __attribute__((packed));
+
+struct ent {
+ uint64_t sz;
+ unsigned char md[SHA256_DIGEST_LENGTH];
+ uint64_t nblks;
+ uint64_t blks[];
+} __attribute__((packed));
+
+struct blk {
+ unsigned char md[SHA256_DIGEST_LENGTH];
+ uint64_t sz;
+ unsigned char data[BLKSIZ];
+} __attribute__((packed));
+
+struct enthdr enthdr;
+int ifd;
+int sfd;
+int verbose;
+char *argv0;
+
+void
+dump_md(const unsigned char *md, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ fprintf(stderr, "%02x", md[i]);
+}
+
+void
+dump_enthdr(struct enthdr *hdr)
+{
+ uint64_t i;
+
+ fprintf(stderr, "hdr->flags = %llx\n",
+ (unsigned long long)hdr->flags);
+ fprintf(stderr, "hdr->nents = %llx\n",
+ (unsigned long long)hdr->nents);
+}
+
+void
+dump_ent(struct ent *ent)
+{
+ uint64_t i;
+
+ fprintf(stderr, "ent->sz: %lld\n", (unsigned long long)ent->sz);
+ fprintf(stderr, "ent->md: ");
+ dump_md(ent->md, sizeof(ent->md));
+ fputc('\n', stderr);
+ if (verbose) {
+ fprintf(stderr, "ent->nblks: %lld\n",
+ (unsigned long long)ent->nblks);
+ for (i = 0; i < ent->nblks; i++)
+ fprintf(stderr, "ent->blks[%lld]: %lld\n",
+ (unsigned long long)i,
+ (unsigned long long)ent->blks[i]);
+ }
+}
+
+void
+dump_blk(struct blk *blk)
+{
+ uint64_t i;
+
+ fprintf(stderr, "blk->md: ");
+ dump_md(blk->md, sizeof(blk->md));
+ putchar('\n');
+ fprintf(stderr, "blk->sz: %lld\n", (unsigned long long)blk->sz);
+}
+
+void
+append_ent(struct ent *ent)
+{
+ enthdr.nents++;
+ lseek(ifd, 0, SEEK_SET);
+ write(ifd, &enthdr, sizeof(enthdr));
+
+ lseek(ifd, 0, SEEK_END);
+ ent->sz = sizeof(*ent);
+ ent->sz += ent->nblks * sizeof(ent->blks[0]);
+ write(ifd, ent, ent->sz);
+}
+
+struct ent *
+alloc_ent(void)
+{
+ struct ent *ent;
+
+ ent = malloc(sizeof(*ent));
+ if (ent == NULL)
+ err(1, "malloc");
+ return ent;
+}
+
+struct ent *
+grow_ent(struct ent *ent, uint64_t nblks)
+{
+ size_t sz;
+
+ sz = sizeof(*ent);
+ sz += nblks * sizeof(ent->blks[0]);
+ ent = realloc(ent, sz);
+ if (ent == NULL)
+ err(1, "realloc");
+ return ent;
+}
+
+void
+hash_blk(struct blk *blk)
+{
+ SHA256_CTX ctx;
+
+ SHA256_Init(&ctx);
+ SHA256_Update(&ctx, blk->data, blk->sz);
+ SHA256_Final(blk->md, &ctx);
+}
+
+void
+read_blk(struct blk *blk, off_t blkidx)
+{
+ lseek(sfd, blkidx * sizeof(*blk), SEEK_SET);
+ read(sfd, blk, sizeof(*blk));
+}
+
+void
+append_blk(struct blk *blk)
+{
+ lseek(sfd, 0, SEEK_END);
+ write(sfd, blk, sizeof(*blk));
+}
+
+int
+lookup_blk(struct blk *b1, uint64_t *blkidx)
+{
+ uint64_t nblks;
+ uint64_t i;
+
+ nblks = lseek(sfd, 0, SEEK_END);
+ nblks /= sizeof(struct blk);
+ for (i = 0; i < nblks; i++) {
+ struct blk b2;
+
+ read_blk(&b2, i);
+ if (memcmp(b1->md, b2.md, sizeof(b1->md)) == 0) {
+ *blkidx = i;
+ return 0;
+ }
+ }
+ return -1;
+}
+
+void
+dedup(int fd)
+{
+ struct blk blk;
+ struct ent *ent;
+ SHA256_CTX ctx;
+ ssize_t n;
+
+ ent = alloc_ent();
+ SHA256_Init(&ctx);
+ while ((n = read(fd, blk.data, BLKSIZ)) > 0) {
+ uint64_t blkidx;
+
+ blk.sz = n;
+ hash_blk(&blk);
+ SHA256_Update(&ctx, blk.data, blk.sz);
+ ent = grow_ent(ent, ent->nblks + 1);
+
+ if (lookup_blk(&blk, &blkidx) == -1) {
+ off_t offs;
+
+ offs = lseek(sfd, 0, SEEK_END);
+ offs /= sizeof(blk);
+ ent->blks[ent->nblks++] = offs;
+
+ append_blk(&blk);
+ } else {
+ ent->blks[ent->nblks++] = blkidx;
+ }
+ }
+ if (n < 0)
+ err(1, "read");
+
+ SHA256_Final(ent->md, &ctx);
+ append_ent(ent);
+}
+
+void
+str2id(unsigned char *idstr, uint8_t *id)
+{
+ size_t i, len = strlen(idstr) / 2;
+ char *p = idstr;
+
+ for (i = 0; i < len; i++, p += 2)
+ sscanf(p, "%2hhx", &id[i]);
+}
+
+void
+extract(unsigned char *id, int fd)
+{
+ unsigned char md[SHA256_DIGEST_LENGTH];
+ struct ent *ent;
+ uint64_t i;
+
+ str2id(id, md);
+ lseek(ifd, sizeof(enthdr), SEEK_SET);
+ for (i = 0; i < enthdr.nents; i++) {
+ ent = alloc_ent();
+ read(ifd, ent, sizeof(*ent));
+ ent = grow_ent(ent, ent->nblks);
+ read(ifd, ent->blks, ent->nblks * sizeof(ent->blks[0]));
+ if (memcmp(ent->md, md, sizeof(ent->md)) == 0) {
+ uint64_t j;
+
+ for (j = 0; j < ent->nblks; j++) {
+ struct blk blk;
+
+ read_blk(&blk, ent->blks[j]);
+ write(1, blk.data, blk.sz);
+ }
+ break;
+ }
+ free(ent);
+ }
+}
+
+void
+init(void)
+{
+ struct stat sb;
+
+ ifd = open("index", O_RDWR | O_CREAT, 0600);
+ if (ifd == -1)
+ err(1, "open index");
+
+ sfd = open("store", O_RDWR | O_CREAT, 0600);
+ if (sfd == -1)
+ err(1, "open store");
+
+ if (fstat(ifd, &sb) == -1)
+ err(1, "stat index");
+ if (sb.st_size != 0)
+ read(ifd, &enthdr, sizeof(enthdr));
+}
+
+void
+dump_index(void)
+{
+ struct ent *ent;
+ uint64_t i;
+
+ dump_enthdr(&enthdr);
+ lseek(ifd, sizeof(enthdr), SEEK_SET);
+ for (i = 0; i < enthdr.nents; i++) {
+ ent = alloc_ent();
+ read(ifd, ent, sizeof(*ent));
+ ent = grow_ent(ent, ent->nblks);
+ read(ifd, ent->blks, ent->nblks * sizeof(ent->blks[0]));
+ dump_ent(ent);
+ free(ent);
+ }
+}
+
+void
+usage(void)
+{
+ fprintf(stderr, "usage: %s [-lv] [-e id]\n", argv0);
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ unsigned char *id = NULL;
+ int lflag = 0;
+
+ ARGBEGIN {
+ case 'e':
+ id = EARGF(usage());
+ break;
+ case 'l':
+ lflag = 1;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage();
+ } ARGEND
+
+ init();
+
+ if (lflag) {
+ dump_index();
+ return 0;
+ }
+
+ if (id)
+ extract(id, 0);
+ else
+ dedup(0);
+}