commit 3ce9db0f144f9d5a39a9180ed343d9498d064778
parent 4ff76106cb1e2c6f99dfd163744224fc57d5fa65
Author: sin <sin@2f30.org>
Date: Sun, 17 Feb 2019 21:28:00 +0000
Add primitive chunk compression support
Diffstat:
M | Makefile | | | 2 | +- |
M | dedup.c | | | 103 | ++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------- |
2 files changed, 66 insertions(+), 39 deletions(-)
diff --git a/Makefile b/Makefile
@@ -8,7 +8,7 @@ DISTFILES = $(SRC) LICENSE Makefile README arg.h dedup.1 tree.h
CFLAGS = -g -Wall
CPPFLAGS = -I/usr/local/include -D_FILE_OFFSET_BITS=64
-LDLIBS = -lcrypto
+LDLIBS = -lcrypto -llz4
all: $(BIN)
diff --git a/dedup.c b/dedup.c
@@ -7,6 +7,7 @@
#include <string.h>
#include <unistd.h>
+#include <lz4.h>
#include <openssl/sha.h>
#include "arg.h"
@@ -16,11 +17,11 @@
#define STOREF ".store"
#define CACHEF ".cache"
-#define BLKSIZ (1024 * 512)
-#define WINSIZ 1024
+#define BLKSIZE (1 * 1024 * 1024)
+#define WINSIZE 4095
#define HASHMSK ((1ul << 21) - 1)
-#define MSGSIZ 256
-#define MDSIZ SHA256_DIGEST_LENGTH
+#define MSGSIZE 256
+#define MDSIZE SHA256_DIGEST_LENGTH
#define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
@@ -38,7 +39,7 @@ struct enthdr {
/* block descriptor */
struct bdescr {
- uint8_t md[MDSIZ];
+ uint8_t md[MDSIZE];
uint64_t offset;
uint64_t size;
};
@@ -46,8 +47,8 @@ struct bdescr {
/* index file entry */
struct ent {
uint64_t size;
- uint8_t msg[MSGSIZ];
- uint8_t md[MDSIZ]; /* hash of file */
+ uint8_t msg[MSGSIZE];
+ uint8_t md[MDSIZE]; /* hash of file */
uint64_t nblks;
struct bdescr bdescr[];
};
@@ -140,29 +141,47 @@ chunk_blk(uint8_t *buf, size_t size)
size_t i;
uint32_t fp;
- /* buzhash should be at least WINSIZ */
- if (size < WINSIZ)
+ /* buzhash should be at least WINSIZE */
+ if (size < WINSIZE)
return size;
/*
* To achieve better deduplication, we chunk blocks based on a
* recurring pattern occuring on the data stream. A fixed window
- * of WINSIZ bytes is slid over the data, and a rolling hash is
+ * of WINSIZE bytes is slid over the data, and a rolling hash is
* computed for this window.
* When the rolling hash matches a given pattern (see HASHMSK),
* the block is chunked at the end of that window, thus making
- * WINSIZ the smallest possible block size.
+ * WINSIZE the smallest possible block size.
*/
- fp = buzh_init(buf, WINSIZ);
- for (i = 0; i < size - WINSIZ; i++) {
+ fp = buzh_init(buf, WINSIZE);
+ for (i = 0; i < size - WINSIZE; i++) {
if (i > 0)
- fp = buzh_update(fp, buf[i - 1], buf[WINSIZ + i - 1], WINSIZ);
+ fp = buzh_update(fp, buf[i - 1], buf[WINSIZE + i - 1], WINSIZE);
if ((fp & HASHMSK) == 0)
- return i + WINSIZ;
+ return i + WINSIZE;
}
return size;
}
+size_t
+comp_size(size_t size)
+{
+ return LZ4_compressBound(size);
+}
+
+size_t
+comp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize)
+{
+ return LZ4_compress_default((char *)in, (char *)out, insize, outsize);
+}
+
+size_t
+decomp(uint8_t *in, uint8_t *out, size_t insize, size_t outsize)
+{
+ return LZ4_decompress_safe((char *)in, (char *)out, insize, outsize);
+}
+
void
print_md(const uint8_t *md, size_t size)
{
@@ -375,33 +394,35 @@ lookup_blk(uint8_t *md)
void
dedup(int fd, char *msg)
{
- uint8_t *buf;
+ uint8_t *buf, *cbuf;
struct ent *ent;
SHA256_CTX ctx;
ssize_t n;
- buf = alloc_buf(BLKSIZ);
+ buf = alloc_buf(BLKSIZE);
+ cbuf = alloc_buf(comp_size(BLKSIZE));
ent = alloc_ent();
SHA256_Init(&ctx);
- while ((n = xread(fd, buf, BLKSIZ)) > 0) {
+ while ((n = xread(fd, buf, BLKSIZE)) > 0) {
uint8_t *bp = buf;
while (n > 0) {
- uint8_t md[MDSIZ];
+ uint8_t md[MDSIZE];
struct bdescr bdescr;
- size_t blksiz;
+ size_t blksize, csize;
- blksiz = chunk_blk(bp, n);
+ blksize = chunk_blk(bp, n);
+ csize = comp(bp, cbuf, blksize, comp_size(BLKSIZE));
memcpy(bdescr.md, md, sizeof(bdescr));
bdescr.offset = enthdr.store_size;
- bdescr.size = blksiz;
+ bdescr.size = csize;
- hash_blk(bp, bdescr.size, bdescr.md);
+ hash_blk(cbuf, bdescr.size, bdescr.md);
/* Calculate file hash one block at a time */
- SHA256_Update(&ctx, bp, bdescr.size);
+ SHA256_Update(&ctx, cbuf, bdescr.size);
ent = grow_ent(ent, ent->nblks + 1);
@@ -412,7 +433,7 @@ dedup(int fd, char *msg)
ent->bdescr[ent->nblks++] = bdescr;
/* Store block */
- append_blk(bp, &bdescr);
+ append_blk(cbuf, &bdescr);
/* Create a cache entry for this block */
cent = alloc_cent();
@@ -422,8 +443,8 @@ dedup(int fd, char *msg)
ent->bdescr[ent->nblks++] = bdescr;
}
- bp += blksiz;
- n -= blksiz;
+ bp += blksize;
+ n -= blksize;
}
}
@@ -445,24 +466,30 @@ dedup(int fd, char *msg)
}
free(ent);
+ free(cbuf);
free(buf);
}
int
extract(struct ent *ent, void *arg)
{
- uint8_t *buf;
+ uint8_t *buf, *cbuf;
struct extract_args *args = arg;
uint64_t i;
if (memcmp(ent->md, args->md, sizeof(ent->md)) != 0)
return WALK_CONTINUE;
- buf = alloc_buf(BLKSIZ);
+ buf = alloc_buf(BLKSIZE);
+ cbuf = alloc_buf(comp_size(BLKSIZE));
for (i = 0; i < ent->nblks; i++) {
- read_blk(buf, &ent->bdescr[i]);
- xwrite(args->fd, buf, ent->bdescr[i].size);
+ size_t blksize;
+
+ read_blk(cbuf, &ent->bdescr[i]);
+ blksize = decomp(cbuf, buf, ent->bdescr[i].size, BLKSIZE);
+ xwrite(args->fd, buf, blksize);
}
+ free(cbuf);
free(buf);
return WALK_STOP;
}
@@ -470,12 +497,12 @@ extract(struct ent *ent, void *arg)
int
check(struct ent *ent, void *arg)
{
- uint8_t md[MDSIZ];
+ uint8_t md[MDSIZE];
uint8_t *buf;
SHA256_CTX ctx;
uint64_t i;
- buf = alloc_buf(BLKSIZ);
+ buf = alloc_buf(BLKSIZE);
/*
* Calculate hash for each block and compare
* with index entry block descriptor
@@ -521,12 +548,12 @@ list(struct ent *ent, void *arg)
int
rebuild_cache(struct ent *ent, void *arg)
{
- uint8_t md[MDSIZ];
+ uint8_t md[MDSIZE];
uint8_t *buf;
SHA256_CTX ctx;
uint64_t i;
- buf = alloc_buf(BLKSIZ);
+ buf = alloc_buf(BLKSIZE);
for (i = 0; i < ent->nblks; i++) {
struct cent *cent;
@@ -575,7 +602,7 @@ init_cache(void)
uint64_t nents, i;
uint64_t min, max, avg;
- min = BLKSIZ;
+ min = BLKSIZE;
max = 0;
avg = 0;
@@ -598,7 +625,7 @@ init_cache(void)
avg /= nents;
if (verbose) {
- fprintf(stderr, "min/avg/max blksize: %llu/%llu/%llu\n",
+ fprintf(stderr, "min/avg/max block size: %llu/%llu/%llu\n",
(unsigned long long)min,
(unsigned long long)avg,
(unsigned long long)max);
@@ -661,7 +688,7 @@ usage(void)
int
main(int argc, char *argv[])
{
- uint8_t md[MDSIZ];
+ uint8_t md[MDSIZE];
char *id = NULL, *root = NULL, *msg = NULL;
int fd = -1, lflag = 0, cflag = 0;