dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 7f8b5b3e7b72b0d64437a87c6b412743f2ab6187
parent ba61c65bb274657b7ea643de789db2a24ea836f8
Author: sin <sin@2f30.org>
Date:   Sun, 10 Mar 2019 09:36:05 +0000

Increase dedup throughput by a factor of 2

Calculating the hash of the entire snapshot inside the loop slows the
process down by 2x.  This is because we hash the block twice.  We first
hash the raw uncompressed stream (which will become the snapshot
hash) and then we hash the compressed block which is stored in the
block descriptor.

Change the calculation so we only hash the compressed block inside
dedup_chunk().  The hash of the snapshot is the hash of its block
hashes.

Diffstat:
M dedup.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/dedup.c b/dedup.c @@ -229,27 +229,39 @@ dedup(int fd, char *msg) { struct snapshot *snap; struct chunker *chunker; - SHA256_CTX ctx; - ssize_t n; snap = alloc_snap(); chunker = alloc_chunker(fd, BLKSIZE_MIN, BLKSIZE_MAX, HASHMASK_BITS, WINSIZE); - SHA256_Init(&ctx); - while ((n = fill_chunker(chunker)) > 0) { + while (fill_chunker(chunker) > 0) { uint8_t *chunkp; size_t chunk_size; chunkp = get_chunk(chunker, &chunk_size); - SHA256_Update(&ctx, chunkp, chunk_size); snap = grow_snap(snap, snap->nr_blk_descs + 1); dedup_chunk(snap, chunkp, chunk_size); drain_chunker(chunker); } - SHA256_Final(snap->md, &ctx); if (snap->nr_blk_descs > 0) { + SHA256_CTX ctx; + uint64_t i; + + /* + * The snapshot hash is calculated over the + * hash of its block descriptors. + */ + SHA256_Init(&ctx); + for (i = 0; i < snap->nr_blk_descs; i++) { + struct blk_desc *blk_desc; + + blk_desc = &snap->blk_desc[i]; + SHA256_Update(&ctx, blk_desc->md, + sizeof(blk_desc->md)); + } + SHA256_Final(snap->md, &ctx); + if (msg != NULL) { size_t size;