dedup

data deduplication program
git clone git://git.2f30.org/dedup.git
Log | Files | Refs | README | LICENSE

commit 07c41115923df14d48ec16279ed14dcb0df598e1
parent 32c20d64995844daaaed9c9a11afc03ae68c7753
Author: z3bra <contactatz3bradotorg>
Date:   Sun, 17 Feb 2019 14:42:59 +0100

Check buffer size in chunk_blk()

It also changes the for loop so we can chunk a block if the pattern
matches right at the beginning of the block, thus making the smallest
chunk size WINSIZ instead of WINSIZ + 1.

Diffstat:
M dedup.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/dedup.c b/dedup.c
@@ -140,6 +140,10 @@ chunk_blk(uint8_t *buf, size_t size)
 	size_t i;
 	uint32_t fp;
 
+	/* buzhash should be at least WINSIZ */
+	if (size < WINSIZ)
+		return size;
+
 	/*
 	 * To achieve better deduplication, we chunk blocks based on a
 	 * recurring pattern occuring on the data stream. A fixed window
@@ -150,8 +154,9 @@ chunk_blk(uint8_t *buf, size_t size)
 	 * WINSIZ the smallest possible block size.
 	 */
 	fp = buzh_init(buf, WINSIZ);
-	for (i = 1; i < size - WINSIZ; i++) {
-		fp = buzh_update(fp, buf[i - 1], buf[i + WINSIZ - 1], WINSIZ);
+	for (i = 0; i < size - WINSIZ; i++) {
+		if (i > 0)
+			fp = buzh_update(fp, buf[i - 1], buf[WINSIZ + i - 1], WINSIZ);
 		if ((fp & HASHMSK) == 0)
 			return i + WINSIZ;
 	}
@@ -387,10 +392,7 @@ dedup(int fd, char *msg)
 		struct bdescr bdescr;
 		size_t blksiz;
 
-		if (n > WINSIZ)
-			blksiz = chunk_blk(bp, n);
-		else
-			blksiz = n;
+		blksiz = chunk_blk(bp, n);
 
 		memcpy(bdescr.md, md, sizeof(bdescr));
 		bdescr.offset = enthdr.store_size;