dedup

deduplicating backup program
git clone git://git.2f30.org/dedup
Log | Files | Refs | README | LICENSE

chunker.c (6135B)


      1 #include <stdint.h>
      2 #include <stdio.h>
      3 #include <stdlib.h>
      4 #include <string.h>
      5 #include <unistd.h>
      6 
      7 #include "misc.h"
      8 
      9 #define ROTL(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
     10 
     11 struct chunker {
     12 	unsigned char *buf;
     13 	int fd;
     14 	size_t rp;
     15 	size_t wp;
     16 	size_t minsize;
     17 	size_t maxsize;
     18 	size_t mask;
     19 	size_t winsize;
     20 	uint32_t seed;
     21 };
     22 
     23 /*
     24  * Static table for use in buzhash algorithm.
     25  * 256 * 32 bits randomly generated unique integers
     26  *
     27  * To get better pseudo-random results, there is exactly the same number
     28  * of 0 and 1 spread amongst these integers. It means that there is
     29  * exactly 50% chance that a XOR operation would flip all the bits in
     30  * the hash.
     31  */
     32 static uint32_t buztbl[] = {
     33 	0xbc9fa594,0x30a8f827,0xced627a7,0xdb46a745,0xcfa4a9e8,0x77cccb59,0xddb66276,0x3adc532f,
     34 	0xfe8b67d3,0x8155b59e,0x0c893666,0x1d757009,0x17394ee4,0x85d94c07,0xcacd52da,0x076c6f79,
     35 	0xead0a798,0x6c7ccb4a,0x2639a1b8,0x3aa5ae32,0x3e6218d2,0xb290d980,0xa5149521,0x4b426119,
     36 	0xd3230fc7,0x677c1cc4,0x2b64603c,0x01fe92a8,0xbe358296,0xa7e7fac7,0xf509bf41,0x04b017ad,
     37 	0xf900344c,0x8e14e202,0xb2a6e9b4,0x3db3c311,0x960286a8,0xf6bf0468,0xed54ec94,0xf358070c,
     38 	0x6a4795dd,0x3f7b925c,0x5e13a060,0xfaecbafe,0x03c8bb55,0x8a56ba88,0x633e3b49,0xe036bbbe,
     39 	0x1ed3dbb5,0x76e8ad74,0x79d346ab,0x44b4ccc4,0x71eb22d3,0xa1aa3f24,0x50e05b81,0xa3b450d3,
     40 	0x7f5caffb,0xa1990650,0x54c44800,0xda134b65,0x72362eea,0xbd12b8e6,0xf7c99fdc,0x020d48c7,
     41 	0x9d9c3d46,0x32b75615,0xe61923cf,0xadc09d8f,0xab11376b,0xd66fe4cd,0xb3b086b6,0xb8345b9f,
     42 	0x59029667,0xae0e937c,0xcbd4d4ba,0x720bb3fb,0x5f7d2ca3,0xec24ba15,0x6b40109b,0xf0a54587,
     43 	0x3acf9420,0x466e981d,0xc66dc124,0x150ef7b4,0xc3ce718e,0x136774f5,0x46684ab4,0xb4b490f0,
     44 	0x26508a8b,0xf12febc8,0x4b99171b,0xfc373c84,0x339b5677,0x41703ff3,0x7cadbbd7,0x15ea24e2,
     45 	0x7a2f9783,0xed6a383a,0x649eb072,0x79970941,0x2abd28ad,0x4375e00c,0x9df084f7,0x6fdeec6c,
     46 	0x6619ac6d,0x7d256f4d,0x9b8e658a,0x3d7627e9,0xd5a98d45,0x15f84223,0x9b6acef5,0xf876be67,
     47 	0xe3ae7089,0x84e2b64a,0x6818a969,0x86e9ba4e,0xa24a5b57,0x61570cf1,0xa5f8fc91,0x879d8383,
     48 	0x91b13866,0x75e87961,0x16db8138,0x5a2ff6b8,0x8f664e9b,0x894e1496,0x88235c5b,0xcdb3b580,
     49 	0xa2e80109,0xb0f88a82,0xd12cd340,0x93fbc37d,0xf4d1eb82,0xce42f309,0x16ffd2c2,0xb4dfef2b,
     50 	0xb8b1a33e,0x4708a5e6,0xba66dd88,0xa9ec0da6,0x6f8ee2c9,0xad8b9993,0x1d6a25a8,0x1f3d08ce,
     51 	0x149c04e7,0x5cd1fa51,0xb84c89c7,0xeced6f8c,0xe328b30f,0x084fa836,0x6d1bb1b7,0x94c78ea5,
     52 	0x14973034,0xf1a1bcef,0x48b798d2,0xded9ca9e,0x5fd965d0,0x92544eb1,0x5e80f189,0xcbbf5e15,
     53 	0x4d8121f0,0x5dd3b92f,0xd9ea98fb,0x2dbf5644,0x0fbcb9b7,0x20a1db53,0x7c3fcc98,0x36744fbd,
     54 	0xced08954,0x8e7c5efe,0x3c5f6733,0x657477be,0x3630a02d,0x38bcbda0,0xb7702575,0x4a7f4bce,
     55 	0x0e7660fe,0x4dcb91b5,0x4fd7ffd3,0x041821c1,0xa846a181,0xc8048e9e,0xd4b05072,0x986e0509,
     56 	0xa00aaeeb,0x02e3526a,0x2fac4843,0xfa98e805,0x923ecd8d,0x395d9546,0x8674c3cd,0xae5a8a71,
     57 	0x966dfe45,0x5c9ceba5,0x0830a1cf,0xa1750981,0x8f604480,0x28ea0c9a,0x0da12413,0x98b0b3c5,
     58 	0xa21d473a,0x96ce4308,0xe9a1001b,0x8bbacb44,0x18bad3f4,0xe3121acb,0x46a9b45f,0x92cd9704,
     59 	0xc1a7c619,0x3281e361,0x462e8c79,0x9e572f93,0x7239e5f0,0x67d8e6ba,0x13747ce3,0xf01ee64a,
     60 	0xe7d0ae12,0xeea04088,0xe5b36767,0x17558eae,0x678ffbe6,0xe0bbc866,0x0c24adec,0xa9cbb869,
     61 	0x3fd44ee1,0x9ca4ca06,0x04c0ef00,0x04589a21,0x9cf9c819,0x976f6ca1,0x8a30e66a,0x004d6f7e,
     62 	0x384c8851,0x5bc97eb8,0xc6c49339,0x5aa386c7,0x74bdf8af,0x9b713750,0x4112f8c2,0x2895dae1,
     63 	0xf576d905,0x9de98bce,0xb2b26bcd,0xd46707a0,0x147fbb46,0xa52c6e50,0xe43128fc,0x374ad964,
     64 	0x8dfd4d53,0xc4d0c087,0x31dfb5ca,0xa44589b5,0x6b637e2e,0x663f6b45,0xd2d8baa0,0x1dac7e4c
     65 };
     66 
     67 /* Buzhash: https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial */
     68 static inline uint32_t
     69 hinit(unsigned char *buf, size_t size)
     70 {
     71 	uint32_t sum;
     72 	size_t i;
     73 
     74 	for (i = 1, sum = 0; i < size; i++, buf++)
     75 		sum ^= ROTL(buztbl[*buf], (size - i) % 32);
     76 	return sum ^ buztbl[*buf];
     77 }
     78 
     79 static inline uint32_t
     80 hupdate(uint32_t sum, unsigned char out, unsigned char in, size_t size)
     81 {
     82 	return ROTL(sum, 1) ^ ROTL(buztbl[out], size % 32) ^ buztbl[in];
     83 }
     84 
     85 static size_t
     86 cgetsize(struct chunker *c)
     87 {
     88 	size_t maxcsize, winsize, i;
     89 	uint32_t sum;
     90 	unsigned char *bp;
     91 
     92 	maxcsize = c->wp - c->rp;
     93 	winsize = c->winsize;
     94 	if (maxcsize < winsize)
     95 		return maxcsize;
     96 
     97 	/*
     98 	 * To achieve better deduplication, we chunk blocks based on a
     99 	 * recurring pattern on the data stream. We slide a fixed window
    100 	 * of WINSIZE bytes over the data, and a rolling hash is computed
    101 	 * for this window.
    102 	 * When the rolling hash matches a given pattern the block is chunked
    103 	 * at the end of that window.
    104 	 */
    105 	bp = &c->buf[c->rp];
    106 	sum = hinit(bp, winsize);
    107 	for (i = 0; i < maxcsize - winsize; i++) {
    108 		size_t csize = i + winsize;
    109 
    110 		if (i > 0) {
    111 			unsigned char out = bp[i - 1];
    112 			unsigned char in = bp[csize - 1];
    113 
    114 			sum = hupdate(sum, out, in, winsize);
    115 		}
    116 
    117 		if (csize < c->minsize)
    118 			continue;
    119 
    120 		if ((sum & c->mask) == 0)
    121 			return csize;
    122 	}
    123 	return maxcsize;
    124 }
    125 
    126 struct chunker *
    127 copen(int fd, size_t minsize, size_t maxsize,
    128       size_t mask, size_t winsize, uint32_t seed)
    129 {
    130 	struct chunker *c;
    131 	size_t i;
    132 
    133 	c = calloc(1, sizeof(*c));
    134 	if (c == NULL) {
    135 		seterr("calloc: out of memory");
    136 		return NULL;
    137 	}
    138 
    139 	c->buf = calloc(1, maxsize);
    140 	if (c->buf == NULL) {
    141 		free(c);
    142 		seterr("calloc: out of memory");
    143 		return NULL;
    144 	}
    145 
    146 	c->fd = fd;
    147 	c->minsize = minsize;
    148 	c->maxsize = maxsize;
    149 	c->mask = mask;
    150 	c->winsize = winsize;
    151 	c->seed = seed;
    152 
    153 	for (i = 0; i < sizeof(buztbl) / sizeof(buztbl[0]); i++)
    154 		buztbl[i] ^= c->seed;
    155 	return c;
    156 }
    157 
    158 void
    159 cclose(struct chunker *c)
    160 {
    161 	free(c->buf);
    162 	free(c);
    163 }
    164 
    165 ssize_t
    166 cfill(struct chunker *c)
    167 {
    168 	unsigned char *bp;
    169 	ssize_t n;
    170 
    171 	bp = &c->buf[c->wp];
    172 	n = xread(c->fd, bp, c->maxsize - c->wp);
    173 	c->wp += n;
    174 	return c->wp;
    175 }
    176 
    177 void *
    178 cget(struct chunker *c, size_t *csize)
    179 {
    180 	unsigned char *bp;
    181 
    182 	if (c->rp == c->wp) {
    183 		seterr("chunker underflow");
    184 		return NULL;
    185 	}
    186 
    187 	bp = &c->buf[c->rp];
    188 	*csize = cgetsize(c);
    189 	c->rp += *csize;
    190 	return bp;
    191 }
    192 
    193 size_t
    194 cdrain(struct chunker *c)
    195 {
    196 	unsigned char *src, *dst;
    197 
    198 	src = &c->buf[c->rp];
    199 	dst = c->buf;
    200 	memmove(dst, src, c->wp - c->rp);
    201 	c->wp -= c->rp;
    202 	c->rp = 0;
    203 	return c->wp;
    204 }