bstorage.c (13644B)
1 /* 2 * Storage layer implementation using a single backing file. 3 * The file format is as follows: 4 * 5 * [block header] 6 * [block descriptor 0] 7 * [data] 8 * [block descriptor 1] 9 * [data] 10 * ... 11 */ 12 #include <sys/types.h> 13 #include <sys/stat.h> 14 15 #include <assert.h> 16 #include <errno.h> 17 #include <fcntl.h> 18 #include <stdint.h> 19 #include <stdio.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include <strings.h> 23 #include <unistd.h> 24 25 #include "block.h" 26 #include "compat.h" 27 #include "config.h" 28 #include "misc.h" 29 #include "queue.h" 30 #include "tree.h" 31 32 /* block header constants */ 33 #define BHDRMAGIC "DEDUPDIDUPDIDUP" 34 #define NBHDRMAGIC 16 35 36 #define VMIN 0 37 #define VMAJ 1 38 #define VMINMASK 0xff 39 #define VMAJSHIFT 8 40 #define VMAJMASK 0xff 41 42 #define BHDRSIZE (NBHDRMAGIC + 8 + 8) 43 44 /* block descriptor constants */ 45 #define BDTYPE 0x100 46 #define BDSIZE (8 + 8 + 8 + 8 + (MDSIZE)) 47 48 /* misc helpers */ 49 extern int pack(unsigned char *, char *, ...); 50 extern int unpack(unsigned char *, char *, ...); 51 52 static int bscreat(struct bctx *, char *, int); 53 static int bsopen(struct bctx *, char *, int, int); 54 static int bsput(struct bctx *, void *, size_t, unsigned char *); 55 static int bsget(struct bctx *, unsigned char *, void *, size_t *); 56 static int bsrm(struct bctx *, unsigned char *); 57 static int bsgc(struct bctx *); 58 static int bssync(struct bctx *); 59 static int bsclose(struct bctx *); 60 61 static struct bops bops = { 62 .creat = bscreat, 63 .open = bsopen, 64 .put = bsput, 65 .get = bsget, 66 .rm = bsrm, 67 .gc = bsgc, 68 .sync = bssync, 69 .close = bsclose, 70 }; 71 72 /* Block header structure */ 73 struct bhdr { 74 char magic[NBHDRMAGIC]; /* magic number for file(1) */ 75 uint64_t flags; /* version number */ 76 uint64_t nbd; /* number of block descriptors */ 77 }; 78 79 /* Block descriptor */ 80 struct bd { 81 uint16_t type; /* BDTYPE */ 82 unsigned char reserved[6]; /* should be set to 0 when writing */ 83 uint64_t offset; /* block offset */ 84 uint64_t size; /* block size */ 85 uint64_t refcnt; /* reference count of block, 0 if block is removed */ 86 unsigned char md[MDSIZE]; /* hash of block */ 87 RB_ENTRY(bd) rbe; /* bdcache link node */ 88 SLIST_ENTRY(bd) sle; /* gchead link node */ 89 }; 90 RB_HEAD(bdcache, bd); 91 92 /* Storage layer context */ 93 struct sctx { 94 struct bdcache bdcache; /* cache of block descriptors */ 95 SLIST_HEAD(gchead, bd) gchead; /* list of all blocks with a zero refcount */ 96 int fd; /* underlying storage file descriptor */ 97 int rdonly; /* when set to 1, the bssync() operation is a no-op */ 98 struct bhdr bhdr; /* block header */ 99 }; 100 101 static int 102 bd_cmp(struct bd *b1, struct bd *b2) 103 { 104 int r; 105 106 r = memcmp(b1->md, b2->md, MDSIZE); 107 if (r > 0) 108 return 1; 109 else if (r < 0) 110 return -1; 111 return 0; 112 } 113 static RB_PROTOTYPE(bdcache, bd, rbe, bd_cmp) 114 static RB_GENERATE(bdcache, bd, rbe, bd_cmp) 115 116 /* Unpack block header */ 117 static int 118 unpackbhdr(unsigned char *buf, struct bhdr *bhdr) 119 { 120 char fmt[BUFSIZ]; 121 int n; 122 123 snprintf(fmt, sizeof(fmt), "'%dqq", NBHDRMAGIC); 124 n = unpack(buf, fmt, 125 bhdr->magic, 126 &bhdr->flags, 127 &bhdr->nbd); 128 129 assert(n == BHDRSIZE); 130 return n; 131 } 132 133 /* Pack block header */ 134 static int 135 packbhdr(unsigned char *buf, struct bhdr *bhdr) 136 { 137 char fmt[BUFSIZ]; 138 int n; 139 140 snprintf(fmt, sizeof(fmt), "'%dqq", NBHDRMAGIC); 141 n = pack(buf, fmt, 142 bhdr->magic, 143 bhdr->flags, 144 bhdr->nbd); 145 146 assert(n == BHDRSIZE); 147 return n; 148 } 149 150 /* Unpack block descriptor */ 151 static int 152 unpackbd(unsigned char *buf, struct bd *bd) 153 { 154 char fmt[BUFSIZ]; 155 int n; 156 157 snprintf(fmt, sizeof(fmt), "s'6qqq'%d", MDSIZE); 158 n = unpack(buf, fmt, 159 &bd->type, 160 bd->reserved, 161 &bd->offset, 162 &bd->size, 163 &bd->refcnt, 164 bd->md); 165 166 assert(n == BDSIZE); 167 return n; 168 } 169 170 /* Write block descriptor */ 171 static int 172 packbd(unsigned char *buf, struct bd *bd) 173 { 174 char fmt[BUFSIZ]; 175 int n; 176 177 snprintf(fmt, sizeof(fmt), "s'6qqq'%d", MDSIZE); 178 n = pack(buf, fmt, 179 bd->type, 180 bd->reserved, 181 bd->offset, 182 bd->size, 183 bd->refcnt, 184 bd->md); 185 186 assert(n == BDSIZE); 187 return n; 188 } 189 190 /* Load block descriptor from file */ 191 static int 192 loadbd(struct sctx *sctx) 193 { 194 unsigned char bdbuf[BDSIZE]; 195 struct bd *bd; 196 197 bd = calloc(1, sizeof(*bd)); 198 if (bd == NULL) { 199 seterr("calloc: out of memory"); 200 return -1; 201 } 202 203 if (xread(sctx->fd, bdbuf, BDSIZE) != BDSIZE) { 204 free(bd); 205 seterr("failed to read block descriptor: %s", 206 strerror(errno)); 207 return -1; 208 } 209 unpackbd(bdbuf, bd); 210 211 if (bd->type != BDTYPE) { 212 free(bd); 213 seterr("invalid block descriptor type: %d", bd->type); 214 return -1; 215 } 216 217 /* Move to the next block descriptor */ 218 if (lseek(sctx->fd, bd->size, SEEK_CUR) < 0) { 219 free(bd); 220 seterr("lseek: %s", strerror(errno)); 221 return -1; 222 } 223 224 /* 225 * When refcount is 0 the block has been removed. 226 * In that case, the block descriptor is still present 227 * in the file as it is used to locate the next block 228 * descriptor which could be live. 229 * 230 * The garbage collection list links together all block 231 * descriptors that have a reference count of 0. 232 * This is needed to implement the gc operation. 233 */ 234 if (bd->refcnt > 0) 235 RB_INSERT(bdcache, &sctx->bdcache, bd); 236 else 237 SLIST_INSERT_HEAD(&sctx->gchead, bd, sle); 238 return 0; 239 } 240 241 /* Initialize block descriptor cache */ 242 static int 243 initbdcache(struct sctx *sctx) 244 { 245 struct bhdr *bhdr; 246 uint64_t i; 247 248 bhdr = &sctx->bhdr; 249 for (i = 0; i < bhdr->nbd; i++) { 250 struct bd *bd, *tmp; 251 252 if (loadbd(sctx) == 0) 253 continue; 254 255 /* Free block descriptor cache */ 256 RB_FOREACH_SAFE(bd, bdcache, &sctx->bdcache, tmp) { 257 RB_REMOVE(bdcache, &sctx->bdcache, bd); 258 free(bd); 259 } 260 261 /* Free garbage collector list */ 262 while (!SLIST_EMPTY(&sctx->gchead)) { 263 bd = SLIST_FIRST(&sctx->gchead); 264 SLIST_REMOVE(&sctx->gchead, bd, bd, sle); 265 free(bd); 266 } 267 return -1; 268 } 269 return 0; 270 } 271 272 /* Create storage file */ 273 static int 274 bscreat(struct bctx *bctx, char *path, int mode) 275 { 276 unsigned char bhdrbuf[BHDRSIZE]; 277 struct sctx *sctx; 278 struct bhdr *bhdr; 279 int fd; 280 281 fd = open(path, O_RDWR | O_CREAT | O_EXCL, mode); 282 if (fd < 0) { 283 seterr("open: %s", strerror(errno)); 284 return -1; 285 } 286 287 bctx->sctx = calloc(1, sizeof(struct sctx)); 288 if (bctx->sctx == NULL) { 289 close(fd); 290 seterr("calloc: out of memory"); 291 return -1; 292 } 293 294 sctx = bctx->sctx; 295 RB_INIT(&sctx->bdcache); 296 SLIST_INIT(&sctx->gchead); 297 sctx->fd = fd; 298 299 bhdr = &sctx->bhdr; 300 memcpy(bhdr->magic, BHDRMAGIC, NBHDRMAGIC); 301 bhdr->flags = (VMAJ << VMAJSHIFT) | VMIN; 302 bhdr->nbd = 0; 303 304 packbhdr(bhdrbuf, bhdr); 305 if (xwrite(fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) { 306 free(sctx); 307 close(fd); 308 seterr("failed to write block header: %s", strerror(errno)); 309 return -1; 310 } 311 return 0; 312 } 313 314 /* Open storage file */ 315 static int 316 bsopen(struct bctx *bctx, char *path, int flags, int mode) 317 { 318 unsigned char bhdrbuf[BHDRSIZE]; 319 struct sctx *sctx; 320 struct bhdr *bhdr; 321 int fd; 322 323 switch (flags) { 324 case B_READ: 325 flags = O_RDONLY; 326 break; 327 case B_RDWR: 328 flags = O_RDWR; 329 break; 330 default: 331 seterr("invalid params"); 332 return -1; 333 } 334 335 fd = open(path, flags, mode); 336 if (fd < 0) { 337 seterr("open: %s", strerror(errno)); 338 return -1; 339 } 340 341 bctx->sctx = calloc(1, sizeof(struct sctx)); 342 if (bctx->sctx == NULL) { 343 close(fd); 344 seterr("calloc: out of memory"); 345 return -1; 346 } 347 348 sctx = bctx->sctx; 349 RB_INIT(&sctx->bdcache); 350 SLIST_INIT(&sctx->gchead); 351 bhdr = &sctx->bhdr; 352 353 if (xread(fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) { 354 free(sctx); 355 close(fd); 356 seterr("failed to read block header: %s", strerror(errno)); 357 return -1; 358 } 359 unpackbhdr(bhdrbuf, bhdr); 360 361 if (memcmp(bhdr->magic, BHDRMAGIC, NBHDRMAGIC) != 0) { 362 free(sctx); 363 close(fd); 364 seterr("unknown block header magic"); 365 return -1; 366 } 367 368 /* If the major version is different, the format is incompatible */ 369 if (((bhdr->flags >> VMAJSHIFT) & VMAJMASK) != VMAJ) { 370 free(sctx); 371 close(fd); 372 seterr("block header version mismatch"); 373 return -1; 374 } 375 376 sctx->fd = fd; 377 sctx->rdonly = flags == O_RDONLY; 378 379 if (initbdcache(sctx) < 0) { 380 free(sctx); 381 close(fd); 382 return -1; 383 } 384 return 0; 385 } 386 387 /* Write a block to the storage file */ 388 static int 389 bsput(struct bctx *bctx, void *buf, size_t n, unsigned char *md) 390 { 391 unsigned char bdbuf[BDSIZE]; 392 struct sctx *sctx; 393 struct bhdr *bhdr; 394 struct bd key, *bd; 395 off_t offs; 396 397 /* 398 * If the block is already present in the cache 399 * just increment the reference count and write back 400 * the block descriptor associated with that block. 401 */ 402 sctx = bctx->sctx; 403 memcpy(key.md, md, MDSIZE); 404 bd = RB_FIND(bdcache, &sctx->bdcache, &key); 405 if (bd != NULL) { 406 off_t bdoffs; 407 408 bdoffs = bd->offset - BDSIZE; 409 if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0) { 410 seterr("lseek: %s", strerror(errno)); 411 return -1; 412 } 413 414 bd->refcnt++; 415 packbd(bdbuf, bd); 416 if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) { 417 bd->refcnt--; 418 seterr("failed to write block descriptor: %s", 419 strerror(errno)); 420 return -1; 421 } 422 423 memcpy(md, bd->md, MDSIZE); 424 return 0; 425 } 426 427 /* New blocks are appended at the end of storage file */ 428 offs = lseek(sctx->fd, 0, SEEK_END); 429 if (offs < 0) { 430 seterr("lseek: %s", strerror(errno)); 431 return -1; 432 } 433 434 bd = calloc(1, sizeof(*bd)); 435 if (bd == NULL) { 436 seterr("calloc: out of memory"); 437 return -1; 438 } 439 bd->type = BDTYPE; 440 bd->offset = offs + BDSIZE; 441 bd->size = n; 442 bd->refcnt = 1; 443 memcpy(bd->md, key.md, MDSIZE); 444 445 packbd(bdbuf, bd); 446 if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) { 447 /* Shouldn't fail but if it does rewind storage file state */ 448 ftruncate(sctx->fd, offs); 449 free(bd); 450 seterr("failed to write block descriptor: %s", 451 strerror(errno)); 452 return -1; 453 } 454 455 if (xwrite(sctx->fd, buf, n) != n) { 456 /* Shouldn't fail but if it does rewind storage file state */ 457 ftruncate(sctx->fd, offs); 458 free(bd); 459 seterr("failed to write block: %s", strerror(errno)); 460 return -1; 461 } 462 463 /* 464 * Update block entry header. 465 * The header will be written to the storage file 466 * when bsclose() or bssync() is called. 467 */ 468 bhdr = &sctx->bhdr; 469 bhdr->nbd++; 470 471 RB_INSERT(bdcache, &sctx->bdcache, bd); 472 memcpy(md, bd->md, MDSIZE); 473 return bd->size; 474 } 475 476 /* Read a block from the storage file */ 477 static int 478 bsget(struct bctx *bctx, unsigned char *md, void *buf, size_t *n) 479 { 480 struct sctx *sctx; 481 struct bd key, *bd; 482 483 sctx = bctx->sctx; 484 memcpy(key.md, md, MDSIZE); 485 bd = RB_FIND(bdcache, &sctx->bdcache, &key); 486 if (bd == NULL) { 487 seterr("block not found"); 488 return -1; 489 } 490 491 if (*n < bd->size) { 492 seterr("buffer too small"); 493 return -1; 494 } 495 496 if (lseek(sctx->fd, bd->offset, SEEK_SET) < 0) { 497 seterr("lseek: %s", strerror(errno)); 498 return -1; 499 } 500 if (xread(sctx->fd, buf, bd->size) != bd->size) { 501 seterr("failed to read block: %s", strerror(errno)); 502 return -1; 503 } 504 *n = bd->size; 505 return 0; 506 } 507 508 /* Remove a block with the given hash */ 509 static int 510 bsrm(struct bctx *bctx, unsigned char *md) 511 { 512 unsigned char bdbuf[BDSIZE]; 513 struct sctx *sctx; 514 struct bd key, *bd; 515 off_t bdoffs; 516 517 sctx = bctx->sctx; 518 memcpy(key.md, md, MDSIZE); 519 bd = RB_FIND(bdcache, &sctx->bdcache, &key); 520 if (bd == NULL) { 521 seterr("block not found"); 522 return -1; 523 } 524 525 bdoffs = bd->offset - BDSIZE; 526 if (lseek(sctx->fd, bdoffs, SEEK_SET) < 0) { 527 seterr("lseek: %s", strerror(errno)); 528 return -1; 529 } 530 531 bd->refcnt--; 532 packbd(bdbuf, bd); 533 if (xwrite(sctx->fd, bdbuf, BDSIZE) != BDSIZE) { 534 bd->refcnt++; 535 seterr("failed to write block descriptor: %s", 536 strerror(errno)); 537 return -1; 538 } 539 540 /* This block is still referenced so just return */ 541 if (bd->refcnt > 0) 542 return 0; 543 544 if (punchhole(sctx->fd, bd->offset, bd->size) < 0) { 545 /* 546 * Filesystem does not support hole punching. 547 * Restore reference count. 548 */ 549 lseek(sctx->fd, bdoffs, SEEK_SET); 550 bd->refcnt++; 551 packbd(bdbuf, bd); 552 xwrite(sctx->fd, bdbuf, BDSIZE); 553 seterr("operation not supported"); 554 return -1; 555 } 556 557 /* 558 * Remove block from block descriptor cache as this is no 559 * longer a valid block. Insert it into the garbage collector 560 * list instead. 561 */ 562 RB_REMOVE(bdcache, &sctx->bdcache, bd); 563 SLIST_INSERT_HEAD(&sctx->gchead, bd, sle); 564 return 0; 565 } 566 567 /* 568 * Re-punch all holes in the storage file. 569 * This is needed when the storage file is copied from 570 * one system to another and back. The target system 571 * may not support hole punching so the holes will be 572 * filled with literal zeroes, negating the space saving 573 * effects. 574 */ 575 static int 576 bsgc(struct bctx *bctx) 577 { 578 struct sctx *sctx; 579 struct bd *bd; 580 581 sctx = bctx->sctx; 582 SLIST_FOREACH(bd, &sctx->gchead, sle) { 583 assert(bd->refcnt == 0); 584 punchhole(sctx->fd, bd->offset, bd->size); 585 } 586 return 0; 587 } 588 589 /* Sync block header to storage file */ 590 static int 591 bssync(struct bctx *bctx) 592 { 593 unsigned char bhdrbuf[BHDRSIZE]; 594 struct sctx *sctx; 595 struct bhdr *bhdr; 596 597 sctx = bctx->sctx; 598 if (sctx->rdonly) 599 return 0; 600 601 if (lseek(sctx->fd, 0, SEEK_SET) < 0) { 602 seterr("lseek: %s", strerror(errno)); 603 return -1; 604 } 605 606 bhdr = &sctx->bhdr; 607 packbhdr(bhdrbuf, bhdr); 608 if (xwrite(sctx->fd, bhdrbuf, BHDRSIZE) != BHDRSIZE) { 609 seterr("failed to write block header: %s", strerror(errno)); 610 return -1; 611 } 612 fsync(sctx->fd); 613 return 0; 614 } 615 616 /* Close storage handle */ 617 static int 618 bsclose(struct bctx *bctx) 619 { 620 struct sctx *sctx; 621 struct bd *bd, *tmp; 622 int r; 623 624 /* Free block descriptor cache */ 625 sctx = bctx->sctx; 626 RB_FOREACH_SAFE(bd, bdcache, &sctx->bdcache, tmp) { 627 RB_REMOVE(bdcache, &sctx->bdcache, bd); 628 free(bd); 629 } 630 631 /* Free garbage collector list */ 632 while (!SLIST_EMPTY(&sctx->gchead)) { 633 bd = SLIST_FIRST(&sctx->gchead); 634 SLIST_REMOVE(&sctx->gchead, bd, bd, sle); 635 free(bd); 636 } 637 638 r = close(sctx->fd); 639 free(sctx); 640 if (r < 0) 641 seterr("close: %s", strerror(errno)); 642 return r; 643 } 644 645 struct bops * 646 bstorageops(void) 647 { 648 return &bops; 649 }