join.c (9795B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 8 #include "text.h" 9 #include "utf.h" 10 #include "util.h" 11 12 enum { 13 INIT = 1, 14 GROW = 2, 15 }; 16 17 enum { 18 EXPAND = 0, 19 RESET = 1, 20 }; 21 22 enum { FIELD_ERROR = -2, }; 23 24 struct field { 25 char *s; 26 size_t len; 27 }; 28 29 struct jline { 30 struct line text; 31 size_t nf; 32 size_t maxf; 33 struct field *fields; 34 }; 35 36 struct spec { 37 size_t fileno; 38 size_t fldno; 39 }; 40 41 struct outlist { 42 size_t ns; 43 size_t maxs; 44 struct spec **specs; 45 }; 46 47 struct span { 48 size_t nl; 49 size_t maxl; 50 struct jline **lines; 51 }; 52 53 static char *sep = NULL; 54 static char *replace = NULL; 55 static const char defaultofs = ' '; 56 static const int jfield = 1; /* POSIX default join field */ 57 static int unpairsa = 0, unpairsb = 0; 58 static int oflag = 0; 59 static int pairs = 1; 60 static size_t seplen; 61 static struct outlist output; 62 63 static void 64 usage(void) 65 { 66 eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] " 67 "[-a | -v fileno] [-t delim] file1 file2\n", argv0); 68 } 69 70 static void 71 prfield(struct field *fp) 72 { 73 if (fwrite(fp->s, 1, fp->len, stdout) != fp->len) 74 eprintf("fwrite:"); 75 } 76 77 static void 78 prsep(void) 79 { 80 if (sep) 81 fwrite(sep, 1, seplen, stdout); 82 else 83 putchar(defaultofs); 84 } 85 86 static void 87 swaplines(struct jline *la, struct jline *lb) 88 { 89 struct jline tmp; 90 91 tmp = *la; 92 *la = *lb; 93 *lb = tmp; 94 } 95 96 static void 97 prjoin(struct jline *la, struct jline *lb, size_t jfa, size_t jfb) 98 { 99 struct spec *sp; 100 struct field *joinfield; 101 size_t i; 102 103 if (jfa >= la->nf || jfb >= lb->nf) 104 return; 105 106 joinfield = &la->fields[jfa]; 107 108 if (oflag) { 109 for (i = 0; i < output.ns; i++) { 110 sp = output.specs[i]; 111 112 if (sp->fileno == 1) { 113 if (sp->fldno < la->nf) 114 prfield(&la->fields[sp->fldno]); 115 else if (replace) 116 fputs(replace, stdout); 117 } else if (sp->fileno == 2) { 118 if (sp->fldno < lb->nf) 119 prfield(&lb->fields[sp->fldno]); 120 else if (replace) 121 fputs(replace, stdout); 122 } else if (sp->fileno == 0) { 123 prfield(joinfield); 124 } 125 126 if (i < output.ns - 1) 127 prsep(); 128 } 129 } else { 130 prfield(joinfield); 131 prsep(); 132 133 for (i = 0; i < la->nf; i++) { 134 if (i != jfa) { 135 prfield(&la->fields[i]); 136 prsep(); 137 } 138 } 139 for (i = 0; i < lb->nf; i++) { 140 if (i != jfb) { 141 prfield(&lb->fields[i]); 142 if (i < lb->nf - 1) 143 prsep(); 144 } 145 } 146 } 147 putchar('\n'); 148 } 149 150 static void 151 prline(struct jline *lp) 152 { 153 if (fwrite(lp->text.data, 1, lp->text.len, stdout) != lp->text.len) 154 eprintf("fwrite:"); 155 putchar('\n'); 156 } 157 158 static int 159 jlinecmp(struct jline *la, struct jline *lb, size_t jfa, size_t jfb) 160 { 161 int status; 162 163 /* return FIELD_ERROR if both lines are short */ 164 if (jfa >= la->nf) { 165 status = (jfb >= lb->nf) ? FIELD_ERROR : -1; 166 } else if (jfb >= lb->nf) { 167 status = 1; 168 } else { 169 status = memcmp(la->fields[jfa].s, lb->fields[jfb].s, 170 MAX(la->fields[jfa].len, lb->fields[jfb].len)); 171 LIMIT(status, -1, 1); 172 } 173 174 return status; 175 } 176 177 static void 178 addfield(struct jline *lp, char *sp, size_t len) 179 { 180 if (lp->nf >= lp->maxf) { 181 lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf), 182 sizeof(struct field)); 183 lp->maxf *= GROW; 184 } 185 lp->fields[lp->nf].s = sp; 186 lp->fields[lp->nf].len = len; 187 lp->nf++; 188 } 189 190 static void 191 prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb) 192 { 193 size_t i, j; 194 195 for (i = 0; i < (spa->nl - 1); i++) 196 for (j = 0; j < (spb->nl - 1); j++) 197 prjoin(spa->lines[i], spb->lines[j], jfa, jfb); 198 } 199 200 static struct jline * 201 makeline(char *s, size_t len) 202 { 203 struct jline *lp; 204 char *tmp; 205 size_t i, end; 206 207 if (s[len - 1] == '\n') 208 s[--len] = '\0'; 209 210 lp = ereallocarray(NULL, INIT, sizeof(struct jline)); 211 lp->text.data = s; 212 lp->text.len = len; 213 lp->fields = ereallocarray(NULL, INIT, sizeof(struct field)); 214 lp->nf = 0; 215 lp->maxf = INIT; 216 217 for (i = 0; i < lp->text.len && isblank(lp->text.data[i]); i++) 218 ; 219 while (i < lp->text.len) { 220 if (sep) { 221 if ((lp->text.len - i) < seplen || 222 !(tmp = memmem(lp->text.data + i, 223 lp->text.len - i, sep, seplen))) { 224 goto eol; 225 } 226 end = tmp - lp->text.data; 227 addfield(lp, lp->text.data + i, end - i); 228 i = end + seplen; 229 } else { 230 for (end = i; !(isblank(lp->text.data[end])); end++) { 231 if (end + 1 == lp->text.len) 232 goto eol; 233 } 234 addfield(lp, lp->text.data + i, end - i); 235 for (i = end; isblank(lp->text.data[i]); i++) 236 ; 237 } 238 } 239 eol: 240 addfield(lp, lp->text.data + i, lp->text.len - i); 241 242 return lp; 243 } 244 245 static int 246 addtospan(struct span *sp, FILE *fp, int reset) 247 { 248 char *newl = NULL; 249 ssize_t len; 250 size_t size = 0; 251 252 if ((len = getline(&newl, &size, fp)) < 0) { 253 if (ferror(fp)) 254 eprintf("getline:"); 255 else 256 return 0; 257 } 258 259 if (reset) 260 sp->nl = 0; 261 262 if (sp->nl >= sp->maxl) { 263 sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl), 264 sizeof(struct jline *)); 265 sp->maxl *= GROW; 266 } 267 268 sp->lines[sp->nl] = makeline(newl, len); 269 sp->nl++; 270 return 1; 271 } 272 273 static void 274 initspan(struct span *sp) 275 { 276 sp->nl = 0; 277 sp->maxl = INIT; 278 sp->lines = ereallocarray(NULL, INIT, sizeof(struct jline *)); 279 } 280 281 static void 282 freespan(struct span *sp) 283 { 284 size_t i; 285 286 for (i = 0; i < sp->nl; i++) { 287 free(sp->lines[i]->fields); 288 free(sp->lines[i]->text.data); 289 } 290 free(sp->lines); 291 } 292 293 static void 294 initolist(struct outlist *olp) 295 { 296 olp->ns = 0; 297 olp->maxs = 1; 298 olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *)); 299 } 300 301 static void 302 addspec(struct outlist *olp, struct spec *sp) 303 { 304 if (olp->ns >= olp->maxs) { 305 olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs), 306 sizeof(struct spec *)); 307 olp->maxs *= GROW; 308 } 309 olp->specs[olp->ns] = sp; 310 olp->ns++; 311 } 312 313 static struct spec * 314 makespec(char *s) 315 { 316 struct spec *sp; 317 int fileno; 318 size_t fldno; 319 320 if (!strcmp(s, "0")) { /* join field must be 0 and nothing else */ 321 fileno = 0; 322 fldno = 0; 323 } else if ((s[0] == '1' || s[0] == '2') && s[1] == '.') { 324 fileno = s[0] - '0'; 325 fldno = estrtonum(&s[2], 1, MIN(LLONG_MAX, SIZE_MAX)) - 1; 326 } else { 327 eprintf("%s: invalid format\n", s); 328 } 329 330 sp = ereallocarray(NULL, INIT, sizeof(struct spec)); 331 sp->fileno = fileno; 332 sp->fldno = fldno; 333 return sp; 334 } 335 336 static void 337 makeolist(struct outlist *olp, char *s) 338 { 339 char *item, *sp; 340 sp = s; 341 342 while (sp) { 343 item = sp; 344 sp = strpbrk(sp, ", \t"); 345 if (sp) 346 *sp++ = '\0'; 347 addspec(olp, makespec(item)); 348 } 349 } 350 351 static void 352 freespecs(struct outlist *olp) 353 { 354 size_t i; 355 356 for (i = 0; i < olp->ns; i++) 357 free(olp->specs[i]); 358 } 359 360 static void 361 join(FILE *fa, FILE *fb, size_t jfa, size_t jfb) 362 { 363 struct span spa, spb; 364 int cmp, eofa, eofb; 365 366 initspan(&spa); 367 initspan(&spb); 368 cmp = eofa = eofb = 0; 369 370 addtospan(&spa, fa, RESET); 371 addtospan(&spb, fb, RESET); 372 373 while (spa.nl && spb.nl) { 374 if ((cmp = jlinecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) { 375 if (unpairsa) 376 prline(spa.lines[0]); 377 if (!addtospan(&spa, fa, RESET)) { 378 if (unpairsb) { /* a is EOF'd; print the rest of b */ 379 do 380 prline(spb.lines[0]); 381 while (addtospan(&spb, fb, RESET)); 382 } 383 eofa = eofb = 1; 384 } else { 385 continue; 386 } 387 } else if (cmp > 0) { 388 if (unpairsb) 389 prline(spb.lines[0]); 390 if (!addtospan(&spb, fb, RESET)) { 391 if (unpairsa) { /* b is EOF'd; print the rest of a */ 392 do 393 prline(spa.lines[0]); 394 while (addtospan(&spa, fa, RESET)); 395 } 396 eofa = eofb = 1; 397 } else { 398 continue; 399 } 400 } else if (cmp == 0) { 401 /* read all consecutive matching lines from a */ 402 do { 403 if (!addtospan(&spa, fa, EXPAND)) { 404 eofa = 1; 405 spa.nl++; 406 break; 407 } 408 } while (jlinecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0); 409 410 /* read all consecutive matching lines from b */ 411 do { 412 if (!addtospan(&spb, fb, EXPAND)) { 413 eofb = 1; 414 spb.nl++; 415 break; 416 } 417 } while (jlinecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0); 418 419 if (pairs) 420 prspanjoin(&spa, &spb, jfa, jfb); 421 422 } else { /* FIELD_ERROR: both lines lacked join fields */ 423 if (unpairsa) 424 prline(spa.lines[0]); 425 if (unpairsb) 426 prline(spb.lines[0]); 427 eofa = addtospan(&spa, fa, RESET) ? 0 : 1; 428 eofb = addtospan(&spb, fb, RESET) ? 0 : 1; 429 if (!eofa && !eofb) 430 continue; 431 } 432 433 if (eofa) { 434 spa.nl = 0; 435 } else { 436 swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */ 437 spa.nl = 1; 438 } 439 440 if (eofb) { 441 spb.nl = 0; 442 } else { 443 swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */ 444 spb.nl = 1; 445 } 446 } 447 freespan(&spa); 448 freespan(&spb); 449 } 450 451 452 int 453 main(int argc, char *argv[]) 454 { 455 size_t jf[2] = { jfield, jfield, }; 456 FILE *fp[2]; 457 int ret = 0, n; 458 char *fno; 459 460 ARGBEGIN { 461 case '1': 462 jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); 463 break; 464 case '2': 465 jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); 466 break; 467 case 'a': 468 fno = EARGF(usage()); 469 if (strcmp(fno, "1") == 0) 470 unpairsa = 1; 471 else if (strcmp(fno, "2") == 0) 472 unpairsb = 1; 473 else 474 usage(); 475 break; 476 case 'e': 477 replace = EARGF(usage()); 478 break; 479 case 'o': 480 oflag = 1; 481 initolist(&output); 482 makeolist(&output, EARGF(usage())); 483 break; 484 case 't': 485 sep = EARGF(usage()); 486 break; 487 case 'v': 488 pairs = 0; 489 fno = EARGF(usage()); 490 if (strcmp(fno, "1") == 0) 491 unpairsa = 1; 492 else if (strcmp(fno, "2") == 0) 493 unpairsb = 1; 494 else 495 usage(); 496 break; 497 default: 498 usage(); 499 } ARGEND 500 501 if (sep) 502 seplen = unescape(sep); 503 504 if (argc != 2) 505 usage(); 506 507 for (n = 0; n < 2; n++) { 508 if (!strcmp(argv[n], "-")) { 509 argv[n] = "<stdin>"; 510 fp[n] = stdin; 511 } else if (!(fp[n] = fopen(argv[n], "r"))) { 512 eprintf("fopen %s:", argv[n]); 513 } 514 } 515 516 jf[0]--; 517 jf[1]--; 518 519 join(fp[0], fp[1], jf[0], jf[1]); 520 521 if (oflag) 522 freespecs(&output); 523 524 if (fshut(fp[0], argv[0]) | (fp[0] != fp[1] && fshut(fp[1], argv[1])) | 525 fshut(stdout, "<stdout>")) 526 ret = 2; 527 528 return ret; 529 }