sed.c (41933B)
/* FIXME: summary
 * decide whether we enforce valid UTF-8, right now it's enforced in certain
 *     parts of the script, but not the input...
 * nul bytes cause explosions due to use of libc string functions. thoughts?
 * lack of newline at end of file, currently we add one. what should we do?
 * allow "\\t" for "\t" etc. in regex? in replacement text?
 * POSIX says don't flush on N when out of input, but GNU and busybox do.
 */

#include <ctype.h>
#include <errno.h>
#include <regex.h>
#include <stdlib.h>
#include <string.h>

#include "utf.h"
#include "util.h"

/* Types */

/* used as queue for writes and stack for {,:,b,t
 * grows through push(), shrinks through pop() */
typedef struct {
	void **data; /* element storage */
	size_t size; /* number of elements in use */
	size_t cap;  /* number of elements allocated */
} Vec;

/* used for arbitrary growth, str is a C string
 * cap == 0 is treated as "nothing allocated yet" by stracat()/strnacat()
 * FIXME: does it make sense to keep track of length? or just rely on libc
 *        string functions? If we want to support nul bytes everything changes
 */
typedef struct {
	char *str;
	size_t cap;
} String;

typedef struct Cmd Cmd;
/* per-function description, looked up in fns[] by the function character */
typedef struct {
	void (*fn)(Cmd *);              /* executes the command */
	char *(*getarg)(Cmd *, char *); /* parses the argument, returns pointer past it (may be NULL) */
	void (*freearg)(Cmd *);         /* releases what getarg allocated (may be NULL) */
	unsigned char naddr;            /* maximum number of addresses, checked in compile() */
} Fninfo;

typedef struct {
	union {
		size_t lineno;
		regex_t *re;
	} u;
	enum {
		IGNORE, /* empty address, ignore */
		EVERY , /* every line */
		LINE , /* line number */
		LAST , /* last line ($) */
		REGEX , /* use included regex */
		LASTRE, /* use most recently used regex */
	} type;
} Addr;

/* DISCUSS: naddr is not strictly necessary, but very helpful
 * naddr == 0 iff beg.type == EVERY  && end.type == IGNORE
 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
 */
typedef struct {
	Addr beg;
	Addr end;
	unsigned char naddr;
} Range;

/* argument of the s command, filled in by get_s_arg() */
typedef struct {
	regex_t *re; /* if NULL use last regex */
	String repl; /* replacement text */
	FILE *file; /* file of the w flag, NULL if not given */
	size_t occurrence; /* 0 for all (g flag) */
	Rune delim; /* delimiter, kept so continuation lines can resume parsing */
	unsigned int p:1; /* p flag */
} Sarg;

/* argument of the y command; both sets are built by strtorunes() and are
 * 0-terminated and of equal length */
typedef struct {
	Rune *set1;
	Rune *set2;
} Yarg;

typedef struct {
	String str; /* a,c,i text. r file path */
	void (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
} ACIRarg;

struct Cmd {
	Range range;
	Fninfo *fninfo;
	union {
		Cmd *jump; /* used for b,t when running */
		char *label; /* used for :,b,t when building */
		ptrdiff_t offset; /* used for { (pointers break during realloc) */
		FILE *file; /* used for w */

		/* FIXME: Should the following be in the union? or pointers and malloc? */
		Sarg s;
		Yarg y;
		ACIRarg acir;
	} u; /* I find your lack of anonymous unions disturbing */
	unsigned int in_match:1; /* maintained by in_range() for two address ranges */
	unsigned int negate :1; /* '!' was given after the addresses */
};

/* Files for w command (and s' w flag) */
typedef struct {
	char *path;
	FILE *file;
} Wfile;

/*
 * Function Declarations
 */

/* Dynamically allocated arrays and strings */
static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
static void *pop(Vec *v);
static void push(Vec *v, void *p);
static void stracat(String *dst, char *src);
static void strnacat(String *dst, char *src, size_t n);
static void stracpy(String *dst, char *src);

/* Cleanup and errors */
static void usage(void);

/* Parsing functions and related utilities */
static void compile(char *s, int isfile);
static int read_line(FILE *f, String *s);
static char *make_range(Range *range, char *s);
static char *make_addr(Addr *addr, char *s);
static char *find_delim(char *s, Rune delim, int do_brackets);
static char *chompr(char *s, Rune rune);
static char *chomp(char *s);
static Rune *strtorunes(char *s, size_t nrunes);
static long stol(char *s, char **endp);
static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
static size_t echarntorune(Rune *r, char *s, size_t n);
static void insert_labels(void);

/* Get and Free arg and related utilities */
static char *get_aci_arg(Cmd *c, char *s);
static void aci_append(Cmd *c, char *s);
static void free_acir_arg(Cmd *c);
static char *get_bt_arg(Cmd *c, char *s);
static char *get_r_arg(Cmd *c, char *s);
static char *get_s_arg(Cmd *c, char *s);
static void free_s_arg(Cmd *c);
static char *get_w_arg(Cmd *c, char *s);
static char *get_y_arg(Cmd *c, char *s);
static void free_y_arg(Cmd *c);
static char *get_colon_arg(Cmd *c, char *s);
static char *get_lbrace_arg(Cmd *c, char *s);
static char *get_rbrace_arg(Cmd *c, char *s);
static char *semicolon_arg(char *s);

/* Running */
static void run(void);
static int in_range(Cmd *c);
static int match_addr(Addr *a);
static int next_file(void);
static int is_eof(FILE *f);
static void do_writes(void);
static void write_file(char *path, FILE *out);
static void check_puts(char *s, FILE *f);
static void update_ranges(Cmd *beg, Cmd *end);

/* Sed functions */
static void cmd_y(Cmd *c);
static void cmd_x(Cmd *c);
static void cmd_w(Cmd *c);
static void cmd_t(Cmd *c);
static void cmd_s(Cmd *c);
static void cmd_r(Cmd *c);
static void cmd_q(Cmd *c);
static void cmd_P(Cmd *c);
static void cmd_p(Cmd *c);
static void cmd_N(Cmd *c);
static void cmd_n(Cmd *c);
static void cmd_l(Cmd *c);
static void cmd_i(Cmd *c);
static void cmd_H(Cmd *c);
static void cmd_h(Cmd *c);
static void cmd_G(Cmd *c);
static void cmd_g(Cmd *c);
static void cmd_D(Cmd *c);
static void cmd_d(Cmd *c);
static void cmd_c(Cmd *c);
static void cmd_b(Cmd *c);
static void cmd_a(Cmd *c);
static void cmd_colon(Cmd *c);
static void cmd_equal(Cmd *c);
static void cmd_lbrace(Cmd *c);
static void cmd_rbrace(Cmd *c);
static void cmd_last(Cmd *c);

/*
Actions */ 200 static void new_line(void); 201 static void app_line(void); 202 static void new_next(void); 203 static void old_next(void); 204 205 /* 206 * Globals 207 */ 208 static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */ 209 static Vec writes; /* holds cmd*. writes scheduled by a and r commands */ 210 static Vec wfiles; /* holds Wfile*. files for w and s///w commands */ 211 212 static Cmd *prog, *pc; /* Program, program counter */ 213 static size_t pcap; 214 static size_t lineno; 215 216 static regex_t *lastre; /* last used regex for empty regex search */ 217 static char **files; /* list of file names from argv */ 218 static FILE *file; /* current file we are reading */ 219 220 static String patt, hold, genbuf; 221 222 static struct { 223 unsigned int n :1; /* -n (no print) */ 224 unsigned int E :1; /* -E (extended re) */ 225 unsigned int s :1; /* s/// replacement happened */ 226 unsigned int aci_cont:1; /* a,c,i text continuation */ 227 unsigned int s_cont :1; /* s/// replacement text continuation */ 228 unsigned int halt :1; /* halt execution */ 229 } gflags; 230 231 /* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */ 232 static Fninfo fns[] = { 233 ['a'] = { cmd_a , get_aci_arg , free_acir_arg , 1 }, /* schedule write of text for later */ 234 ['b'] = { cmd_b , get_bt_arg , NULL , 2 }, /* branch to label char *label when building, Cmd *jump when running */ 235 ['c'] = { cmd_c , get_aci_arg , free_acir_arg , 2 }, /* delete pattern space. 
at 0 or 1 addr or end of 2 addr, write text */ 236 ['d'] = { cmd_d , NULL , NULL , 2 }, /* delete pattern space */ 237 ['D'] = { cmd_D , NULL , NULL , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d) */ 238 ['g'] = { cmd_g , NULL , NULL , 2 }, /* replace pattern space with hold space */ 239 ['G'] = { cmd_G , NULL , NULL , 2 }, /* append newline and hold space to pattern space */ 240 ['h'] = { cmd_h , NULL , NULL , 2 }, /* replace hold space with pattern space */ 241 ['H'] = { cmd_H , NULL , NULL , 2 }, /* append newline and pattern space to hold space */ 242 ['i'] = { cmd_i , get_aci_arg , free_acir_arg , 1 }, /* write text */ 243 ['l'] = { cmd_l , NULL , NULL , 2 }, /* write pattern space in 'visually unambiguous form' */ 244 ['n'] = { cmd_n , NULL , NULL , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit) */ 245 ['N'] = { cmd_N , NULL , NULL , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */ 246 ['p'] = { cmd_p , NULL , NULL , 2 }, /* write pattern space */ 247 ['P'] = { cmd_P , NULL , NULL , 2 }, /* write pattern space up to first newline */ 248 ['q'] = { cmd_q , NULL , NULL , 1 }, /* quit */ 249 ['r'] = { cmd_r , get_r_arg , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file) */ 250 ['s'] = { cmd_s , get_s_arg , free_s_arg , 2 }, /* find/replace/all that crazy s stuff */ 251 ['t'] = { cmd_t , get_bt_arg , NULL , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */ 252 ['w'] = { cmd_w , get_w_arg , NULL , 2 }, /* append pattern space to file */ 253 ['x'] = { cmd_x , NULL , NULL , 2 }, /* exchange pattern and hold spaces */ 254 ['y'] = { cmd_y , get_y_arg , free_y_arg , 2 }, /* replace runes in set1 with runes in set2 */ 255 [':'] = { cmd_colon , get_colon_arg , NULL , 0 }, /* defines label for later b and t commands */ 256 ['='] = { cmd_equal , NULL , NULL 
, 1 }, /* printf("%d\n", line_number); */ 257 ['{'] = { cmd_lbrace, get_lbrace_arg, NULL , 2 }, /* if we match, run commands, otherwise jump to close */ 258 ['}'] = { cmd_rbrace, get_rbrace_arg, NULL , 0 }, /* noop, hold onto open for ease of building scripts */ 259 260 [0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */ 261 }; 262 263 /* 264 * Function Definitions 265 */ 266 267 /* given memory pointed to by *ptr that currently holds *nmemb members of size 268 * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one 269 * past old end in *next. if realloc fails...explode 270 */ 271 static void 272 resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next) 273 { 274 void *n, *tmp; 275 276 if (new_nmemb) { 277 tmp = ereallocarray(*ptr, new_nmemb, size); 278 } else { /* turns out realloc(*ptr, 0) != free(*ptr) */ 279 free(*ptr); 280 tmp = NULL; 281 } 282 n = (char *)tmp + *nmemb * size; 283 *nmemb = new_nmemb; 284 *ptr = tmp; 285 if (next) 286 *next = n; 287 } 288 289 static void * 290 pop(Vec *v) 291 { 292 if (!v->size) 293 return NULL; 294 return v->data[--v->size]; 295 } 296 297 static void 298 push(Vec *v, void *p) 299 { 300 if (v->size == v->cap) 301 resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL); 302 v->data[v->size++] = p; 303 } 304 305 static void 306 stracat(String *dst, char *src) 307 { 308 int new = !dst->cap; 309 size_t len; 310 311 len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1; 312 if (dst->cap < len) 313 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 314 if (new) 315 *dst->str = '\0'; 316 strcat(dst->str, src); 317 } 318 319 static void 320 strnacat(String *dst, char *src, size_t n) 321 { 322 int new = !dst->cap; 323 size_t len; 324 325 len = strlen(src); 326 len = (new ? 
0 : strlen(dst->str)) + MIN(n, len) + 1; 327 if (dst->cap < len) 328 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 329 if (new) 330 *dst->str = '\0'; 331 strlcat(dst->str, src, len); 332 } 333 334 static void 335 stracpy(String *dst, char *src) 336 { 337 size_t len; 338 339 len = strlen(src) + 1; 340 if (dst->cap < len) 341 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); 342 strcpy(dst->str, src); 343 } 344 345 static void 346 leprintf(char *s) 347 { 348 if (errno) 349 eprintf("%zu: %s: %s\n", lineno, s, strerror(errno)); 350 else 351 eprintf("%zu: %s\n", lineno, s); 352 } 353 354 /* FIXME: write usage message */ 355 static void 356 usage(void) 357 { 358 eprintf("usage: sed [-nrE] script [file ...]\n" 359 " sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n" 360 " sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n"); 361 } 362 363 /* Differences from POSIX 364 * we allows semicolons and trailing blanks inside {} 365 * we allow spaces after ! 
(and in between !s) 366 * we allow extended regular expressions (-E) 367 */ 368 static void 369 compile(char *s, int isfile) 370 { 371 FILE *f; 372 373 if (isfile) { 374 f = fopen(s, "r"); 375 if (!f) 376 eprintf("fopen %s:", s); 377 } else { 378 if (!*s) /* empty string script */ 379 return; 380 f = fmemopen(s, strlen(s), "r"); 381 if (!f) 382 eprintf("fmemopen:"); 383 } 384 385 /* NOTE: get arg functions can't use genbuf */ 386 while (read_line(f, &genbuf) != EOF) { 387 s = genbuf.str; 388 389 /* if the first two characters of the script are "#n" default output shall be suppressed */ 390 if (++lineno == 1 && *s == '#' && s[1] == 'n') { 391 gflags.n = 1; 392 continue; 393 } 394 395 if (gflags.aci_cont) { 396 aci_append(pc - 1, s); 397 continue; 398 } 399 if (gflags.s_cont) 400 s = (pc - 1)->fninfo->getarg(pc - 1, s); 401 402 while (*s) { 403 s = chompr(s, ';'); 404 if (!*s || *s == '#') 405 break; 406 407 if ((size_t)(pc - prog) == pcap) 408 resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc); 409 410 pc->range.beg.type = pc->range.end.type = IGNORE; 411 pc->fninfo = NULL; 412 pc->in_match = 0; 413 414 s = make_range(&pc->range, s); 415 s = chomp(s); 416 pc->negate = *s == '!'; 417 s = chompr(s, '!'); 418 419 if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn) 420 leprintf("bad sed function"); 421 if (pc->range.naddr > pc->fninfo->naddr) 422 leprintf("wrong number of addresses"); 423 s++; 424 425 if (pc->fninfo->getarg) 426 s = pc->fninfo->getarg(pc, s); 427 428 pc++; 429 } 430 } 431 432 fshut(f, s); 433 } 434 435 /* FIXME: if we decide to honor lack of trailing newline, set/clear a global 436 * flag when reading a line 437 */ 438 static int 439 read_line(FILE *f, String *s) 440 { 441 ssize_t len; 442 443 if (!f) 444 return EOF; 445 446 if ((len = getline(&s->str, &s->cap, f)) < 0) { 447 if (ferror(f)) 448 eprintf("getline:"); 449 return EOF; 450 } 451 if (s->str[--len] == '\n') 452 s->str[len] = '\0'; 453 return 0; 454 } 455 456 /* 
read first range from s, return pointer to one past end of range */ 457 static char * 458 make_range(Range *range, char *s) 459 { 460 s = make_addr(&range->beg, s); 461 462 if (*s == ',') 463 s = make_addr(&range->end, s + 1); 464 else 465 range->end.type = IGNORE; 466 467 if (range->beg.type == EVERY && range->end.type == IGNORE) range->naddr = 0; 468 else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1; 469 else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2; 470 else leprintf("this is impossible..."); 471 472 return s; 473 } 474 475 /* read first addr from s, return pointer to one past end of addr */ 476 static char * 477 make_addr(Addr *addr, char *s) 478 { 479 Rune r; 480 char *p = s + strlen(s); 481 size_t rlen = echarntorune(&r, s, p - s); 482 483 if (r == '$') { 484 addr->type = LAST; 485 s += rlen; 486 } else if (isdigitrune(r)) { 487 addr->type = LINE; 488 addr->u.lineno = stol(s, &s); 489 } else if (r == '/' || r == '\\') { 490 Rune delim; 491 if (r == '\\') { 492 s += rlen; 493 rlen = echarntorune(&r, s, p - s); 494 } 495 if (r == '\\') 496 leprintf("bad delimiter '\\'"); 497 delim = r; 498 s += rlen; 499 rlen = echarntorune(&r, s, p - s); 500 if (r == delim) { 501 addr->type = LASTRE; 502 s += rlen; 503 } else { 504 addr->type = REGEX; 505 p = find_delim(s, delim, 1); 506 if (!*p) 507 leprintf("unclosed regex"); 508 p -= escapes(s, p, delim, 0); 509 *p++ = '\0'; 510 addr->u.re = emalloc(sizeof(*addr->u.re)); 511 eregcomp(addr->u.re, s, gflags.E ? 
REG_EXTENDED : 0); 512 s = p; 513 } 514 } else { 515 addr->type = EVERY; 516 } 517 518 return s; 519 } 520 521 /* return pointer to first delim in s that is not escaped 522 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside []) 523 * return pointer to trailing nul byte if no delim found 524 * 525 * any escaped character that is not special is just itself (POSIX undefined) 526 * FIXME: pull out into some util thing, will be useful for ed as well 527 */ 528 static char * 529 find_delim(char *s, Rune delim, int do_brackets) 530 { 531 enum { 532 OUTSIDE , /* not in brackets */ 533 BRACKETS_OPENING, /* last char was first [ or last two were first [^ */ 534 BRACKETS_INSIDE , /* inside [] */ 535 INSIDE_OPENING , /* inside [] and last char was [ */ 536 CLASS_INSIDE , /* inside class [::], or colating element [..] or [==], inside [] */ 537 CLASS_CLOSING , /* inside class [::], or colating element [..] or [==], and last character was the respective : . or = */ 538 } state = OUTSIDE; 539 540 Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */ 541 size_t rlen; 542 int escape = 0; 543 char *end = s + strlen(s); 544 545 for (; *s; s += rlen) { 546 rlen = echarntorune(&r, s, end - s); 547 548 if (state == BRACKETS_OPENING && r == '^' ) { continue; } 549 else if (state == BRACKETS_OPENING && r == ']' ) { state = BRACKETS_INSIDE ; continue; } 550 else if (state == BRACKETS_OPENING ) { state = BRACKETS_INSIDE ; } 551 552 if (state == CLASS_CLOSING && r == ']' ) { state = BRACKETS_INSIDE ; } 553 else if (state == CLASS_CLOSING ) { state = CLASS_INSIDE ; } 554 else if (state == CLASS_INSIDE && r == c ) { state = CLASS_CLOSING ; } 555 else if (state == INSIDE_OPENING && (r == ':' || 556 r == '.' 
|| 557 r == '=') ) { state = CLASS_INSIDE ; c = r; } 558 else if (state == INSIDE_OPENING && r == ']' ) { state = OUTSIDE ; } 559 else if (state == INSIDE_OPENING ) { state = BRACKETS_INSIDE ; } 560 else if (state == BRACKETS_INSIDE && r == '[' ) { state = INSIDE_OPENING ; } 561 else if (state == BRACKETS_INSIDE && r == ']' ) { state = OUTSIDE ; } 562 else if (state == OUTSIDE && escape ) { escape = 0 ; } 563 else if (state == OUTSIDE && r == '\\' ) { escape = 1 ; } 564 else if (state == OUTSIDE && r == delim) return s; 565 else if (state == OUTSIDE && do_brackets && r == '[' ) { state = BRACKETS_OPENING; } 566 } 567 return s; 568 } 569 570 static char * 571 chomp(char *s) 572 { 573 return chompr(s, 0); 574 } 575 576 /* eat all leading whitespace and occurrences of rune */ 577 static char * 578 chompr(char *s, Rune rune) 579 { 580 Rune r; 581 size_t rlen; 582 char *end = s + strlen(s); 583 584 while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune)) 585 s += rlen; 586 return s; 587 } 588 589 /* convert first nrunes Runes from UTF-8 string s in allocated Rune* 590 * NOTE: sequence must be valid UTF-8, check first */ 591 static Rune * 592 strtorunes(char *s, size_t nrunes) 593 { 594 Rune *rs, *rp; 595 596 rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs)); 597 598 while (nrunes--) 599 s += chartorune(rp++, s); 600 601 *rp = '\0'; 602 return rs; 603 } 604 605 static long 606 stol(char *s, char **endp) 607 { 608 long n; 609 errno = 0; 610 n = strtol(s, endp, 10); 611 612 if (errno) 613 leprintf("strtol:"); 614 if (*endp == s) 615 leprintf("strtol: invalid number"); 616 617 return n; 618 } 619 620 /* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim) 621 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal) 622 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command) 623 * if delim is 0 all escaped characters represent themselves (aci text) 624 * memmove rest of 
string (beyond end) into place 625 * return the number of converted escapes (backslashes removed) 626 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better 627 */ 628 static size_t 629 escapes(char *beg, char *end, Rune delim, int n_newline) 630 { 631 size_t num = 0; 632 char *src = beg, *dst = beg; 633 634 while (src < end) { 635 /* handle escaped backslash specially so we don't think the second 636 * backslash is escaping something */ 637 if (*src == '\\' && src[1] == '\\') { 638 *dst++ = *src++; 639 if (delim) 640 *dst++ = *src++; 641 else 642 src++; 643 } else if (*src == '\\' && !delim) { 644 src++; 645 } else if (*src == '\\' && src[1]) { 646 Rune r; 647 size_t rlen; 648 num++; 649 src++; 650 rlen = echarntorune(&r, src, end - src); 651 652 if (r == 'n' && delim == 'n') { 653 *src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */ 654 } else if (r == 'n') { 655 *src = '\n'; 656 } else if (r != delim) { 657 *dst++ = '\\'; 658 num--; 659 } 660 661 memmove(dst, src, rlen); 662 dst += rlen; 663 src += rlen; 664 } else { 665 *dst++ = *src++; 666 } 667 } 668 memmove(dst, src, strlen(src) + 1); 669 return num; 670 } 671 672 static size_t 673 echarntorune(Rune *r, char *s, size_t n) 674 { 675 size_t rlen = charntorune(r, s, n); 676 if (!rlen || *r == Runeerror) 677 leprintf("invalid UTF-8"); 678 return rlen; 679 } 680 681 static void 682 insert_labels(void) 683 { 684 size_t i; 685 Cmd *from, *to; 686 687 while (branches.size) { 688 from = prog + (ptrdiff_t)pop(&branches); 689 690 if (!from->u.label) {/* no label branch to end of script */ 691 from->u.jump = pc - 1; 692 } else { 693 for (i = 0; i < labels.size; i++) { 694 to = prog + (ptrdiff_t)labels.data[i]; 695 if (!strcmp(from->u.label, to->u.label)) { 696 from->u.jump = to; 697 break; 698 } 699 } 700 if (i == labels.size) 701 leprintf("bad label"); 702 } 703 } 704 } 705 706 /* 707 * Getargs / Freeargs 708 * Read argument from s, return pointer to one past last character of 
argument 709 */ 710 711 /* POSIX compliant 712 * i\ 713 * foobar 714 * 715 * also allow the following non POSIX compliant 716 * i # empty line 717 * ifoobar 718 * ifoobar\ 719 * baz 720 * 721 * FIXME: GNU and busybox discard leading spaces 722 * i foobar 723 * i foobar 724 * ifoobar 725 * are equivalent in GNU and busybox. We don't. Should we? 726 */ 727 static char * 728 get_aci_arg(Cmd *c, char *s) 729 { 730 c->u.acir.print = check_puts; 731 c->u.acir.str = (String){ NULL, 0 }; 732 733 gflags.aci_cont = !!*s; /* no continue flag if empty string */ 734 735 /* neither empty string nor POSIX compliant */ 736 if (*s && !(*s == '\\' && !s[1])) 737 aci_append(c, s); 738 739 return s + strlen(s); 740 } 741 742 static void 743 aci_append(Cmd *c, char *s) 744 { 745 char *end = s + strlen(s), *p = end; 746 747 gflags.aci_cont = 0; 748 while (--p >= s && *p == '\\') 749 gflags.aci_cont = !gflags.aci_cont; 750 751 if (gflags.aci_cont) 752 *--end = '\n'; 753 754 escapes(s, end, 0, 0); 755 stracat(&c->u.acir.str, s); 756 } 757 758 static void 759 free_acir_arg(Cmd *c) 760 { 761 free(c->u.acir.str.str); 762 } 763 764 /* POSIX dictates that label is rest of line, including semicolons, trailing 765 * whitespace, closing braces, etc. and can be limited to 8 bytes 766 * 767 * I allow a semicolon or closing brace to terminate a label name, it's not 768 * POSIX compliant, but it's useful and every sed version I've tried to date 769 * does the same. 770 * 771 * FIXME: POSIX dictates that leading whitespace is ignored but trailing 772 * whitespace is not. This is annoying and we should probably get rid of it. 
773 */ 774 static char * 775 get_bt_arg(Cmd *c, char *s) 776 { 777 char *p = semicolon_arg(s = chomp(s)); 778 779 if (p != s) { 780 c->u.label = estrndup(s, p - s); 781 } else { 782 c->u.label = NULL; 783 } 784 785 push(&branches, (void *)(c - prog)); 786 787 return p; 788 } 789 790 /* POSIX dictates file name is rest of line including semicolons, trailing 791 * whitespace, closing braces, etc. and file name must be preceded by a space 792 * 793 * I allow a semicolon or closing brace to terminate a file name and don't 794 * enforce leading space. 795 * 796 * FIXME: decide whether trailing whitespace should be included and fix 797 * accordingly 798 */ 799 static char * 800 get_r_arg(Cmd *c, char *s) 801 { 802 char *p = semicolon_arg(s = chomp(s)); 803 804 if (p == s) 805 leprintf("no file name"); 806 807 c->u.acir.str.str = estrndup(s, p - s); 808 c->u.acir.print = write_file; 809 810 return p; 811 } 812 813 /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX) 814 * 815 * FIXME: allow other escapes in regex and replacement? 
if so change escapes() 816 */ 817 static char * 818 get_s_arg(Cmd *c, char *s) 819 { 820 Rune delim, r; 821 Cmd buf; 822 char *p; 823 int esc, lastre; 824 825 /* s/Find/Replace/Flags */ 826 827 /* Find */ 828 if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */ 829 lastre = 0; 830 c->u.s.repl = (String){ NULL, 0 }; 831 c->u.s.occurrence = 1; 832 c->u.s.file = NULL; 833 c->u.s.p = 0; 834 835 if (!*s || *s == '\\') 836 leprintf("bad delimiter"); 837 838 p = s + strlen(s); 839 s += echarntorune(&delim, s, p - s); 840 c->u.s.delim = delim; 841 842 echarntorune(&r, s, p - s); 843 if (r == delim) /* empty regex */ 844 lastre = 1; 845 846 p = find_delim(s, delim, 1); 847 if (!*p) 848 leprintf("missing second delimiter"); 849 p -= escapes(s, p, delim, 0); 850 *p = '\0'; 851 852 if (lastre) { 853 c->u.s.re = NULL; 854 } else { 855 c->u.s.re = emalloc(sizeof(*c->u.s.re)); 856 /* FIXME: different eregcomp that calls fatal */ 857 eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0); 858 } 859 s = p + runelen(delim); 860 } 861 862 /* Replace */ 863 delim = c->u.s.delim; 864 865 p = find_delim(s, delim, 0); 866 p -= escapes(s, p, delim, 0); 867 if (!*p) { /* no third delimiter */ 868 /* FIXME: same backslash counting as aci_append() */ 869 if (p[-1] != '\\') 870 leprintf("missing third delimiter or <backslash><newline>"); 871 p[-1] = '\n'; 872 gflags.s_cont = 1; 873 } else { 874 gflags.s_cont = 0; 875 } 876 877 /* check for bad references in replacement text */ 878 *p = '\0'; 879 for (esc = 0, p = s; *p; p++) { 880 if (esc) { 881 esc = 0; 882 if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub) 883 leprintf("back reference number greater than number of groups"); 884 } else if (*p == '\\') { 885 esc = 1; 886 } 887 } 888 stracat(&c->u.s.repl, s); 889 890 if (gflags.s_cont) 891 return p; 892 893 s = p + runelen(delim); 894 895 /* Flags */ 896 p = semicolon_arg(s = chomp(s)); 897 898 /* FIXME: currently for simplicity take last of g 
or occurrence flags and 899 * ignore multiple p flags. need to fix that */ 900 for (; s < p; s++) { 901 if (isdigit(*s)) { 902 c->u.s.occurrence = stol(s, &s); 903 s--; /* for loop will advance pointer */ 904 } else { 905 switch (*s) { 906 case 'g': c->u.s.occurrence = 0; break; 907 case 'p': c->u.s.p = 1; break; 908 case 'w': 909 /* must be last flag, take everything up to newline/semicolon 910 * s == p after this */ 911 s = get_w_arg(&buf, chomp(s+1)); 912 c->u.s.file = buf.u.file; 913 break; 914 } 915 } 916 } 917 return p; 918 } 919 920 static void 921 free_s_arg(Cmd *c) 922 { 923 if (c->u.s.re) 924 regfree(c->u.s.re); 925 free(c->u.s.re); 926 free(c->u.s.repl.str); 927 } 928 929 /* see get_r_arg notes */ 930 static char * 931 get_w_arg(Cmd *c, char *s) 932 { 933 char *p = semicolon_arg(s = chomp(s)); 934 Wfile *w, **wp; 935 936 if (p == s) 937 leprintf("no file name"); 938 939 for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) { 940 if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) { 941 c->u.file = (*wp)->file; 942 return p; 943 } 944 } 945 946 w = emalloc(sizeof(*w)); 947 w->path = estrndup(s, p - s); 948 949 if (!(w->file = fopen(w->path, "w"))) 950 leprintf("fopen failed"); 951 952 c->u.file = w->file; 953 954 push(&wfiles, w); 955 return p; 956 } 957 958 static char * 959 get_y_arg(Cmd *c, char *s) 960 { 961 Rune delim; 962 char *p = s + strlen(s); 963 size_t rlen = echarntorune(&delim, s, p - s); 964 size_t nrunes1, nrunes2; 965 966 c->u.y.set1 = c->u.y.set2 = NULL; 967 968 s += rlen; 969 p = find_delim(s, delim, 0); 970 p -= escapes(s, p, delim, 1); 971 nrunes1 = utfnlen(s, p - s); 972 c->u.y.set1 = strtorunes(s, nrunes1); 973 974 s = p + rlen; 975 p = find_delim(s, delim, 0); 976 p -= escapes(s, p, delim, 1); 977 nrunes2 = utfnlen(s, p - s); 978 979 if (nrunes1 != nrunes2) 980 leprintf("different set lengths"); 981 982 c->u.y.set2 = strtorunes(s, utfnlen(s, p - s)); 983 984 return p + 
rlen; 985 } 986 987 static void 988 free_y_arg(Cmd *c) 989 { 990 free(c->u.y.set1); 991 free(c->u.y.set2); 992 } 993 994 /* see get_bt_arg notes */ 995 static char * 996 get_colon_arg(Cmd *c, char *s) 997 { 998 char *p = semicolon_arg(s = chomp(s)); 999 1000 if (p == s) 1001 leprintf("no label name"); 1002 1003 c->u.label = estrndup(s, p - s); 1004 push(&labels, (void *)(c - prog)); 1005 return p; 1006 } 1007 1008 static char * 1009 get_lbrace_arg(Cmd *c, char *s) 1010 { 1011 push(&braces, (void *)(c - prog)); 1012 return s; 1013 } 1014 1015 static char * 1016 get_rbrace_arg(Cmd *c, char *s) 1017 { 1018 Cmd *lbrace; 1019 1020 if (!braces.size) 1021 leprintf("extra }"); 1022 1023 lbrace = prog + (ptrdiff_t)pop(&braces); 1024 lbrace->u.offset = c - prog; 1025 return s; 1026 } 1027 1028 /* s points to beginning of an argument that may be semicolon terminated 1029 * return pointer to semicolon or nul byte after string 1030 * or closing brace as to not force ; before } 1031 * FIXME: decide whether or not to eat trailing whitespace for arguments that 1032 * we allow semicolon/brace termination that POSIX doesn't 1033 * b, r, t, w, : 1034 * POSIX says trailing whitespace is part of label name, file name, etc. 
1035 * we should probably eat it 1036 */ 1037 static char * 1038 semicolon_arg(char *s) 1039 { 1040 char *p = strpbrk(s, ";}"); 1041 if (!p) 1042 p = s + strlen(s); 1043 return p; 1044 } 1045 1046 static void 1047 run(void) 1048 { 1049 lineno = 0; 1050 if (braces.size) 1051 leprintf("extra {"); 1052 1053 /* genbuf has already been initialized, patt will be in new_line 1054 * (or we'll halt) */ 1055 stracpy(&hold, ""); 1056 1057 insert_labels(); 1058 next_file(); 1059 new_line(); 1060 1061 for (pc = prog; !gflags.halt; pc++) 1062 pc->fninfo->fn(pc); 1063 } 1064 1065 /* return true if we are in range for c, set c->in_match appropriately */ 1066 static int 1067 in_range(Cmd *c) 1068 { 1069 if (match_addr(&c->range.beg)) { 1070 if (c->range.naddr == 2) { 1071 if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno) 1072 c->in_match = 0; 1073 else 1074 c->in_match = 1; 1075 } 1076 return !c->negate; 1077 } 1078 if (c->in_match && match_addr(&c->range.end)) { 1079 c->in_match = 0; 1080 return !c->negate; 1081 } 1082 return c->in_match ^ c->negate; 1083 } 1084 1085 /* return true if addr matches current line */ 1086 static int 1087 match_addr(Addr *a) 1088 { 1089 switch (a->type) { 1090 default: 1091 case IGNORE: return 0; 1092 case EVERY: return 1; 1093 case LINE: return lineno == a->u.lineno; 1094 case LAST: 1095 while (is_eof(file) && !next_file()) 1096 ; 1097 return !file; 1098 case REGEX: 1099 lastre = a->u.re; 1100 return !regexec(a->u.re, patt.str, 0, NULL, 0); 1101 case LASTRE: 1102 if (!lastre) 1103 leprintf("no previous regex"); 1104 return !regexec(lastre, patt.str, 0, NULL, 0); 1105 } 1106 } 1107 1108 /* move to next input file 1109 * stdin if first call and no files 1110 * return 0 for success and 1 for no more files 1111 */ 1112 static int 1113 next_file(void) 1114 { 1115 static unsigned char first = 1; 1116 1117 if (file == stdin) 1118 clearerr(file); 1119 else if (file) 1120 fshut(file, "<file>"); 1121 file = NULL; 1122 1123 do { 1124 if (!*files) 
{ 1125 if (first) /* given no files, default to stdin */ 1126 file = stdin; 1127 /* else we've used all our files, leave file = NULL */ 1128 } else if (!strcmp(*files, "-")) { 1129 file = stdin; 1130 files++; 1131 } else if (!(file = fopen(*files++, "r"))) { 1132 /* warn this file didn't open, but move on to next */ 1133 weprintf("fopen:"); 1134 } 1135 } while (!file && *files); 1136 first = 0; 1137 1138 return !file; 1139 } 1140 1141 /* test if stream is at EOF */ 1142 static int 1143 is_eof(FILE *f) 1144 { 1145 int c; 1146 1147 if (!f || feof(f)) 1148 return 1; 1149 1150 c = fgetc(f); 1151 if (c == EOF && ferror(f)) 1152 eprintf("fgetc:"); 1153 if (c != EOF && ungetc(c, f) == EOF) 1154 eprintf("ungetc EOF\n"); 1155 1156 return c == EOF; 1157 } 1158 1159 /* perform writes that were scheduled 1160 * for aci this is check_puts(string, stdout) 1161 * for r this is write_file(path, stdout) 1162 */ 1163 static void 1164 do_writes(void) 1165 { 1166 Cmd *c; 1167 size_t i; 1168 1169 for (i = 0; i < writes.size; i++) { 1170 c = writes.data[i]; 1171 c->u.acir.print(c->u.acir.str.str, stdout); 1172 } 1173 writes.size = 0; 1174 } 1175 1176 /* used for r's u.acir.print() 1177 * FIXME: something like util's concat() would be better 1178 */ 1179 static void 1180 write_file(char *path, FILE *out) 1181 { 1182 FILE *in = fopen(path, "r"); 1183 if (!in) /* no file is treated as empty file */ 1184 return; 1185 1186 while (read_line(in, &genbuf) != EOF) 1187 check_puts(genbuf.str, out); 1188 1189 fshut(in, path); 1190 } 1191 1192 static void 1193 check_puts(char *s, FILE *f) 1194 { 1195 if (s && fputs(s, f) == EOF) 1196 eprintf("fputs:"); 1197 if (fputs("\n", f) == EOF) 1198 eprintf("fputs:"); 1199 } 1200 1201 /* iterate from beg to end updating ranges so we don't miss any commands 1202 * e.g. 
sed -n '1d;1,3p' should still print lines 2 and 3 1203 */ 1204 static void 1205 update_ranges(Cmd *beg, Cmd *end) 1206 { 1207 while (beg < end) 1208 in_range(beg++); 1209 } 1210 1211 /* 1212 * Sed functions 1213 */ 1214 static void 1215 cmd_a(Cmd *c) 1216 { 1217 if (in_range(c)) 1218 push(&writes, c); 1219 } 1220 1221 static void 1222 cmd_b(Cmd *c) 1223 { 1224 if (!in_range(c)) 1225 return; 1226 1227 /* if we jump backwards update to end, otherwise update to destination */ 1228 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1229 pc = c->u.jump; 1230 } 1231 1232 static void 1233 cmd_c(Cmd *c) 1234 { 1235 if (!in_range(c)) 1236 return; 1237 1238 /* write the text on the last line of the match */ 1239 if (!c->in_match) 1240 check_puts(c->u.acir.str.str, stdout); 1241 /* otherwise start the next cycle without printing pattern space 1242 * effectively deleting the text */ 1243 new_next(); 1244 } 1245 1246 static void 1247 cmd_d(Cmd *c) 1248 { 1249 if (!in_range(c)) 1250 return; 1251 1252 new_next(); 1253 } 1254 1255 static void 1256 cmd_D(Cmd *c) 1257 { 1258 char *p; 1259 1260 if (!in_range(c)) 1261 return; 1262 1263 if ((p = strchr(patt.str, '\n'))) { 1264 p++; 1265 memmove(patt.str, p, strlen(p) + 1); 1266 old_next(); 1267 } else { 1268 new_next(); 1269 } 1270 } 1271 1272 static void 1273 cmd_g(Cmd *c) 1274 { 1275 if (in_range(c)) 1276 stracpy(&patt, hold.str); 1277 } 1278 1279 static void 1280 cmd_G(Cmd *c) 1281 { 1282 if (!in_range(c)) 1283 return; 1284 1285 stracat(&patt, "\n"); 1286 stracat(&patt, hold.str); 1287 } 1288 1289 static void 1290 cmd_h(Cmd *c) 1291 { 1292 if (in_range(c)) 1293 stracpy(&hold, patt.str); 1294 } 1295 1296 static void 1297 cmd_H(Cmd *c) 1298 { 1299 if (!in_range(c)) 1300 return; 1301 1302 stracat(&hold, "\n"); 1303 stracat(&hold, patt.str); 1304 } 1305 1306 static void 1307 cmd_i(Cmd *c) 1308 { 1309 if (in_range(c)) 1310 check_puts(c->u.acir.str.str, stdout); 1311 } 1312 1313 /* I think it makes sense to print invalid UTF-8 
sequences in octal to satisfy 1314 * the "visually unambiguous form" sed(1p) 1315 */ 1316 static void 1317 cmd_l(Cmd *c) 1318 { 1319 Rune r; 1320 char *p, *end; 1321 size_t rlen; 1322 1323 char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */ 1324 ['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b", 1325 ['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t", 1326 ['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */ 1327 }; 1328 1329 if (!in_range(c)) 1330 return; 1331 1332 /* FIXME: line wrapping. sed(1p) says "length at which folding occurs is 1333 * unspecified, but should be appropraite for the output device" 1334 * just wrap at 80 Runes? 1335 */ 1336 for (p = patt.str, end = p + strlen(p); p < end; p += rlen) { 1337 if (isascii(*p) && escapes[(unsigned int)*p]) { 1338 fputs(escapes[(unsigned int)*p], stdout); 1339 rlen = 1; 1340 } else if (!(rlen = charntorune(&r, p, end - p))) { 1341 /* ran out of chars, print the bytes of the short sequence */ 1342 for (; p < end; p++) 1343 printf("\\%03hho", (unsigned char)*p); 1344 break; 1345 } else if (r == Runeerror) { 1346 for (; rlen; rlen--, p++) 1347 printf("\\%03hho", (unsigned char)*p); 1348 } else { 1349 while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR) 1350 ; 1351 if (ferror(stdout)) 1352 eprintf("fwrite:"); 1353 } 1354 } 1355 check_puts("$", stdout); 1356 } 1357 1358 static void 1359 cmd_n(Cmd *c) 1360 { 1361 if (!in_range(c)) 1362 return; 1363 1364 if (!gflags.n) 1365 check_puts(patt.str, stdout); 1366 do_writes(); 1367 new_line(); 1368 } 1369 1370 static void 1371 cmd_N(Cmd *c) 1372 { 1373 if (!in_range(c)) 1374 return; 1375 do_writes(); 1376 app_line(); 1377 } 1378 1379 static void 1380 cmd_p(Cmd *c) 1381 { 1382 if (in_range(c)) 1383 check_puts(patt.str, stdout); 1384 } 1385 1386 static void 1387 cmd_P(Cmd *c) 1388 { 1389 char *p; 1390 1391 if (!in_range(c)) 1392 return; 1393 1394 if ((p = strchr(patt.str, '\n'))) 1395 *p = '\0'; 1396 1397 check_puts(patt.str, stdout); 1398 1399 if (p) 1400 *p 
= '\n'; 1401 } 1402 1403 static void 1404 cmd_q(Cmd *c) 1405 { 1406 if (!in_range(c)) 1407 return; 1408 1409 if (!gflags.n) 1410 check_puts(patt.str, stdout); 1411 do_writes(); 1412 gflags.halt = 1; 1413 } 1414 1415 static void 1416 cmd_r(Cmd *c) 1417 { 1418 if (in_range(c)) 1419 push(&writes, c); 1420 } 1421 1422 static void 1423 cmd_s(Cmd *c) 1424 { 1425 String tmp; 1426 Rune r; 1427 size_t plen, rlen, len; 1428 char *p, *s, *end; 1429 unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0; 1430 regex_t *re; 1431 regmatch_t *rm, *pmatch = NULL; 1432 1433 if (!in_range(c)) 1434 return; 1435 1436 if (!c->u.s.re && !lastre) 1437 leprintf("no previous regex"); 1438 1439 re = c->u.s.re ? c->u.s.re : lastre; 1440 lastre = re; 1441 1442 plen = re->re_nsub + 1; 1443 pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t)); 1444 1445 *genbuf.str = '\0'; 1446 s = patt.str; 1447 1448 while (!qflag && !regexec(re, s, plen, pmatch, cflags)) { 1449 cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */ 1450 if (!*s) /* match against empty string first time, but not again */ 1451 qflag = 1; 1452 1453 /* don't substitute if last match was not empty but this one is. 1454 * s_a*_._g 1455 * foobar -> .f.o.o.b.r. 
1456 */ 1457 if ((last_empty || pmatch[0].rm_eo) && 1458 (++matches == c->u.s.occurrence || !c->u.s.occurrence)) { 1459 /* copy over everything before the match */ 1460 strnacat(&genbuf, s, pmatch[0].rm_so); 1461 1462 /* copy over replacement text, taking into account &, backreferences, and \ escapes */ 1463 for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) { 1464 strnacat(&genbuf, p, len); 1465 p += len; 1466 switch (*p) { 1467 default: leprintf("this shouldn't be possible"); 1468 case '\0': 1469 /* we're at the end, back up one so the ++p will put us on 1470 * the null byte to break out of the loop */ 1471 --p; 1472 break; 1473 case '&': 1474 strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so); 1475 break; 1476 case '\\': 1477 if (isdigit(*++p)) { /* backreference */ 1478 /* only need to check here if using lastre, otherwise we checked when building */ 1479 if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub) 1480 leprintf("back reference number greater than number of groups"); 1481 rm = &pmatch[*p - '0']; 1482 strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so); 1483 } else { /* character after backslash taken literally (well one byte, but it works) */ 1484 strnacat(&genbuf, p, 1); 1485 } 1486 break; 1487 } 1488 } 1489 } else { 1490 /* not replacing, copy over everything up to and including the match */ 1491 strnacat(&genbuf, s, pmatch[0].rm_eo); 1492 } 1493 1494 if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */ 1495 end = s + strlen(s); 1496 rlen = charntorune(&r, s, end - s); 1497 1498 if (!rlen) { /* ran out of bytes, copy short sequence */ 1499 stracat(&genbuf, s); 1500 s = end; 1501 } else { /* copy whether or not it's a good rune */ 1502 strnacat(&genbuf, s, rlen); 1503 s += rlen; 1504 } 1505 } 1506 last_empty = !pmatch[0].rm_eo; 1507 s += pmatch[0].rm_eo; 1508 } 1509 free(pmatch); 1510 1511 if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */ 1512 
return; 1513 1514 gflags.s = 1; 1515 1516 stracat(&genbuf, s); 1517 1518 tmp = patt; 1519 patt = genbuf; 1520 genbuf = tmp; 1521 1522 if (c->u.s.p) 1523 check_puts(patt.str, stdout); 1524 if (c->u.s.file) 1525 check_puts(patt.str, c->u.s.file); 1526 } 1527 1528 static void 1529 cmd_t(Cmd *c) 1530 { 1531 if (!in_range(c) || !gflags.s) 1532 return; 1533 1534 /* if we jump backwards update to end, otherwise update to destination */ 1535 update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); 1536 pc = c->u.jump; 1537 gflags.s = 0; 1538 } 1539 1540 static void 1541 cmd_w(Cmd *c) 1542 { 1543 if (in_range(c)) 1544 check_puts(patt.str, c->u.file); 1545 } 1546 1547 static void 1548 cmd_x(Cmd *c) 1549 { 1550 String tmp; 1551 1552 if (!in_range(c)) 1553 return; 1554 1555 tmp = patt; 1556 patt = hold; 1557 hold = tmp; 1558 } 1559 1560 static void 1561 cmd_y(Cmd *c) 1562 { 1563 String tmp; 1564 Rune r, *rp; 1565 size_t n, rlen; 1566 char *s, *end, buf[UTFmax]; 1567 1568 if (!in_range(c)) 1569 return; 1570 1571 *genbuf.str = '\0'; 1572 for (s = patt.str, end = s + strlen(s); *s; s += rlen) { 1573 if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */ 1574 stracat(&genbuf, s); 1575 break; 1576 } else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */ 1577 strnacat(&genbuf, s, rlen); 1578 } else { 1579 for (rp = c->u.y.set1; *rp; rp++) 1580 if (*rp == r) 1581 break; 1582 if (*rp) { /* found r in set1, replace with Rune from set2 */ 1583 n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1)); 1584 strnacat(&genbuf, buf, n); 1585 } else { 1586 strnacat(&genbuf, s, rlen); 1587 } 1588 } 1589 } 1590 tmp = patt; 1591 patt = genbuf; 1592 genbuf = tmp; 1593 } 1594 1595 static void 1596 cmd_colon(Cmd *c) 1597 { 1598 } 1599 1600 static void 1601 cmd_equal(Cmd *c) 1602 { 1603 if (in_range(c)) 1604 printf("%zu\n", lineno); 1605 } 1606 1607 static void 1608 cmd_lbrace(Cmd *c) 1609 { 1610 Cmd *jump; 1611 1612 if (in_range(c)) 1613 return; 1614 1615 /* 
update ranges on all commands we skip */ 1616 jump = prog + c->u.offset; 1617 update_ranges(c + 1, jump); 1618 pc = jump; 1619 } 1620 1621 static void 1622 cmd_rbrace(Cmd *c) 1623 { 1624 } 1625 1626 /* not actually a sed function, but acts like one, put in last spot of script */ 1627 static void 1628 cmd_last(Cmd *c) 1629 { 1630 if (!gflags.n) 1631 check_puts(patt.str, stdout); 1632 do_writes(); 1633 new_next(); 1634 } 1635 1636 /* 1637 * Actions 1638 */ 1639 1640 /* read new line, continue current cycle */ 1641 static void 1642 new_line(void) 1643 { 1644 while (read_line(file, &patt) == EOF) { 1645 if (next_file()) { 1646 gflags.halt = 1; 1647 return; 1648 } 1649 } 1650 gflags.s = 0; 1651 lineno++; 1652 } 1653 1654 /* append new line, continue current cycle 1655 * FIXME: used for N, POSIX specifies do not print pattern space when out of 1656 * input, but GNU does so busybox does as well. Currently we don't. 1657 * Should we? 1658 */ 1659 static void 1660 app_line(void) 1661 { 1662 while (read_line(file, &genbuf) == EOF) { 1663 if (next_file()) { 1664 gflags.halt = 1; 1665 return; 1666 } 1667 } 1668 1669 stracat(&patt, "\n"); 1670 stracat(&patt, genbuf.str); 1671 gflags.s = 0; 1672 lineno++; 1673 } 1674 1675 /* read new line, start new cycle */ 1676 static void 1677 new_next(void) 1678 { 1679 *patt.str = '\0'; 1680 update_ranges(pc + 1, prog + pcap); 1681 new_line(); 1682 pc = prog - 1; 1683 } 1684 1685 /* keep old pattern space, start new cycle */ 1686 static void 1687 old_next(void) 1688 { 1689 update_ranges(pc + 1, prog + pcap); 1690 pc = prog - 1; 1691 } 1692 1693 int 1694 main(int argc, char *argv[]) 1695 { 1696 char *arg; 1697 int ret = 0, script = 0; 1698 1699 ARGBEGIN { 1700 case 'n': 1701 gflags.n = 1; 1702 break; 1703 case 'r': 1704 case 'E': 1705 gflags.E = 1; 1706 break; 1707 case 'e': 1708 arg = EARGF(usage()); 1709 compile(arg, 0); 1710 script = 1; 1711 break; 1712 case 'f': 1713 arg = EARGF(usage()); 1714 compile(arg, 1); 1715 script = 1; 1716 break; 
1717 default : usage(); 1718 } ARGEND 1719 1720 /* no script to run */ 1721 if (!script && !argc) 1722 usage(); 1723 1724 /* no script yet, next argument is script */ 1725 if (!script) 1726 compile(*argv++, 0); 1727 1728 /* shrink/grow memory to fit and add our last instruction */ 1729 resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL); 1730 pc = prog + pcap - 1; 1731 pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 }; 1732 1733 files = argv; 1734 run(); 1735 1736 ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"); 1737 1738 return ret; 1739 }