sbase

suckless unix tools
git clone git://git.2f30.org/sbase.git
Log | Files | Refs | README | LICENSE

sed.c (41884B)


      1 /* FIXME: summary
      2  * decide whether we enforce valid UTF-8, right now it's enforced in certain
      3  *     parts of the script, but not the input...
      4  * nul bytes cause explosions due to use of libc string functions. thoughts?
      5  * lack of newline at end of file, currently we add one. what should we do?
      6  * allow "\\t" for "\t" etc. in regex? in replacement text?
      7  * POSIX says don't flush on N when out of input, but GNU and busybox do.
      8  */
      9 
     10 #include <ctype.h>
     11 #include <errno.h>
     12 #include <regex.h>
     13 #include <stdlib.h>
     14 #include <string.h>
     15 
     16 #include "utf.h"
     17 #include "util.h"
     18 
     19 /* Types */
     20 
/* growable array of untyped pointers
 * used as queue for writes and stack for {,:,b,t
 */
typedef struct {
	void **data; /* element storage, grown by push() when size reaches cap */
	size_t size; /* number of elements currently stored */
	size_t cap;  /* number of elements data has room for */
} Vec;
     27 
/* used for arbitrary growth, str is a C string
 * cap is the number of bytes allocated for str; the append helpers treat
 * cap == 0 as "nothing stored yet" and start from an empty string
 * FIXME: does it make sense to keep track of length? or just rely on libc
 *        string functions? If we want to support nul bytes everything changes
 */
typedef struct {
	char  *str; /* nul terminated contents */
	size_t cap; /* allocated size of str in bytes */
} String;
     36 
typedef struct Cmd Cmd;
/* dispatch entry for one sed function, looked up by command character in fns[] */
typedef struct {
	void  (*fn)(Cmd *);             /* the command itself; NULL marks an invalid command character */
	char *(*getarg)(Cmd *, char *); /* parses the argument while building, returns pointer past it; NULL if no argument */
	void  (*freearg)(Cmd *);        /* counterpart of getarg for releasing the argument; NULL if nothing to release */
	unsigned char naddr;            /* largest number of addresses the command accepts (checked in compile()) */
} Fninfo;
     44 
/* a single address; which member of u is valid depends on type
 * (lineno for LINE, re for REGEX, neither for the other types)
 */
typedef struct {
	union {
		size_t   lineno;
		regex_t *re;
	} u;
	enum {
		IGNORE, /* empty address, ignore        */
		EVERY , /* every line                   */
		LINE  , /* line number                  */
		LAST  , /* last line ($)                */
		REGEX , /* use included regex           */
		LASTRE, /* use most recently used regex */
	} type;
} Addr;
     59 
/* the zero, one or two addresses in front of a command
 *
 * DISCUSS: naddr is not strictly necessary, but very helpful
 * naddr == 0 iff beg.type == EVERY  && end.type == IGNORE
 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
 */
typedef struct {
	Addr          beg;   /* first address (EVERY when none was given) */
	Addr          end;   /* second address (IGNORE when there is no ",addr") */
	unsigned char naddr; /* number of addresses, computed by make_range() */
} Range;
     70 
/* argument of the s command: s/re/repl/flags */
typedef struct {
	regex_t      *re; /* if NULL use last regex */
	String        repl; /* replacement text, may be built up over several script lines */
	FILE         *file; /* target of the w flag, NULL if the flag was not given */
	size_t        occurrence; /* 0 for all (g flag) */
	Rune          delim; /* delimiter, remembered so a continued replacement can find its end */
	unsigned int  p:1; /* p flag was given */
} Sarg;
     79 
/* argument of the y command: runes in set1 are replaced with runes in set2 */
typedef struct {
	Rune *set1;
	Rune *set2;
} Yarg;
     84 
/* argument shared by the a, c, i and r commands */
typedef struct {
	String str; /* a,c,i text. r file path */
	void  (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
} ACIRarg;
     89 
/* one compiled command of the script; prog is an array of these */
struct Cmd {
	Range   range;  /* addresses selecting the lines the command applies to */
	Fninfo *fninfo; /* entry of fns[] for the command character */
	union {
		Cmd      *jump;   /* used for   b,t when running  */
		char     *label;  /* used for :,b,t when building */
		ptrdiff_t offset; /* used for { (pointers break during realloc) */
		FILE     *file;   /* used for w */

		/* FIXME: Should the following be in the union? or pointers and malloc? */
		Sarg      s;
		Yarg      y;
		ACIRarg   acir;
	} u; /* I find your lack of anonymous unions disturbing */
	unsigned int in_match:1; /* cleared by compile(); presumably set while inside a two address range - confirm in in_range() */
	unsigned int negate  :1; /* a '!' followed the addresses */
};
    107 
/* Files for w command (and s' w flag)
 * pairs a path with its stream; entries are kept in the wfiles Vec
 */
typedef struct {
	char *path;
	FILE *file;
} Wfile;
    113 
/*
 * Function Declarations
 *
 * NOTE: leprintf() has no prototype here; it is defined before its first use.
 */

/* Dynamically allocated arrays and strings */
static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
static void *pop(Vec *v);
static void push(Vec *v, void *p);
static void stracat(String *dst, char *src);
static void strnacat(String *dst, char *src, size_t n);
static void stracpy(String *dst, char *src);

/* Cleanup and errors */
static void usage(void);

/* Parsing functions and related utilities */
static void compile(char *s, int isfile);
static int read_line(FILE *f, String *s);
static char *make_range(Range *range, char *s);
static char *make_addr(Addr *addr, char *s);
static char *find_delim(char *s, Rune delim, int do_brackets);
static char *chompr(char *s, Rune rune);
static char *chomp(char *s);
static Rune *strtorunes(char *s, size_t nrunes);
static long stol(char *s, char **endp);
static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
static size_t echarntorune(Rune *r, char *s, size_t n);
static void insert_labels(void);

/* Get and Free arg and related utilities */
static char *get_aci_arg(Cmd *c, char *s);
static void aci_append(Cmd *c, char *s);
static void free_acir_arg(Cmd *c);
static char *get_bt_arg(Cmd *c, char *s);
static char *get_r_arg(Cmd *c, char *s);
static char *get_s_arg(Cmd *c, char *s);
static void free_s_arg(Cmd *c);
static char *get_w_arg(Cmd *c, char *s);
static char *get_y_arg(Cmd *c, char *s);
static void free_y_arg(Cmd *c);
static char *get_colon_arg(Cmd *c, char *s);
static char *get_lbrace_arg(Cmd *c, char *s);
static char *get_rbrace_arg(Cmd *c, char *s);
static char *semicolon_arg(char *s);

/* Running */
static void run(void);
static int in_range(Cmd *c);
static int match_addr(Addr *a);
static int next_file(void);
static int is_eof(FILE *f);
static void do_writes(void);
static void write_file(char *path, FILE *out);
static void check_puts(char *s, FILE *f);
static void update_ranges(Cmd *beg, Cmd *end);

/* Sed functions (one per command character, see fns[]) */
static void cmd_y(Cmd *c);
static void cmd_x(Cmd *c);
static void cmd_w(Cmd *c);
static void cmd_t(Cmd *c);
static void cmd_s(Cmd *c);
static void cmd_r(Cmd *c);
static void cmd_q(Cmd *c);
static void cmd_P(Cmd *c);
static void cmd_p(Cmd *c);
static void cmd_N(Cmd *c);
static void cmd_n(Cmd *c);
static void cmd_l(Cmd *c);
static void cmd_i(Cmd *c);
static void cmd_H(Cmd *c);
static void cmd_h(Cmd *c);
static void cmd_G(Cmd *c);
static void cmd_g(Cmd *c);
static void cmd_D(Cmd *c);
static void cmd_d(Cmd *c);
static void cmd_c(Cmd *c);
static void cmd_b(Cmd *c);
static void cmd_a(Cmd *c);
static void cmd_colon(Cmd *c);
static void cmd_equal(Cmd *c);
static void cmd_lbrace(Cmd *c);
static void cmd_rbrace(Cmd *c);
static void cmd_last(Cmd *c);

/* Actions */
static void new_line(void);
static void app_line(void);
static void new_next(void);
static void old_next(void);
    204 
/*
 * Globals
 */
/* the entries of these three are offsets from prog stored in the pointer
 * slots (see get_bt_arg() and insert_labels()), not real pointers, because
 * prog may move when it is resized */
static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */
static Vec writes; /* holds cmd*. writes scheduled by a and r commands */
static Vec wfiles; /* holds Wfile*. files for w and s///w commands */

static Cmd   *prog, *pc; /* Program, program counter */
static size_t pcap;      /* number of Cmd slots allocated for prog */
static size_t lineno;    /* incremented per script line in compile(); prefixes leprintf() messages */

static regex_t *lastre; /* last used regex for empty regex search */
static char   **files;  /* list of file names from argv */
static FILE    *file;   /* current file we are reading */

/* patt and hold: presumably the pattern and hold spaces - confirm in run()
 * genbuf: scratch buffer; compile() reads script lines into it */
static String patt, hold, genbuf;

static struct {
	unsigned int n       :1; /* -n (no print) */
	unsigned int E       :1; /* -E (extended re) */
	unsigned int s       :1; /* s/// replacement happened */
	unsigned int aci_cont:1; /* a,c,i text continuation */
	unsigned int s_cont  :1; /* s/// replacement text continuation */
	unsigned int halt    :1; /* halt execution */
} gflags;
    230 
/* table of sed functions, indexed directly by the command character
 * columns: function, argument parser, argument destructor, max addresses
 * slots that are not named below are zero initialized, so a NULL fn marks a
 * character that is not a sed function (compile() relies on this)
 *
 * FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */
static Fninfo fns[] = {
	['a'] = { cmd_a     , get_aci_arg   , free_acir_arg , 1 }, /* schedule write of text for later                                                      */
	['b'] = { cmd_b     , get_bt_arg    , NULL          , 2 }, /* branch to label char *label when building, Cmd *jump when running                     */
	['c'] = { cmd_c     , get_aci_arg   , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text                     */
	['d'] = { cmd_d     , NULL          , NULL          , 2 }, /* delete pattern space                                                                  */
	['D'] = { cmd_D     , NULL          , NULL          , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d)        */
	['g'] = { cmd_g     , NULL          , NULL          , 2 }, /* replace pattern space with hold space                                                 */
	['G'] = { cmd_G     , NULL          , NULL          , 2 }, /* append newline and hold space to pattern space                                        */
	['h'] = { cmd_h     , NULL          , NULL          , 2 }, /* replace hold space with pattern space                                                 */
	['H'] = { cmd_H     , NULL          , NULL          , 2 }, /* append newline and pattern space to hold space                                        */
	['i'] = { cmd_i     , get_aci_arg   , free_acir_arg , 1 }, /* write text                                                                            */
	['l'] = { cmd_l     , NULL          , NULL          , 2 }, /* write pattern space in 'visually unambiguous form'                                    */
	['n'] = { cmd_n     , NULL          , NULL          , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit)     */
	['N'] = { cmd_N     , NULL          , NULL          , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */
	['p'] = { cmd_p     , NULL          , NULL          , 2 }, /* write pattern space                                                                   */
	['P'] = { cmd_P     , NULL          , NULL          , 2 }, /* write pattern space up to first newline                                               */
	['q'] = { cmd_q     , NULL          , NULL          , 1 }, /* quit                                                                                  */
	['r'] = { cmd_r     , get_r_arg     , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file)                    */
	['s'] = { cmd_s     , get_s_arg     , free_s_arg    , 2 }, /* find/replace/all that crazy s stuff                                                   */
	['t'] = { cmd_t     , get_bt_arg    , NULL          , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */
	['w'] = { cmd_w     , get_w_arg     , NULL          , 2 }, /* append pattern space to file                                                          */
	['x'] = { cmd_x     , NULL          , NULL          , 2 }, /* exchange pattern and hold spaces                                                      */
	['y'] = { cmd_y     , get_y_arg     , free_y_arg    , 2 }, /* replace runes in set1 with runes in set2                                              */
	[':'] = { cmd_colon , get_colon_arg , NULL          , 0 }, /* defines label for later b and t commands                                              */
	['='] = { cmd_equal , NULL          , NULL          , 1 }, /* printf("%d\n", line_number);                                                          */
	['{'] = { cmd_lbrace, get_lbrace_arg, NULL          , 2 }, /* if we match, run commands, otherwise jump to close                                    */
	['}'] = { cmd_rbrace, get_rbrace_arg, NULL          , 0 }, /* noop, hold onto open for ease of building scripts                                     */

	[0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */
};
    262 
    263 /*
    264  * Function Definitions
    265  */
    266 
    267 /* given memory pointed to by *ptr that currently holds *nmemb members of size
    268  * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one
    269  * past old end in *next. if realloc fails...explode
    270  */
    271 static void
    272 resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next)
    273 {
    274 	void *n, *tmp;
    275 
    276 	if (new_nmemb) {
    277 		tmp = ereallocarray(*ptr, new_nmemb, size);
    278 	} else { /* turns out realloc(*ptr, 0) != free(*ptr) */
    279 		free(*ptr);
    280 		tmp = NULL;
    281 	}
    282 	n = (char *)tmp + *nmemb * size;
    283 	*nmemb = new_nmemb;
    284 	*ptr   = tmp;
    285 	if (next)
    286 		*next = n;
    287 }
    288 
    289 static void *
    290 pop(Vec *v)
    291 {
    292 	if (!v->size)
    293 		return NULL;
    294 	return v->data[--v->size];
    295 }
    296 
    297 static void
    298 push(Vec *v, void *p)
    299 {
    300 	if (v->size == v->cap)
    301 		resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL);
    302 	v->data[v->size++] = p;
    303 }
    304 
    305 static void
    306 stracat(String *dst, char *src)
    307 {
    308 	int new = !dst->cap;
    309 	size_t len;
    310 
    311 	len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1;
    312 	if (dst->cap < len)
    313 		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
    314 	if (new)
    315 		*dst->str = '\0';
    316 	strcat(dst->str, src);
    317 }
    318 
    319 static void
    320 strnacat(String *dst, char *src, size_t n)
    321 {
    322 	int new = !dst->cap;
    323 	size_t len;
    324 
    325 	len = strlen(src);
    326 	len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1;
    327 	if (dst->cap < len)
    328 		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
    329 	if (new)
    330 		*dst->str = '\0';
    331 	strlcat(dst->str, src, len);
    332 }
    333 
    334 static void
    335 stracpy(String *dst, char *src)
    336 {
    337 	size_t len;
    338 
    339 	len = strlen(src) + 1;
    340 	if (dst->cap < len)
    341 		resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
    342 	strcpy(dst->str, src);
    343 }
    344 
    345 static void
    346 leprintf(char *s)
    347 {
    348 	if (errno)
    349 		eprintf("%zu: %s: %s\n", lineno, s, strerror(errno));
    350 	else
    351 		eprintf("%zu: %s\n", lineno, s);
    352 }
    353 
/* print the three accepted invocation forms through eprintf()
 * NOTE(review): the old "FIXME: write usage message" looks stale, the message
 * is written; eprintf() presumably terminates the program - confirm in util.h
 */
static void
usage(void)
{
	eprintf("usage: sed [-nrE] script [file ...]\n"
	        "       sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n"
	        "       sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n");
}
    362 
    363 /* Differences from POSIX
    364  * we allows semicolons and trailing blanks inside {}
    365  * we allow spaces after ! (and in between !s)
    366  * we allow extended regular expressions (-E)
    367  */
    368 static void
    369 compile(char *s, int isfile)
    370 {
    371 	FILE *f;
    372 
    373 	if (!isfile && !*s) /* empty string script */
    374 		return;
    375 
    376 	f = isfile ? fopen(s, "r") : fmemopen(s, strlen(s), "r");
    377 	if (!f)
    378 		eprintf("fopen/fmemopen:");
    379 
    380 	/* NOTE: get arg functions can't use genbuf */
    381 	while (read_line(f, &genbuf) != EOF) {
    382 		s = genbuf.str;
    383 
    384 		/* if the first two characters of the script are "#n" default output shall be suppressed */
    385 		if (++lineno == 1 && *s == '#' && s[1] == 'n') {
    386 			gflags.n = 1;
    387 			continue;
    388 		}
    389 
    390 		if (gflags.aci_cont) {
    391 			aci_append(pc - 1, s);
    392 			continue;
    393 		}
    394 		if (gflags.s_cont)
    395 			s = (pc - 1)->fninfo->getarg(pc - 1, s);
    396 
    397 		while (*s) {
    398 			s = chompr(s, ';');
    399 			if (!*s || *s == '#')
    400 				break;
    401 
    402 			if ((size_t)(pc - prog) == pcap)
    403 				resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc);
    404 
    405 			pc->range.beg.type = pc->range.end.type = IGNORE;
    406 			pc->fninfo = NULL;
    407 			pc->in_match = 0;
    408 
    409 			s = make_range(&pc->range, s);
    410 			s = chomp(s);
    411 			pc->negate = *s == '!';
    412 			s = chompr(s, '!');
    413 
    414 			if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn)
    415 				leprintf("bad sed function");
    416 			if (pc->range.naddr > pc->fninfo->naddr)
    417 				leprintf("wrong number of addresses");
    418 			s++;
    419 
    420 			if (pc->fninfo->getarg)
    421 				s = pc->fninfo->getarg(pc, s);
    422 
    423 			pc++;
    424 		}
    425 	}
    426 
    427 	fshut(f, s);
    428 }
    429 
    430 /* FIXME: if we decide to honor lack of trailing newline, set/clear a global
    431  * flag when reading a line
    432  */
    433 static int
    434 read_line(FILE *f, String *s)
    435 {
    436 	ssize_t len;
    437 
    438 	if (!f)
    439 		return EOF;
    440 
    441 	if ((len = getline(&s->str, &s->cap, f)) < 0) {
    442 		if (ferror(f))
    443 			eprintf("getline:");
    444 		return EOF;
    445 	}
    446 	if (s->str[--len] == '\n')
    447 		s->str[len] = '\0';
    448 	return 0;
    449 }
    450 
    451 /* read first range from s, return pointer to one past end of range */
    452 static char *
    453 make_range(Range *range, char *s)
    454 {
    455 	s = make_addr(&range->beg, s);
    456 
    457 	if (*s == ',')
    458 		s = make_addr(&range->end, s + 1);
    459 	else
    460 		range->end.type = IGNORE;
    461 
    462 	if      (range->beg.type == EVERY  && range->end.type == IGNORE) range->naddr = 0;
    463 	else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1;
    464 	else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2;
    465 	else leprintf("this is impossible...");
    466 
    467 	return s;
    468 }
    469 
    470 /* read first addr from s, return pointer to one past end of addr */
    471 static char *
    472 make_addr(Addr *addr, char *s)
    473 {
    474 	Rune r;
    475 	char *p = s + strlen(s);
    476 	size_t rlen = echarntorune(&r, s, p - s);
    477 
    478 	if (r == '$') {
    479 		addr->type = LAST;
    480 		s += rlen;
    481 	} else if (isdigitrune(r)) {
    482 		addr->type = LINE;
    483 		addr->u.lineno = stol(s, &s);
    484 	} else if (r == '/' || r == '\\') {
    485 		Rune delim;
    486 		if (r == '\\') {
    487 			s += rlen;
    488 			rlen = echarntorune(&r, s, p - s);
    489 		}
    490 		if (r == '\\')
    491 			leprintf("bad delimiter '\\'");
    492 		delim = r;
    493 		s += rlen;
    494 		rlen = echarntorune(&r, s, p - s);
    495 		if (r == delim) {
    496 			addr->type = LASTRE;
    497 			s += rlen;
    498 		} else {
    499 			addr->type = REGEX;
    500 			p = find_delim(s, delim, 1);
    501 			if (!*p)
    502 				leprintf("unclosed regex");
    503 			p -= escapes(s, p, delim, 0);
    504 			*p++ = '\0';
    505 			addr->u.re = emalloc(sizeof(*addr->u.re));
    506 			eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
    507 			s = p;
    508 		}
    509 	} else {
    510 		addr->type = EVERY;
    511 	}
    512 
    513 	return s;
    514 }
    515 
/* return pointer to first delim in s that is not escaped
 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
 * return pointer to trailing nul byte if no delim found
 *
 * the string is walked one rune at a time through a small state machine; the
 * order of the tests in the loop matters, each rune takes the first matching
 * transition of its chain
 * s is not modified, invalid UTF-8 is reported by echarntorune()
 *
 * any escaped character that is not special is just itself (POSIX undefined)
 * FIXME: pull out into some util thing, will be useful for ed as well
 */
static char *
find_delim(char *s, Rune delim, int do_brackets)
{
	enum {
		OUTSIDE         , /* not in brackets */
		BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
		BRACKETS_INSIDE , /* inside [] */
		INSIDE_OPENING  , /* inside [] and last char was [ */
		CLASS_INSIDE    , /* inside class [::], or collating element [..] or [==], inside [] */
		CLASS_CLOSING   , /* inside class [::], or collating element [..] or [==], and last character was the respective : . or = */
	} state = OUTSIDE;

	Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
	size_t rlen;
	int escape = 0; /* previous rune was an unescaped backslash (only tracked OUTSIDE) */
	char *end = s + strlen(s);

	for (; *s; s += rlen) {
		rlen = echarntorune(&r, s, end - s);

		/* right after the opening [ a ^ and a ] are ordinary members, anything
		 * else falls through to be examined as the first rune inside */
		if      (state == BRACKETS_OPENING       &&  r == '^'  ) {                            continue; }
		else if (state == BRACKETS_OPENING       &&  r == ']'  ) { state  = BRACKETS_INSIDE ; continue; }
		else if (state == BRACKETS_OPENING                     ) { state  = BRACKETS_INSIDE ;           }

		/* c remembers which of : . = opened the class so only the same rune closes it */
		if      (state == CLASS_CLOSING          &&  r == ']'  ) { state  = BRACKETS_INSIDE ;           }
		else if (state == CLASS_CLOSING                        ) { state  = CLASS_INSIDE    ;           }
		else if (state == CLASS_INSIDE           &&  r ==  c   ) { state  = CLASS_CLOSING   ;           }
		else if (state == INSIDE_OPENING         && (r == ':'  ||
		                                             r == '.'  ||
		                                             r == '=') ) { state  = CLASS_INSIDE    ; c = r;    }
		else if (state == INSIDE_OPENING         &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == INSIDE_OPENING                       ) { state  = BRACKETS_INSIDE ;           }
		else if (state == BRACKETS_INSIDE        &&  r == '['  ) { state  = INSIDE_OPENING  ;           }
		else if (state == BRACKETS_INSIDE        &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == OUTSIDE                &&  escape    ) { escape = 0               ;           }
		else if (state == OUTSIDE                &&  r == '\\' ) { escape = 1               ;           }
		else if (state == OUTSIDE                &&  r == delim) return s;
		else if (state == OUTSIDE && do_brackets &&  r == '['  ) { state  = BRACKETS_OPENING;           }
	}
	return s;
}
    564 
    565 static char *
    566 chomp(char *s)
    567 {
    568 	return chompr(s, 0);
    569 }
    570 
    571 /* eat all leading whitespace and occurrences of rune */
    572 static char *
    573 chompr(char *s, Rune rune)
    574 {
    575 	Rune   r;
    576 	size_t rlen;
    577 	char  *end = s + strlen(s);
    578 
    579 	while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune))
    580 		s += rlen;
    581 	return s;
    582 }
    583 
    584 /* convert first nrunes Runes from UTF-8 string s in allocated Rune*
    585  * NOTE: sequence must be valid UTF-8, check first */
    586 static Rune *
    587 strtorunes(char *s, size_t nrunes)
    588 {
    589 	Rune *rs, *rp;
    590 
    591 	rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs));
    592 
    593 	while (nrunes--)
    594 		s += chartorune(rp++, s);
    595 
    596 	*rp = '\0';
    597 	return rs;
    598 }
    599 
    600 static long
    601 stol(char *s, char **endp)
    602 {
    603 	long n;
    604 	errno = 0;
    605 	n = strtol(s, endp, 10);
    606 
    607 	if (errno)
    608 		leprintf("strtol:");
    609 	if (*endp == s)
    610 		leprintf("strtol: invalid number");
    611 
    612 	return n;
    613 }
    614 
/* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
 * if delim is 0 all escaped characters represent themselves (aci text)
 * memmove rest of string (beyond end) into place
 * return the number of converted escapes (backslashes removed)
 *
 * the work is done in place: src reads ahead, dst trails behind it, so the
 * text only ever shrinks; callers move their end pointer back by the return
 * value. With delim 0 the return value is always 0 although backslashes are
 * dropped, so it does not give the new length in that case
 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
 */
static size_t
escapes(char *beg, char *end, Rune delim, int n_newline)
{
	size_t num = 0;
	char *src = beg, *dst = beg;

	while (src < end) {
		/* handle escaped backslash specially so we don't think the second
		 * backslash is escaping something */
		if (*src == '\\' && src[1] == '\\') {
			*dst++ = *src++;
			/* with a delimiter the pair is kept, for aci text it collapses to one */
			if (delim)
				*dst++ = *src++;
			else
				src++;
		} else if (*src == '\\' && !delim) {
			/* aci text: drop the backslash, the next byte is copied as is */
			src++;
		} else if (*src == '\\' && src[1]) {
			Rune r;
			size_t rlen;
			num++; /* assume the backslash goes away, undone below if it is kept */
			src++;
			rlen = echarntorune(&r, src, end - src);

			if (r == 'n' && delim == 'n') {
				*src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
			} else if (r == 'n') {
				*src = '\n';
			} else if (r != delim) {
				/* not an escape we convert, keep the backslash */
				*dst++ = '\\';
				num--;
			}

			memmove(dst, src, rlen);
			dst += rlen;
			src += rlen;
		} else {
			*dst++ = *src++;
		}
	}
	/* close the gap: pull everything from end on, nul byte included, down to dst */
	memmove(dst, src, strlen(src) + 1);
	return num;
}
    666 
    667 static size_t
    668 echarntorune(Rune *r, char *s, size_t n)
    669 {
    670 	size_t rlen = charntorune(r, s, n);
    671 	if (!rlen || *r == Runeerror)
    672 		leprintf("invalid UTF-8");
    673 	return rlen;
    674 }
    675 
    676 static void
    677 insert_labels(void)
    678 {
    679 	size_t i;
    680 	Cmd *from, *to;
    681 
    682 	while (branches.size) {
    683 		from = prog + (ptrdiff_t)pop(&branches);
    684 
    685 		if (!from->u.label) {/* no label branch to end of script */
    686 			from->u.jump = pc - 1;
    687 		} else {
    688 			for (i = 0; i < labels.size; i++) {
    689 				to = prog + (ptrdiff_t)labels.data[i];
    690 				if (!strcmp(from->u.label, to->u.label)) {
    691 					from->u.jump = to;
    692 					break;
    693 				}
    694 			}
    695 			if (i == labels.size)
    696 				leprintf("bad label");
    697 		}
    698 	}
    699 }
    700 
    701 /*
    702  * Getargs / Freeargs
    703  * Read argument from s, return pointer to one past last character of argument
    704  */
    705 
    706 /* POSIX compliant
    707  * i\
    708  * foobar
    709  *
    710  * also allow the following non POSIX compliant
    711  * i        # empty line
    712  * ifoobar
    713  * ifoobar\
    714  * baz
    715  *
    716  * FIXME: GNU and busybox discard leading spaces
    717  * i  foobar
    718  * i foobar
    719  * ifoobar
    720  * are equivalent in GNU and busybox. We don't. Should we?
    721  */
    722 static char *
    723 get_aci_arg(Cmd *c, char *s)
    724 {
    725 	c->u.acir.print = check_puts;
    726 	c->u.acir.str = (String){ NULL, 0 };
    727 
    728 	gflags.aci_cont = !!*s; /* no continue flag if empty string */
    729 
    730 	/* neither empty string nor POSIX compliant */
    731 	if (*s && !(*s == '\\' && !s[1]))
    732 		aci_append(c, s);
    733 
    734 	return s + strlen(s);
    735 }
    736 
    737 static void
    738 aci_append(Cmd *c, char *s)
    739 {
    740 	char *end = s + strlen(s), *p = end;
    741 
    742 	gflags.aci_cont = 0;
    743 	while (--p >= s && *p == '\\')
    744 		gflags.aci_cont = !gflags.aci_cont;
    745 
    746 	if (gflags.aci_cont)
    747 		*--end = '\n';
    748 
    749 	escapes(s, end, 0, 0);
    750 	stracat(&c->u.acir.str, s);
    751 }
    752 
    753 static void
    754 free_acir_arg(Cmd *c)
    755 {
    756 	free(c->u.acir.str.str);
    757 }
    758 
    759 /* POSIX dictates that label is rest of line, including semicolons, trailing
    760  * whitespace, closing braces, etc. and can be limited to 8 bytes
    761  *
    762  * I allow a semicolon or closing brace to terminate a label name, it's not
    763  * POSIX compliant, but it's useful and every sed version I've tried to date
    764  * does the same.
    765  *
    766  * FIXME: POSIX dictates that leading whitespace is ignored but trailing
    767  * whitespace is not. This is annoying and we should probably get rid of it.
    768  */
    769 static char *
    770 get_bt_arg(Cmd *c, char *s)
    771 {
    772 	char *p = semicolon_arg(s = chomp(s));
    773 
    774 	if (p != s) {
    775 		c->u.label = estrndup(s, p - s);
    776 	} else {
    777 		c->u.label = NULL;
    778 	}
    779 
    780 	push(&branches, (void *)(c - prog));
    781 
    782 	return p;
    783 }
    784 
    785 /* POSIX dictates file name is rest of line including semicolons, trailing
    786  * whitespace, closing braces, etc. and file name must be preceded by a space
    787  *
    788  * I allow a semicolon or closing brace to terminate a file name and don't
    789  * enforce leading space.
    790  *
    791  * FIXME: decide whether trailing whitespace should be included and fix
    792  * accordingly
    793  */
    794 static char *
    795 get_r_arg(Cmd *c, char *s)
    796 {
    797 	char *p = semicolon_arg(s = chomp(s));
    798 
    799 	if (p == s)
    800 		leprintf("no file name");
    801 
    802 	c->u.acir.str.str = estrndup(s, p - s);
    803 	c->u.acir.print = write_file;
    804 
    805 	return p;
    806 }
    807 
    808 /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX)
    809  *
    810  * FIXME: allow other escapes in regex and replacement? if so change escapes()
    811  */
    812 static char *
    813 get_s_arg(Cmd *c, char *s)
    814 {
    815 	Rune delim, r;
    816 	Cmd buf;
    817 	char *p;
    818 	int esc, lastre;
    819 
    820 	/* s/Find/Replace/Flags */
    821 
    822 	/* Find */
    823 	if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */
    824 		lastre = 0;
    825 		c->u.s.repl = (String){ NULL, 0 };
    826 		c->u.s.occurrence = 1;
    827 		c->u.s.file = NULL;
    828 		c->u.s.p = 0;
    829 
    830 		if (!*s || *s == '\\')
    831 			leprintf("bad delimiter");
    832 
    833 		p = s + strlen(s);
    834 		s += echarntorune(&delim, s, p - s);
    835 		c->u.s.delim = delim;
    836 
    837 		echarntorune(&r, s, p - s);
    838 		if (r == delim) /* empty regex */
    839 			lastre = 1;
    840 
    841 		p = find_delim(s, delim, 1);
    842 		if (!*p)
    843 			leprintf("missing second delimiter");
    844 		p -= escapes(s, p, delim, 0);
    845 		*p = '\0';
    846 
    847 		if (lastre) {
    848 			c->u.s.re = NULL;
    849 		} else {
    850 			c->u.s.re = emalloc(sizeof(*c->u.s.re));
    851 			/* FIXME: different eregcomp that calls fatal */
    852 			eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0);
    853 		}
    854 		s = p + runelen(delim);
    855 	}
    856 
    857 	/* Replace */
    858 	delim = c->u.s.delim;
    859 
    860 	p = find_delim(s, delim, 0);
    861 	p -= escapes(s, p, delim, 0);
    862 	if (!*p) { /* no third delimiter */
    863 		/* FIXME: same backslash counting as aci_append() */
    864 		if (p[-1] != '\\')
    865 			leprintf("missing third delimiter or <backslash><newline>");
    866 		p[-1] = '\n';
    867 		gflags.s_cont = 1;
    868 	} else {
    869 		gflags.s_cont = 0;
    870 	}
    871 
    872 	/* check for bad references in replacement text */
    873 	*p = '\0';
    874 	for (esc = 0, p = s; *p; p++) {
    875 		if (esc) {
    876 			esc = 0;
    877 			if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub)
    878 				leprintf("back reference number greater than number of groups");
    879 		} else if (*p == '\\') {
    880 			esc = 1;
    881 		}
    882 	}
    883 	stracat(&c->u.s.repl, s);
    884 
    885 	if (gflags.s_cont)
    886 		return p;
    887 
    888 	s = p + runelen(delim);
    889 
    890 	/* Flags */
    891 	p = semicolon_arg(s = chomp(s));
    892 
    893 	/* FIXME: currently for simplicity take last of g or occurrence flags and
    894 	 *        ignore multiple p flags. need to fix that */
    895 	for (; s < p; s++) {
    896 		if (isdigit(*s)) {
    897 			c->u.s.occurrence = stol(s, &s);
    898 			s--; /* for loop will advance pointer */
    899 		} else {
    900 			switch (*s) {
    901 			case 'g': c->u.s.occurrence = 0; break;
    902 			case 'p': c->u.s.p = 1;          break;
    903 			case 'w':
    904 				/* must be last flag, take everything up to newline/semicolon
    905 				 * s == p after this */
    906 				s = get_w_arg(&buf, chomp(s+1));
    907 				c->u.s.file = buf.u.file;
    908 				break;
    909 			}
    910 		}
    911 	}
    912 	return p;
    913 }
    914 
    915 static void
    916 free_s_arg(Cmd *c)
    917 {
    918 	if (c->u.s.re)
    919 		regfree(c->u.s.re);
    920 	free(c->u.s.re);
    921 	free(c->u.s.repl.str);
    922 }
    923 
    924 /* see get_r_arg notes */
    925 static char *
    926 get_w_arg(Cmd *c, char *s)
    927 {
    928 	char *p = semicolon_arg(s = chomp(s));
    929 	Wfile *w, **wp;
    930 
    931 	if (p == s)
    932 		leprintf("no file name");
    933 
    934 	for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) {
    935 		if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) {
    936 			c->u.file = (*wp)->file;
    937 			return p;
    938 		}
    939 	}
    940 
    941 	w = emalloc(sizeof(*w));
    942 	w->path = estrndup(s, p - s);
    943 
    944 	if (!(w->file = fopen(w->path, "w")))
    945 		leprintf("fopen failed");
    946 
    947 	c->u.file = w->file;
    948 
    949 	push(&wfiles, w);
    950 	return p;
    951 }
    952 
    953 static char *
    954 get_y_arg(Cmd *c, char *s)
    955 {
    956 	Rune delim;
    957 	char *p = s + strlen(s);
    958 	size_t rlen = echarntorune(&delim, s, p - s);
    959 	size_t nrunes1, nrunes2;
    960 
    961 	c->u.y.set1 = c->u.y.set2 = NULL;
    962 
    963 	s += rlen;
    964 	p = find_delim(s, delim, 0);
    965 	p -= escapes(s, p, delim, 1);
    966 	nrunes1 = utfnlen(s, p - s);
    967 	c->u.y.set1 = strtorunes(s, nrunes1);
    968 
    969 	s = p + rlen;
    970 	p = find_delim(s, delim, 0);
    971 	p -= escapes(s, p, delim, 1);
    972 	nrunes2 = utfnlen(s, p - s);
    973 
    974 	if (nrunes1 != nrunes2)
    975 		leprintf("different set lengths");
    976 
    977 	c->u.y.set2 = strtorunes(s, utfnlen(s, p - s));
    978 
    979 	return p + rlen;
    980 }
    981 
    982 static void
    983 free_y_arg(Cmd *c)
    984 {
    985 	free(c->u.y.set1);
    986 	free(c->u.y.set2);
    987 }
    988 
    989 /* see get_bt_arg notes */
    990 static char *
    991 get_colon_arg(Cmd *c, char *s)
    992 {
    993 	char *p = semicolon_arg(s = chomp(s));
    994 
    995 	if (p == s)
    996 		leprintf("no label name");
    997 
    998 	c->u.label = estrndup(s, p - s);
    999 	push(&labels, (void *)(c - prog));
   1000 	return p;
   1001 }
   1002 
   1003 static char *
   1004 get_lbrace_arg(Cmd *c, char *s)
   1005 {
   1006 	push(&braces, (void *)(c - prog));
   1007 	return s;
   1008 }
   1009 
   1010 static char *
   1011 get_rbrace_arg(Cmd *c, char *s)
   1012 {
   1013 	Cmd *lbrace;
   1014 
   1015 	if (!braces.size)
   1016 		leprintf("extra }");
   1017 
   1018 	lbrace = prog + (ptrdiff_t)pop(&braces);
   1019 	lbrace->u.offset = c - prog;
   1020 	return s;
   1021 }
   1022 
   1023 /* s points to beginning of an argument that may be semicolon terminated
   1024  * return pointer to semicolon or nul byte after string
   1025  * or closing brace as to not force ; before }
   1026  * FIXME: decide whether or not to eat trailing whitespace for arguments that
   1027  *        we allow semicolon/brace termination that POSIX doesn't
   1028  *        b, r, t, w, :
   1029  *        POSIX says trailing whitespace is part of label name, file name, etc.
   1030  *        we should probably eat it
   1031  */
   1032 static char *
   1033 semicolon_arg(char *s)
   1034 {
   1035 	char *p = strpbrk(s, ";}");
   1036 	if (!p)
   1037 		p = s + strlen(s);
   1038 	return p;
   1039 }
   1040 
   1041 static void
   1042 run(void)
   1043 {
   1044 	lineno = 0;
   1045 	if (braces.size)
   1046 		leprintf("extra {");
   1047 
   1048 	/* genbuf has already been initialized, patt will be in new_line
   1049 	 * (or we'll halt) */
   1050 	stracpy(&hold, "");
   1051 
   1052 	insert_labels();
   1053 	next_file();
   1054 	new_line();
   1055 
   1056 	for (pc = prog; !gflags.halt; pc++)
   1057 		pc->fninfo->fn(pc);
   1058 }
   1059 
   1060 /* return true if we are in range for c, set c->in_match appropriately */
   1061 static int
   1062 in_range(Cmd *c)
   1063 {
   1064 	if (match_addr(&c->range.beg)) {
   1065 		if (c->range.naddr == 2) {
   1066 			if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno)
   1067 				c->in_match = 0;
   1068 			else
   1069 				c->in_match = 1;
   1070 		}
   1071 		return !c->negate;
   1072 	}
   1073 	if (c->in_match && match_addr(&c->range.end)) {
   1074 		c->in_match = 0;
   1075 		return !c->negate;
   1076 	}
   1077 	return c->in_match ^ c->negate;
   1078 }
   1079 
   1080 /* return true if addr matches current line */
   1081 static int
   1082 match_addr(Addr *a)
   1083 {
   1084 	switch (a->type) {
   1085 	default:
   1086 	case IGNORE: return 0;
   1087 	case EVERY: return 1;
   1088 	case LINE: return lineno == a->u.lineno;
   1089 	case LAST:
   1090 		while (is_eof(file) && !next_file())
   1091 			;
   1092 		return !file;
   1093 	case REGEX:
   1094 		lastre = a->u.re;
   1095 		return !regexec(a->u.re, patt.str, 0, NULL, 0);
   1096 	case LASTRE:
   1097 		if (!lastre)
   1098 			leprintf("no previous regex");
   1099 		return !regexec(lastre, patt.str, 0, NULL, 0);
   1100 	}
   1101 }
   1102 
   1103 /* move to next input file
   1104  * stdin if first call and no files
   1105  * return 0 for success and 1 for no more files
   1106  */
   1107 static int
   1108 next_file(void)
   1109 {
   1110 	static unsigned char first = 1;
   1111 
   1112 	if (file == stdin)
   1113 		clearerr(file);
   1114 	else if (file)
   1115 		fshut(file, "<file>");
   1116 	file = NULL;
   1117 
   1118 	do {
   1119 		if (!*files) {
   1120 			if (first) /* given no files, default to stdin */
   1121 				file = stdin;
   1122 			/* else we've used all our files, leave file = NULL */
   1123 		} else if (!strcmp(*files, "-")) {
   1124 			file = stdin;
   1125 			files++;
   1126 		} else if (!(file = fopen(*files++, "r"))) {
   1127 			/* warn this file didn't open, but move on to next */
   1128 			weprintf("fopen:");
   1129 		}
   1130 	} while (!file && *files);
   1131 	first = 0;
   1132 
   1133 	return !file;
   1134 }
   1135 
   1136 /* test if stream is at EOF */
   1137 static int
   1138 is_eof(FILE *f)
   1139 {
   1140 	int c;
   1141 
   1142 	if (!f || feof(f))
   1143 		return 1;
   1144 
   1145 	c = fgetc(f);
   1146 	if (c == EOF && ferror(f))
   1147 		eprintf("fgetc:");
   1148 	if (c != EOF && ungetc(c, f) == EOF)
   1149 		eprintf("ungetc EOF\n");
   1150 
   1151 	return c == EOF;
   1152 }
   1153 
   1154 /* perform writes that were scheduled
   1155  * for aci this is check_puts(string, stdout)
   1156  * for r this is write_file(path, stdout)
   1157  */
   1158 static void
   1159 do_writes(void)
   1160 {
   1161 	Cmd *c;
   1162 	size_t i;
   1163 
   1164 	for (i = 0; i < writes.size; i++) {
   1165 		c = writes.data[i];
   1166 		c->u.acir.print(c->u.acir.str.str, stdout);
   1167 	}
   1168 	writes.size = 0;
   1169 }
   1170 
   1171 /* used for r's u.acir.print()
   1172  * FIXME: something like util's concat() would be better
   1173  */
   1174 static void
   1175 write_file(char *path, FILE *out)
   1176 {
   1177 	FILE *in = fopen(path, "r");
   1178 	if (!in) /* no file is treated as empty file */
   1179 		return;
   1180 
   1181 	while (read_line(in, &genbuf) != EOF)
   1182 		check_puts(genbuf.str, out);
   1183 
   1184 	fshut(in, path);
   1185 }
   1186 
   1187 static void
   1188 check_puts(char *s, FILE *f)
   1189 {
   1190 	if (s && fputs(s, f) == EOF)
   1191 		eprintf("fputs:");
   1192 	if (fputs("\n", f) == EOF)
   1193 		eprintf("fputs:");
   1194 }
   1195 
   1196 /* iterate from beg to end updating ranges so we don't miss any commands
   1197  * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
   1198  */
   1199 static void
   1200 update_ranges(Cmd *beg, Cmd *end)
   1201 {
   1202 	while (beg < end)
   1203 		in_range(beg++);
   1204 }
   1205 
   1206 /*
   1207  * Sed functions
   1208  */
   1209 static void
   1210 cmd_a(Cmd *c)
   1211 {
   1212 	if (in_range(c))
   1213 		push(&writes, c);
   1214 }
   1215 
   1216 static void
   1217 cmd_b(Cmd *c)
   1218 {
   1219 	if (!in_range(c))
   1220 		return;
   1221 
   1222 	/* if we jump backwards update to end, otherwise update to destination */
   1223 	update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
   1224 	pc = c->u.jump;
   1225 }
   1226 
   1227 static void
   1228 cmd_c(Cmd *c)
   1229 {
   1230 	if (!in_range(c))
   1231 		return;
   1232 
   1233 	/* write the text on the last line of the match */
   1234 	if (!c->in_match)
   1235 		check_puts(c->u.acir.str.str, stdout);
   1236 	/* otherwise start the next cycle without printing pattern space
   1237 	 * effectively deleting the text */
   1238 	new_next();
   1239 }
   1240 
   1241 static void
   1242 cmd_d(Cmd *c)
   1243 {
   1244 	if (!in_range(c))
   1245 		return;
   1246 
   1247 	new_next();
   1248 }
   1249 
   1250 static void
   1251 cmd_D(Cmd *c)
   1252 {
   1253 	char *p;
   1254 
   1255 	if (!in_range(c))
   1256 		return;
   1257 
   1258 	if ((p = strchr(patt.str, '\n'))) {
   1259 		p++;
   1260 		memmove(patt.str, p, strlen(p) + 1);
   1261 		old_next();
   1262 	} else {
   1263 		new_next();
   1264 	}
   1265 }
   1266 
   1267 static void
   1268 cmd_g(Cmd *c)
   1269 {
   1270 	if (in_range(c))
   1271 		stracpy(&patt, hold.str);
   1272 }
   1273 
   1274 static void
   1275 cmd_G(Cmd *c)
   1276 {
   1277 	if (!in_range(c))
   1278 		return;
   1279 
   1280 	stracat(&patt, "\n");
   1281 	stracat(&patt, hold.str);
   1282 }
   1283 
   1284 static void
   1285 cmd_h(Cmd *c)
   1286 {
   1287 	if (in_range(c))
   1288 		stracpy(&hold, patt.str);
   1289 }
   1290 
   1291 static void
   1292 cmd_H(Cmd *c)
   1293 {
   1294 	if (!in_range(c))
   1295 		return;
   1296 
   1297 	stracat(&hold, "\n");
   1298 	stracat(&hold, patt.str);
   1299 }
   1300 
   1301 static void
   1302 cmd_i(Cmd *c)
   1303 {
   1304 	if (in_range(c))
   1305 		check_puts(c->u.acir.str.str, stdout);
   1306 }
   1307 
   1308 /* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy
   1309  * the "visually unambiguous form" sed(1p)
   1310  */
   1311 static void
   1312 cmd_l(Cmd *c)
   1313 {
   1314 	Rune   r;
   1315 	char  *p, *end;
   1316 	size_t rlen;
   1317 
   1318 	char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */
   1319 		['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b",
   1320 		['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t",
   1321 		['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */
   1322 	};
   1323 
   1324 	if (!in_range(c))
   1325 		return;
   1326 
   1327 	/* FIXME: line wrapping. sed(1p) says "length at which folding occurs is
   1328 	 * unspecified, but should be appropraite for the output device"
   1329 	 * just wrap at 80 Runes?
   1330 	 */
   1331 	for (p = patt.str, end = p + strlen(p); p < end; p += rlen) {
   1332 		if (isascii(*p) && escapes[(unsigned int)*p]) {
   1333 			fputs(escapes[(unsigned int)*p], stdout);
   1334 			rlen = 1;
   1335 		} else if (!(rlen = charntorune(&r, p, end - p))) {
   1336 			/* ran out of chars, print the bytes of the short sequence */
   1337 			for (; p < end; p++)
   1338 				printf("\\%03hho", (unsigned char)*p);
   1339 			break;
   1340 		} else if (r == Runeerror) {
   1341 			for (; rlen; rlen--, p++)
   1342 				printf("\\%03hho", (unsigned char)*p);
   1343 		} else {
   1344 			while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR)
   1345 				;
   1346 			if (ferror(stdout))
   1347 				eprintf("fwrite:");
   1348 		}
   1349 	}
   1350 	check_puts("$", stdout);
   1351 }
   1352 
   1353 static void
   1354 cmd_n(Cmd *c)
   1355 {
   1356 	if (!in_range(c))
   1357 		return;
   1358 
   1359 	if (!gflags.n)
   1360 		check_puts(patt.str, stdout);
   1361 	do_writes();
   1362 	new_line();
   1363 }
   1364 
   1365 static void
   1366 cmd_N(Cmd *c)
   1367 {
   1368 	if (!in_range(c))
   1369 		return;
   1370 	do_writes();
   1371 	app_line();
   1372 }
   1373 
   1374 static void
   1375 cmd_p(Cmd *c)
   1376 {
   1377 	if (in_range(c))
   1378 		check_puts(patt.str, stdout);
   1379 }
   1380 
   1381 static void
   1382 cmd_P(Cmd *c)
   1383 {
   1384 	char *p;
   1385 
   1386 	if (!in_range(c))
   1387 		return;
   1388 
   1389 	if ((p = strchr(patt.str, '\n')))
   1390 		*p = '\0';
   1391 
   1392 	check_puts(patt.str, stdout);
   1393 
   1394 	if (p)
   1395 		*p = '\n';
   1396 }
   1397 
   1398 static void
   1399 cmd_q(Cmd *c)
   1400 {
   1401 	if (!in_range(c))
   1402 		return;
   1403 
   1404 	if (!gflags.n)
   1405 		check_puts(patt.str, stdout);
   1406 	do_writes();
   1407 	gflags.halt = 1;
   1408 }
   1409 
   1410 static void
   1411 cmd_r(Cmd *c)
   1412 {
   1413 	if (in_range(c))
   1414 		push(&writes, c);
   1415 }
   1416 
   1417 static void
   1418 cmd_s(Cmd *c)
   1419 {
   1420 	String tmp;
   1421 	Rune r;
   1422 	size_t plen, rlen, len;
   1423 	char *p, *s, *end;
   1424 	unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0;
   1425 	regex_t *re;
   1426 	regmatch_t *rm, *pmatch = NULL;
   1427 
   1428 	if (!in_range(c))
   1429 		return;
   1430 
   1431 	if (!c->u.s.re && !lastre)
   1432 		leprintf("no previous regex");
   1433 
   1434 	re = c->u.s.re ? c->u.s.re : lastre;
   1435 	lastre = re;
   1436 
   1437 	plen = re->re_nsub + 1;
   1438 	pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t));
   1439 
   1440 	*genbuf.str = '\0';
   1441 	s = patt.str;
   1442 
   1443 	while (!qflag && !regexec(re, s, plen, pmatch, cflags)) {
   1444 		cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */
   1445 		if (!*s) /* match against empty string first time, but not again */
   1446 			qflag = 1;
   1447 
   1448 		/* don't substitute if last match was not empty but this one is.
   1449 		 * s_a*_._g
   1450 		 * foobar -> .f.o.o.b.r.
   1451 		 */
   1452 		if ((last_empty || pmatch[0].rm_eo) &&
   1453 		    (++matches == c->u.s.occurrence || !c->u.s.occurrence)) {
   1454 			/* copy over everything before the match */
   1455 			strnacat(&genbuf, s, pmatch[0].rm_so);
   1456 
   1457 			/* copy over replacement text, taking into account &, backreferences, and \ escapes */
   1458 			for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) {
   1459 				strnacat(&genbuf, p, len);
   1460 				p += len;
   1461 				switch (*p) {
   1462 				default: leprintf("this shouldn't be possible");
   1463 				case '\0':
   1464 					/* we're at the end, back up one so the ++p will put us on
   1465 					 * the null byte to break out of the loop */
   1466 					--p;
   1467 					break;
   1468 				case '&':
   1469 					strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so);
   1470 					break;
   1471 				case '\\':
   1472 					if (isdigit(*++p)) { /* backreference */
   1473 						/* only need to check here if using lastre, otherwise we checked when building */
   1474 						if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub)
   1475 							leprintf("back reference number greater than number of groups");
   1476 						rm = &pmatch[*p - '0'];
   1477 						strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so);
   1478 					} else { /* character after backslash taken literally (well one byte, but it works) */
   1479 						strnacat(&genbuf, p, 1);
   1480 					}
   1481 					break;
   1482 				}
   1483 			}
   1484 		} else {
   1485 			/* not replacing, copy over everything up to and including the match */
   1486 			strnacat(&genbuf, s, pmatch[0].rm_eo);
   1487 		}
   1488 
   1489 		if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */
   1490 			end = s + strlen(s);
   1491 			rlen = charntorune(&r, s, end - s);
   1492 
   1493 			if (!rlen) { /* ran out of bytes, copy short sequence */
   1494 				stracat(&genbuf, s);
   1495 				s = end;
   1496 			} else { /* copy whether or not it's a good rune */
   1497 				strnacat(&genbuf, s, rlen);
   1498 				s += rlen;
   1499 			}
   1500 		}
   1501 		last_empty = !pmatch[0].rm_eo;
   1502 		s += pmatch[0].rm_eo;
   1503 	}
   1504 	free(pmatch);
   1505 
   1506 	if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */
   1507 		return;
   1508 
   1509 	gflags.s = 1;
   1510 
   1511 	stracat(&genbuf, s);
   1512 
   1513 	tmp    = patt;
   1514 	patt   = genbuf;
   1515 	genbuf = tmp;
   1516 
   1517 	if (c->u.s.p)
   1518 		check_puts(patt.str, stdout);
   1519 	if (c->u.s.file)
   1520 		check_puts(patt.str, c->u.s.file);
   1521 }
   1522 
   1523 static void
   1524 cmd_t(Cmd *c)
   1525 {
   1526 	if (!in_range(c) || !gflags.s)
   1527 		return;
   1528 
   1529 	/* if we jump backwards update to end, otherwise update to destination */
   1530 	update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
   1531 	pc = c->u.jump;
   1532 	gflags.s = 0;
   1533 }
   1534 
   1535 static void
   1536 cmd_w(Cmd *c)
   1537 {
   1538 	if (in_range(c))
   1539 		check_puts(patt.str, c->u.file);
   1540 }
   1541 
   1542 static void
   1543 cmd_x(Cmd *c)
   1544 {
   1545 	String tmp;
   1546 
   1547 	if (!in_range(c))
   1548 		return;
   1549 
   1550 	tmp  = patt;
   1551 	patt = hold;
   1552 	hold = tmp;
   1553 }
   1554 
   1555 static void
   1556 cmd_y(Cmd *c)
   1557 {
   1558 	String tmp;
   1559 	Rune r, *rp;
   1560 	size_t n, rlen;
   1561 	char *s, *end, buf[UTFmax];
   1562 
   1563 	if (!in_range(c))
   1564 		return;
   1565 
   1566 	*genbuf.str = '\0';
   1567 	for (s = patt.str, end = s + strlen(s); *s; s += rlen) {
   1568 		if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */
   1569 			stracat(&genbuf, s);
   1570 			break;
   1571 		} else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */
   1572 			strnacat(&genbuf, s, rlen);
   1573 		} else {
   1574 			for (rp = c->u.y.set1; *rp; rp++)
   1575 				if (*rp == r)
   1576 					break;
   1577 			if (*rp) { /* found r in set1, replace with Rune from set2 */
   1578 				n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1));
   1579 				strnacat(&genbuf, buf, n);
   1580 			} else {
   1581 				strnacat(&genbuf, s, rlen);
   1582 			}
   1583 		}
   1584 	}
   1585 	tmp    = patt;
   1586 	patt   = genbuf;
   1587 	genbuf = tmp;
   1588 }
   1589 
   1590 static void
   1591 cmd_colon(Cmd *c)
   1592 {
   1593 }
   1594 
   1595 static void
   1596 cmd_equal(Cmd *c)
   1597 {
   1598 	if (in_range(c))
   1599 		printf("%zu\n", lineno);
   1600 }
   1601 
   1602 static void
   1603 cmd_lbrace(Cmd *c)
   1604 {
   1605 	Cmd *jump;
   1606 
   1607 	if (in_range(c))
   1608 		return;
   1609 
   1610 	/* update ranges on all commands we skip */
   1611 	jump = prog + c->u.offset;
   1612 	update_ranges(c + 1, jump);
   1613 	pc = jump;
   1614 }
   1615 
   1616 static void
   1617 cmd_rbrace(Cmd *c)
   1618 {
   1619 }
   1620 
   1621 /* not actually a sed function, but acts like one, put in last spot of script */
   1622 static void
   1623 cmd_last(Cmd *c)
   1624 {
   1625 	if (!gflags.n)
   1626 		check_puts(patt.str, stdout);
   1627 	do_writes();
   1628 	new_next();
   1629 }
   1630 
   1631 /*
   1632  * Actions
   1633  */
   1634 
   1635 /* read new line, continue current cycle */
   1636 static void
   1637 new_line(void)
   1638 {
   1639 	while (read_line(file, &patt) == EOF) {
   1640 		if (next_file()) {
   1641 			gflags.halt = 1;
   1642 			return;
   1643 		}
   1644 	}
   1645 	gflags.s = 0;
   1646 	lineno++;
   1647 }
   1648 
   1649 /* append new line, continue current cycle
   1650  * FIXME: used for N, POSIX specifies do not print pattern space when out of
   1651  *        input, but GNU does so busybox does as well. Currently we don't.
   1652  *        Should we?
   1653  */
   1654 static void
   1655 app_line(void)
   1656 {
   1657 	while (read_line(file, &genbuf) == EOF) {
   1658 		if (next_file()) {
   1659 			gflags.halt = 1;
   1660 			return;
   1661 		}
   1662 	}
   1663 
   1664 	stracat(&patt, "\n");
   1665 	stracat(&patt, genbuf.str);
   1666 	gflags.s = 0;
   1667 	lineno++;
   1668 }
   1669 
   1670 /* read new line, start new cycle */
   1671 static void
   1672 new_next(void)
   1673 {
   1674 	*patt.str = '\0';
   1675 	update_ranges(pc + 1, prog + pcap);
   1676 	new_line();
   1677 	pc = prog - 1;
   1678 }
   1679 
   1680 /* keep old pattern space, start new cycle */
   1681 static void
   1682 old_next(void)
   1683 {
   1684 	update_ranges(pc + 1, prog + pcap);
   1685 	pc = prog - 1;
   1686 }
   1687 
   1688 int
   1689 main(int argc, char *argv[])
   1690 {
   1691 	char *arg;
   1692 	int ret = 0, script = 0;
   1693 
   1694 	ARGBEGIN {
   1695 	case 'n':
   1696 		gflags.n = 1;
   1697 		break;
   1698 	case 'r':
   1699 	case 'E':
   1700 		gflags.E = 1;
   1701 		break;
   1702 	case 'e':
   1703 		arg = EARGF(usage());
   1704 		compile(arg, 0);
   1705 		script = 1;
   1706 		break;
   1707 	case 'f':
   1708 		arg = EARGF(usage());
   1709 		compile(arg, 1);
   1710 		script = 1;
   1711 		break;
   1712 	default : usage();
   1713 	} ARGEND
   1714 
   1715 	/* no script to run */
   1716 	if (!script && !argc)
   1717 		usage();
   1718 
   1719 	/* no script yet, next argument is script */
   1720 	if (!script)
   1721 		compile(*argv++, 0);
   1722 
   1723 	/* shrink/grow memory to fit and add our last instruction */
   1724 	resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL);
   1725 	pc = prog + pcap - 1;
   1726 	pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 };
   1727 
   1728 	files = argv;
   1729 	run();
   1730 
   1731 	ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
   1732 
   1733 	return ret;
   1734 }