hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

grep.c (15239B)


      1 /*
      2  * grep - search a file for a pattern
      3  *
      4  * Gunnar Ritter, Freiburg i. Br., Germany, April 2001.
      5  */
      6 /*
      7  * Copyright (c) 2003 Gunnar Ritter
      8  *
      9  * This software is provided 'as-is', without any express or implied
     10  * warranty. In no event will the authors be held liable for any damages
     11  * arising from the use of this software.
     12  *
     13  * Permission is granted to anyone to use this software for any purpose,
     14  * including commercial applications, and to alter it and redistribute
     15  * it freely, subject to the following restrictions:
     16  *
     17  * 1. The origin of this software must not be misrepresented; you must not
     18  *    claim that you wrote the original software. If you use this software
     19  *    in a product, an acknowledgment in the product documentation would be
     20  *    appreciated but is not required.
     21  *
     22  * 2. Altered source versions must be plainly marked as such, and must not be
     23  *    misrepresented as being the original software.
     24  *
     25  * 3. This notice may not be removed or altered from any source distribution.
     26  */
     27 
     28 /*	Sccsid @(#)grep.c	1.53 (gritter) 12/27/06>	*/
     29 
     30 /*
     31  * Code common to all grep flavors.
     32  */
     33 
     34 #include	<sys/types.h>
     35 #include	<sys/stat.h>
     36 #include	<sys/mman.h>
     37 #include	<sys/wait.h>
     38 #include	<fcntl.h>
     39 #include	<unistd.h>
     40 #include	<stdio.h>
     41 #include	<string.h>
     42 #include	<stdlib.h>
     43 #include	<libgen.h>
     44 #include	<locale.h>
     45 #include	<limits.h>
     46 #include	<ctype.h>
     47 #include	<dirent.h>
     48 #include	<errno.h>
     49 
     50 #include	"grep.h"
     51 #include	"alloc.h"
     52 
/*
 * Generic flags and the like. The option flags are set by the getopt()
 * loop in main().
 */
int		Eflag;			/* use EREs */
int		Fflag;			/* use fixed strings */
int		bflag;			/* print block number (offset / BSZ) */
int		cflag;			/* print count only */
int		fflag;			/* had pattern file argument */
int		hflag;			/* do not print filenames; main() sets
					   2 for a single file operand, which
					   fngrep() resets for directories */
int		iflag;			/* ignore case */
int		lflag;			/* print filenames only */
int		nflag;			/* print line numbers */
int		qflag;			/* no output at all */
int	(*rflag)(const char *, struct stat *);	/* operate recursively;
					   stat for -r, lstat for -R */
int		sflag;			/* avoid error messages */
int		vflag;			/* inverse selection */
int		wflag;			/* search for words */
int		xflag;			/* match entire line */
int		zflag;			/* decompress compressed files */
int		mb_cur_max;		/* avoid multiple calls to MB_CUR_MAX */
unsigned	status = 1;		/* exit status: 1 until a match is
					   reported (then 0), 2 on errors */
off_t		lmatch;			/* count of line matches, per file */
off_t		lineno;			/* current line number, per file */
char		*progname;		/* argv[0] to main() */
char		*filename;		/* name of current file, NULL when
					   reading standard input */
char		*options;		/* for getopt() */
void		(*build)(void);		/* compile function */
int		(*match)(const char *, size_t); /* comparison function */
int		(*range)(struct iblok *, char *); /* grep range of lines */
     82 
/*
 * Regexp variables.
 */
struct expr	*e0;			/* start of expression list; still NULL
					   in main() if no pattern was stored */
enum matchflags	matchflags;		/* matcher flags; matchline() checks
					   MF_LOCONV and MF_NULTERM */
     88 
/*
 * To avoid link loops with -r. fngrep() stores the device/inode pair of
 * the file handled at each recursion level and refuses to descend into a
 * file that equals an entry of one of the enclosing levels.
 */
static struct	visit {
	ino_t	v_ino;			/* st_ino of the visited file */
	dev_t	v_dev;			/* st_dev of the visited file */
} *visited;
static int	vismax;			/* number of members in visited */
     97 
     98 /*
     99  * Lower-case a character string.
    100  */
    101 size_t
    102 loconv(register char *dst, register char *src, size_t sz)
    103 {
    104 	char	*odst = dst;
    105 
    106 	if (mbcode) {
    107 		char	mb[MB_LEN_MAX];
    108 		wchar_t wc;
    109 		int len, i, nlen;
    110 
    111 		while (sz > 0) {
    112 			if ((*src & 0200) == 0) {
    113 				*dst++ = tolower(*src);
    114 				src++;
    115 				sz--;
    116 			} else if ((len = mbtowc(&wc, src, sz)) <= 0 ||
    117 					len > sz) {
    118 				*dst++ = *src++;
    119 				sz--;
    120 			} else {
    121 				wc = towlower(wc);
    122 				if (len >= mb_cur_max) {
    123 					if ((nlen = wctomb(dst, wc)) <= len) {
    124 						dst += nlen;
    125 						src += len;
    126 						sz -= len;
    127 					} else {
    128 						*dst++ = *src++;
    129 						sz--;
    130 					}
    131 				} else {
    132 					if ((nlen = wctomb(mb, wc)) <= len) {
    133 						sz -= len;
    134 						src += len;
    135 						for (i = 0; i < nlen; i++)
    136 							*dst++ = mb[i];
    137 					} else {
    138 						*dst++ = *src++;
    139 						sz--;
    140 					}
    141 				}
    142 			}
    143 		}
    144 	} else {
    145 		while (sz--) {
    146 			*dst++ = tolower(*src & 0377);
    147 			src++;
    148 		}
    149 	}
    150 	return dst - odst;
    151 }
    152 
    153 /*
    154  * Determine if pat ends with an unescaped dollar sign.
    155  */
    156 static int
    157 termdollar(const char *pat, long len)
    158 {
    159 	int	dollar = 1;
    160 
    161 	if (len == 0 || pat[len - 1] != '$')
    162 		return 0;
    163 	pat += --len - 1;
    164 	while (len-- && *pat-- == '\\')
    165 		dollar = !dollar;
    166 	return dollar;
    167 }
    168 
    169 /*
    170  * Surround the pattern with \< \>.
    171  */
    172 void
    173 wcomp(char **pat, long *len)
    174 {
    175 	char	*wp = smalloc(*len + 5);
    176 
    177 	memcpy(&wp[2], *pat, *len);
    178 	if ((*pat)[0] == '^')
    179 		memcpy(wp, "^\\<", 3);
    180 	else
    181 		memcpy(wp, "\\<", 2);
    182 	if (termdollar(*pat, *len))
    183 		strcpy(&wp[*len-1+2], "\\>$");
    184 	else
    185 		strcpy(&wp[*len+2], "\\>");
    186 	*len += 4;
    187 	*pat = wp;
    188 }
    189 
    190 static struct iblok *
    191 redirect(struct iblok *ip, const char *arg0, const char *arg1)
    192 {
    193 	struct iblok	*nip = NULL;
    194 	int	pd[2];
    195 	pid_t	pid;
    196 
    197 	if (pipe(pd) < 0)
    198 		return NULL;
    199 	switch (pid = fork()) {
    200 	case 0:
    201 		if (lseek(ip->ib_fd, -(ip->ib_end - ip->ib_cur),
    202 					SEEK_CUR) == (off_t)-1) {
    203 			int	xpd[2];
    204 			if (pipe(xpd) == 0 && fork() == 0) {
    205 				ssize_t	rd, wo, wt;
    206 				close(xpd[0]);
    207 				for (;;) {
    208 					rd = ip->ib_end - ip->ib_cur;
    209 					wo = wt = 0;
    210 					do {
    211 						if ((wo = write(xpd[1],
    212 								&ip->ib_cur[wt],
    213 								rd - wt))
    214 								<= 0) {
    215 							if (errno == EINTR)
    216 								continue;
    217 							_exit(0);
    218 						}
    219 						wt += wo;
    220 					} while (wt < rd);
    221 					if (ib_read(ip) == EOF)
    222 						break;
    223 					ip->ib_cur--;
    224 				}
    225 				_exit(0);
    226 			} else {
    227 				close(xpd[1]);
    228 				dup2(xpd[0], 0);
    229 				close(xpd[0]);
    230 			}
    231 		} else {
    232 			if (ip->ib_fd)
    233 				dup2(ip->ib_fd, 0);
    234 		}
    235 		if (ip->ib_fd)
    236 			ib_close(ip);
    237 		else
    238 			ib_free(ip);
    239 		dup2(pd[1], 1);
    240 		close(pd[0]);
    241 		close(pd[1]);
    242 		execlp(arg0, arg0, arg1, NULL);
    243 		fprintf(stderr, "%s: could not exec %s\n", progname, arg0);
    244 		_exit(0177);
    245 		/*NOTREACHED*/
    246 	case -1:
    247 		fprintf(stderr, "%s: cannot fork()\n", progname);
    248 		status = 2;
    249 		return NULL;
    250 	default:
    251 		close(pd[1]);
    252 		nip = ib_alloc(pd[0], 0);
    253 		nip->ib_pid = pid;
    254 		return nip;
    255 	}
    256 }
    257 
    258 /*
    259  * Report a matching line.
    260  */
    261 void
    262 report(const char *line, size_t llen, off_t bcnt, int addnl)
    263 {
    264 	if (filename && !hflag)
    265 		printf("%s:", filename);
    266 #ifdef	LONGLONG
    267 	if (bflag)
    268 		printf("%llu:", (long long)bcnt);
    269 	if (nflag)
    270 		printf("%llu:", (long long)lineno);
    271 #else	/* !LONGLONG */
    272 	if (bflag)
    273 		printf("%lu:", (long)bcnt);
    274 	if (nflag)
    275 		printf("%lu:", (long)lineno);
    276 #endif	/* !LONGLONG */
    277 	if (line && llen)
    278 		fwrite(line, sizeof *line, llen, stdout);
    279 	if (addnl)
    280 		putchar('\n');
    281 }
    282 
    283 /*
    284  * Check line for match. If necessary, the line gets NUL-terminated (so
    285  * its address range must be writable then). When ignoring character case,
    286  * a lower-case-only copy of the line is made instead. If a match is found,
    287  * statistics are printed. Returns 1 if main loop shall terminate, 0 else.
    288  */
    289 static int
    290 matchline(char *line, size_t sz, int putnl, struct iblok *ip)
    291 {
    292 	size_t	csz = sz;
    293 	int terminate = 0;
    294 	char lbuf[512], *abuf = NULL, *cline = line;
    295 
    296 	if (iflag && (matchflags & MF_LOCONV)) {
    297 		if (sz >= sizeof lbuf - 1) {
    298 			abuf = smalloc(sz + 1);
    299 			cline = abuf;
    300 		} else
    301 			cline = lbuf;
    302 		csz = loconv(cline, line, sz);
    303 		cline[csz] = '\0';
    304 	} else if (matchflags & MF_NULTERM)
    305 		cline[sz] = '\0';
    306 	lineno++;
    307 	if (match(cline, csz) ^ vflag) {
    308 		lmatch++;
    309 		if (qflag == 0) {
    310 			if (status == 1)
    311 				status = 0;
    312 			if (lflag) {
    313 				puts(filename ? filename : stdinmsg);
    314 			} else if (!cflag)
    315 				report(line, sz, (ib_offs(ip)-1) / BSZ, putnl);
    316 		} else
    317 			exit(0);
    318 		if (qflag || lflag)
    319 			terminate = 1;
    320 	}
    321 	if (abuf)
    322 		free(abuf);
    323 	return terminate;
    324 }
    325 
    326 /*
    327  * Check all lines within ip->ib_cur and last which contains the last
    328  * newline. If the main loop shall terminate, 1 is returned.
    329  */
    330 static int
    331 gn_range(struct iblok *ip, char *last)
    332 {
    333 	char *nl;
    334 
    335 	while ((nl = memchr(ip->ib_cur, '\n', last + 1 - ip->ib_cur)) != NULL) {
    336 		if (matchline(ip->ib_cur, nl - ip->ib_cur,  1, ip))
    337 			return 1;
    338 		if (nl == last)
    339 			return 0;
    340 		ip->ib_cur = nl + 1;
    341 	}
    342 	return 0;
    343 }
    344 
    345 /*
    346  * Main grep routine. The line buffer herein is only used for overlaps
    347  * between file buffer fills.
    348  */
    349 static struct iblok *
    350 grep(struct iblok *ip)
    351 {
    352 	char *line = NULL;		/* line buffer */
    353 	register char *lastnl;		/* last newline in file buffer */
    354 	size_t sz = 0;			/* length of line in line buffer */
    355 	char *cp;
    356 	int hadnl;			/* lastnl points to newline char */
    357 	int	oom = 0;		/* got out of memory */
    358 
    359 	lineno = lmatch = 0;
    360 	if (ib_read(ip) == EOF)
    361 		goto endgrep;
    362 	ip->ib_cur--;
    363 	if (zflag) {
    364 		struct iblok	*np;
    365 		for (;;) {
    366 			sz = ip->ib_end - ip->ib_cur;
    367 			if (sz > 3 && memcmp(ip->ib_cur, "BZh", 3) == 0)
    368 				np = redirect(ip, "bzip2", "-cd");
    369 			else if (sz > 2 &&
    370 					memcmp(ip->ib_cur, "\37\235", 2) == 0)
    371 				np = redirect(ip, "zcat", NULL);
    372 			else if (sz > 2 &&
    373 					memcmp(ip->ib_cur, "\37\213", 2) == 0)
    374 				np = redirect(ip, "gzip", "-cd");
    375 			else
    376 				break;
    377 			if (np == NULL)
    378 				break;
    379 			if (ip->ib_fd)
    380 				ib_close(ip);
    381 			else
    382 				ib_free(ip);
    383 			ip = np;
    384 			if (ib_read(ip) == EOF)
    385 				goto endgrep;
    386 			ip->ib_cur--;
    387 		}
    388 	}
    389 	for (;;) {
    390 		for (lastnl = ip->ib_end - 1;
    391 				*lastnl != '\n' && lastnl > ip->ib_cur;
    392 				lastnl--);
    393 		if (hadnl = (ip->ib_cur < ip->ib_end && *lastnl == '\n'))
    394 			if (range(ip, lastnl))
    395 				break;
    396 		if (lastnl < ip->ib_end - hadnl) {
    397 			/*
    398 			 * Copy the partial line from file buffer to line
    399 			 * buffer. Allocate enough space to zero-terminate
    400 			 * the line later if necessary.
    401 			 */
    402 			sz = ip->ib_end - lastnl - hadnl;
    403 			line = smalloc(sz + 1);
    404 			memcpy(line, lastnl + hadnl, sz);
    405 			ip->ib_cur = lastnl + hadnl;
    406 		} else
    407 			line = NULL;
    408 nextbuf:
    409 		if (ib_read(ip) == EOF) {
    410 			if (line) {
    411 				matchline(line, sz, sus, ip);
    412 				free(line);
    413 				line = NULL;
    414 				sz = 0;
    415 			}
    416 			break;
    417 		}
    418 		ip->ib_cur--;
    419 		if (line) {
    420 			/*
    421 			 * Append the partial line at the beginning of the
    422 			 * file buffer to the line buffer.
    423 			 */
    424 			size_t oldsz = sz;
    425 			if ((cp = memchr(ip->ib_cur, '\n',
    426 					ip->ib_end - ip->ib_cur)) == NULL) {
    427 				char	*nline;
    428 				/*
    429 				 * Ugh. This is really a huge line. Store the
    430 				 * entire file buffer in the line buffer and
    431 				 * read the next part of the file.
    432 				 */
    433 				sz += ip->ib_end - ip->ib_cur;
    434 				if ((nline = realloc(line, sz + 1)) == NULL) {
    435 					sz = oldsz;
    436 					cp = &ip->ib_end[-1];
    437 					oom++;
    438 				} else {
    439 					line = nline;
    440 					memcpy(line + oldsz, ip->ib_cur,
    441 						ip->ib_end - ip->ib_cur);
    442 					goto nextbuf;
    443 				}
    444 			}
    445 			if ((sz = cp - ip->ib_cur) > 0) {
    446 				char	*nline;
    447 				sz += oldsz;
    448 				if ((nline = realloc(line, sz + 1)) == NULL) {
    449 					sz = oldsz;
    450 					oom++;
    451 				} else {
    452 					line = nline;
    453 					memcpy(line + oldsz, ip->ib_cur,
    454 							cp - ip->ib_cur);
    455 				}
    456 			} else
    457 				sz = oldsz;
    458 			if (matchline(line, sz, 1, ip))
    459 				break;
    460 			free(line);
    461 			line = NULL;
    462 			sz = 0;
    463 			ip->ib_cur = cp + (oom == 0);
    464 			oom = 0;
    465 		}
    466 	}
    467 endgrep:
    468 	if (!qflag && cflag) {
    469 		if (filename && !hflag)
    470 			printf("%s:", filename);
    471 #ifdef	LONGLONG
    472 		printf("%llu\n", (long long)lmatch);
    473 #else
    474 		printf("%lu\n", (long)lmatch);
    475 #endif
    476 	}
    477 	return ip;
    478 }
    479 
    480 /*
    481  * Grep a named file.
    482  */
    483 static void
    484 fngrep(const char *fn, int level)
    485 {
    486 	struct iblok	*ip;
    487 	struct stat	st;
    488 	int	i;
    489 
    490 	if (rflag && fn && (level ? rflag : stat)(fn, &st) == 0) {
    491 		if (rflag != lstat) {
    492 			for (i = 0; i < level; i++)
    493 				if (st.st_dev == visited[i].v_dev &&
    494 						st.st_ino == visited[i].v_ino)
    495 					return;
    496 			if (level >= vismax) {
    497 				vismax += 20;
    498 				visited = srealloc(visited, sizeof *visited *
    499 						vismax);
    500 			}
    501 			visited[level].v_dev = st.st_dev;
    502 			visited[level].v_ino = st.st_ino;
    503 		}
    504 	mode:	switch (st.st_mode&S_IFMT) {
    505 #define	ignoring(t, s)	fprintf(stderr, "%s: ignoring %s %s\n", progname, t, s)
    506 		case S_IFIFO:
    507 			ignoring("named pipe", fn);
    508 			return;
    509 		case S_IFBLK:
    510 			ignoring("block device", fn);
    511 			return;
    512 		case S_IFCHR:
    513 			ignoring("block device", fn);
    514 			return;
    515 #ifdef	S_IFSOCK
    516 		case S_IFSOCK:
    517 			ignoring("socket", fn);
    518 			return;
    519 #endif	/* S_IFSOCK */
    520 		case S_IFLNK:
    521 			if (stat(fn, &st) < 0 || (st.st_mode&S_IFMT) == S_IFDIR)
    522 				return;
    523 			goto mode;
    524 		default:
    525 			break;
    526 		case S_IFDIR: {
    527 			char	*path;
    528 			int	pend, psize, i;
    529 			DIR	*df;
    530 			struct dirent	*dp;
    531 
    532 			if (hflag == 2)
    533 				hflag = 0;
    534 			if ((df = opendir(fn)) == NULL) {
    535 				if (sflag == 0)
    536 					fprintf(stderr, "%s: can't open "
    537 							"directory %s\n",
    538 							progname, fn);
    539 				if (!qflag || status == 1)
    540 					status = 2;
    541 				return;
    542 			}
    543 			pend = strlen(fn);
    544 			path = malloc(psize = pend + 2);
    545 			strcpy(path, fn);
    546 			path[pend++] = '/';
    547 			while ((dp = readdir(df)) != NULL) {
    548 				if (dp->d_name[0] == '.' &&
    549 						(dp->d_name[1] == '\0' ||
    550 					 	dp->d_name[1] == '.' &&
    551 					 	dp->d_name[2] == '\0'))
    552 					continue;
    553 				i = 0;
    554 				do {
    555 					if (pend + i >= psize)
    556 						path = srealloc(path,
    557 								psize += 14);
    558 					path[pend+i] = dp->d_name[i];
    559 				} while (dp->d_name[i++]);
    560 				filename = path;
    561 				fngrep(path, level+1);
    562 			}
    563 			free(path);
    564 			closedir(df);
    565 			return;
    566 		    }
    567 		}
    568 	}
    569 	if (fn) {
    570 		if ((ip = ib_open(fn, 0)) == NULL) {
    571 			if (sflag == 0)
    572 				fprintf(stderr, "%s: can't open %s\n",
    573 						progname, fn);
    574 			if (!qflag || status == 1)
    575 				status = 2;
    576 			return;
    577 		}
    578 	} else
    579 		ip = ib_alloc(0, 0);
    580 	ip = grep(ip);
    581 	if (ip->ib_fd) {
    582 		ib_close(ip);
    583 		if (zflag && ip->ib_pid) {
    584 			int	s;
    585 			waitpid(ip->ib_pid, &s, 0);
    586 			if (s)
    587 				status = 2;
    588 		}
    589 	} else
    590 		ib_free(ip);
    591 }
    592 
    593 int
    594 main(int argc, char **argv)
    595 {
    596 	int i, hadpat = 0;
    597 
    598 #ifdef	__GLIBC__
    599 	putenv("POSIXLY_CORRECT=1");
    600 #endif
    601 	progname = basename(argv[0]);
    602 	setlocale(LC_COLLATE, "");
    603 	setlocale(LC_CTYPE, "");
    604 	mb_cur_max = MB_CUR_MAX;
    605 	range = gn_range;
    606 	init();
    607 	while ((i = getopt(argc, argv, options)) != EOF) {
    608 		switch (i) {
    609 		case 'E':
    610 			Eflag |= 1;
    611 			rc_select();
    612 			break;
    613 		case 'F':
    614 			if (Eflag&2)
    615 				Eflag = 0;
    616 			Fflag |= 1;
    617 			ac_select();
    618 			break;
    619 		case 'b':
    620 			bflag = 1;
    621 			break;
    622 		case 'c':
    623 			cflag = 1;
    624 			break;
    625 		case 'e':
    626 			patstring(optarg);
    627 			hadpat++;
    628 			break;
    629 		case 'f':
    630 			fflag++;
    631 			patfile(optarg);
    632 			hadpat++;
    633 			break;
    634 		case 'h':
    635 			hflag = 1;
    636 			break;
    637 		case 'i':
    638 		case 'y':
    639 			iflag = 1;
    640 			break;
    641 		case 'l':
    642 			lflag = 1;
    643 			break;
    644 		case 'n':
    645 			nflag = 1;
    646 			break;
    647 		case 'q':
    648 			qflag = 1;
    649 			break;
    650 		case 'r':
    651 			rflag = stat;
    652 			break;
    653 		case 'R':
    654 			rflag = lstat;
    655 			break;
    656 		case 's':
    657 			sflag = 1;
    658 			break;
    659 		case 'v':
    660 			vflag = 1;
    661 			break;
    662 		case 'w':
    663 			wflag = 1;
    664 			break;
    665 		case 'x':
    666 			xflag = 1;
    667 			break;
    668 		case 'z':
    669 			zflag = 1;
    670 			break;
    671 		default:
    672 			if (!(Fflag&2))
    673 				usage();
    674 			status = 2;
    675 		}
    676 	}
    677 	if (sus) {
    678 		if (Fflag == 2) {
    679 			if (sflag) {
    680 				optind = 1;
    681 				argv[1] = "-s";
    682 				getopt(argc, argv, "");
    683 				usage();
    684 			}
    685 			if (qflag) {
    686 				optind = 1;
    687 				argv[1] = "-q";
    688 				getopt(argc, argv, "");
    689 				usage();
    690 			}
    691 		}
    692 		if (Fflag && status == 2)
    693 			usage();
    694 		if (Eflag == 1 && Fflag == 1 || cflag + lflag + qflag > 1)
    695 			usage();
    696 		if (wflag && (Eflag || Fflag))
    697 			usage();
    698 	}
    699 	if (cflag)
    700 		lflag = 0;
    701 	if (hadpat == 0) {
    702 		if (optind >= argc)
    703 			misop();
    704 		patstring(argv[optind++]);
    705 	} else if (e0 == NULL)
    706 		patstring(NULL);
    707 	build();
    708 	if (optind != argc) {
    709 		if (optind + 1 == argc)
    710 			hflag = 2;
    711 		do {
    712 			if (sus && argv[optind][0] == '-' &&
    713 					argv[optind][1] == '\0') {
    714 				filename = NULL;
    715 				fngrep(NULL, 0);
    716 			} else {
    717 				filename = argv[optind];
    718 				fngrep(argv[optind], 0);
    719 			}
    720 		} while (++optind < argc);
    721 	} else {
    722 		if (lflag && !sus && (Eflag || Fflag))
    723 			exit(1);
    724 		fngrep(NULL, 0);
    725 	}
    726 	return status;
    727 }