commit bf4c5d1fcdae00b01fe2bf8a2cd8601e8e882a94
parent 1cdb424ac63ebd7eb5eda1b8af53871d89e1ead9
Author: Evan Gates <evan.gates@gmail.com>
Date: Wed, 25 Feb 2015 20:12:14 -0800
expr comments, cleanup, UTF-8 support
Diffstat:
M | README | | | 2 | +- |
M | expr.c | | | 279 | ++++++++++++++++++++++++++++++++++++++++--------------------------------------- |
2 files changed, 142 insertions(+), 139 deletions(-)
diff --git a/README b/README
@@ -29,7 +29,7 @@ The following tools are implemented ('*' == finished, '#' == UTF-8 support,
=* echo yes none
=* env yes none
#* expand yes none
-=* expr yes none
+#* expr yes none
=* false yes none
= find yes none
#* fold yes none
diff --git a/expr.c b/expr.c
@@ -1,31 +1,23 @@
/* See LICENSE file for copyright and license details. */
#include <inttypes.h>
#include <stdio.h>
-#include <stdlib.h>
#include <string.h>
+#include "utf.h"
#include "util.h"
+/* token types for lexing/parsing
+ * single character operators represent themselves */
enum {
VAL = CHAR_MAX + 1, GE, LE, NE
};
typedef struct {
- char *s;
+ char *s; /* iff s is NULL, Val is an integer */
intmax_t n;
} Val;
-static void enan(Val);
-static void ezero(intmax_t);
-static void doop(int*, int**, Val*, Val**);
-static Val match(Val, Val);
-static int valcmp(Val, Val);
-static char *valstr(Val, char *, size_t);
-static int lex(char *);
-static int parse(char **, int);
-
static size_t intlen;
-static Val lastval;
static void
enan(Val v)
@@ -41,53 +33,117 @@ ezero(intmax_t n)
enprintf(2, "division by zero\n");
}
+static char *
+valstr(Val val, char *buf, size_t bufsiz)
+{
+ if (val.s)
+ return val.s;
+ snprintf(buf, bufsiz, "%"PRIdMAX, val.n);
+ return buf;
+}
+
+static int
+valcmp(Val a, Val b)
+{
+ char buf1[intlen], buf2[intlen];
+ char *astr = valstr(a, buf1, sizeof(buf1));
+ char *bstr = valstr(b, buf2, sizeof(buf2));
+
+ if (!a.s && !b.s)
+ return (a.n > b.n) - (a.n < b.n);
+ return strcmp(astr, bstr);
+}
+
+/* match vstr against BRE vregx (treat both values as strings)
+ * if there is at least one subexpression \(...\)
+ * then return the text matched by it \1 (empty string for no match)
+ * else return number of characters matched (0 for no match)
+ */
+static Val
+match(Val vstr, Val vregx)
+{
+ regex_t re;
+ regmatch_t matches[2];
+ char buf1[intlen], buf2[intlen];
+ char *str = valstr(vstr, buf1, sizeof(buf1));
+ char *regx = valstr(vregx, buf2, sizeof(buf2));;
+ char anchreg[strlen(regx) + 2];
+
+ /* expr(1p) "all patterns are anchored to the beginning of the string" */
+ snprintf(anchreg, sizeof(anchreg), "^%s", regx);
+ enregcomp(3, &re, anchreg, 0);
+
+ if (regexec(&re, str, 2, matches, 0)) {
+ regfree(&re);
+ return (Val){ (re.re_nsub ? "" : NULL), 0 };
+ }
+
+ if (re.re_nsub) {
+ intmax_t d;
+ char *s = str + matches[1].rm_so, *p = str + matches[1].rm_eo;
+ regfree(&re);
+
+ *p = '\0';
+ d = strtoimax(s, &p, 10);
+ if (*s && !*p) /* string matched by subexpression is an integer */
+ return (Val){ NULL, d };
+
+ /* FIXME? string is never free()d, worth fixing?
+ * need to allocate as it could be in buf1 instead of vstr.s */
+ return (Val){ enstrdup(3, s), 0 };
+ }
+ regfree(&re);
+ str += matches[0].rm_so;
+ return (Val){ NULL, utfnlen(str, matches[0].rm_eo - matches[0].rm_so) };
+}
+
+/* ops points to a stack of operators, opp points to one past the last op
+ * vals points to a stack of values , valp points to one past the last val
+ * guaranteed that opp != ops
+ * ops is unused here, but still included for parity with vals
+ * pop operator, pop two values, apply operator, push result
+ */
static void
-doop(int *op, int **opp, Val *val, Val **valp)
+doop(int *ops, int **opp, Val *vals, Val **valp)
{
Val ret, a, b;
- int o;
+ int op;
/* For an operation, we need a valid operator
* and two values on the stack */
if ((*opp)[-1] == '(')
enprintf(2, "syntax error: extra (\n");
- if (*valp - val < 2)
+ if (*valp - vals < 2)
enprintf(2, "syntax error: missing expression or extra operator\n");
a = (*valp)[-2];
b = (*valp)[-1];
- o = (*opp)[-1];
+ op = (*opp)[-1];
- switch (o) {
+ switch (op) {
case '|':
- if (a.s && *a.s) {
- ret = (Val){ a.s, 0 };
- } else if (!a.s && a.n) {
- ret = (Val){ NULL, a.n };
- } else if (b.s && *b.s) {
- ret = (Val){ b.s, 0 };
- } else {
- ret = (Val){ NULL, b.n };
- }
+ if ( a.s && *a.s) ret = (Val){ a.s , 0 };
+ else if (!a.s && a.n) ret = (Val){ NULL, a.n };
+ else if ( b.s && *b.s) ret = (Val){ b.s , 0 };
+ else ret = (Val){ NULL, b.n };
break;
case '&':
- if (((a.s && *a.s) || a.n) &&
- ((b.s && *b.s) || b.n)) {
+ if (((a.s && *a.s) || a.n) && ((b.s && *b.s) || b.n))
ret = a;
- } else {
+ else
ret = (Val){ NULL, 0 };
- }
break;
+
case '=': ret = (Val){ NULL, valcmp(a, b) == 0 }; break;
case '>': ret = (Val){ NULL, valcmp(a, b) > 0 }; break;
- case GE: ret = (Val){ NULL, valcmp(a, b) >= 0 }; break;
+ case GE : ret = (Val){ NULL, valcmp(a, b) >= 0 }; break;
case '<': ret = (Val){ NULL, valcmp(a, b) < 0 }; break;
- case LE: ret = (Val){ NULL, valcmp(a, b) <= 0 }; break;
- case NE: ret = (Val){ NULL, valcmp(a, b) != 0 }; break;
+ case LE : ret = (Val){ NULL, valcmp(a, b) <= 0 }; break;
+ case NE : ret = (Val){ NULL, valcmp(a, b) != 0 }; break;
- case '+': enan(a); enan(b); ret = (Val){ NULL, a.n + b.n }; break;
- case '-': enan(a); enan(b); ret = (Val){ NULL, a.n - b.n }; break;
- case '*': enan(a); enan(b); ret = (Val){ NULL, a.n * b.n }; break;
+ case '+': enan(a); enan(b); ret = (Val){ NULL, a.n + b.n }; break;
+ case '-': enan(a); enan(b); ret = (Val){ NULL, a.n - b.n }; break;
+ case '*': enan(a); enan(b); ret = (Val){ NULL, a.n * b.n }; break;
case '/': enan(a); enan(b); ezero(b.n); ret = (Val){ NULL, a.n / b.n }; break;
case '%': enan(a); enan(b); ezero(b.n); ret = (Val){ NULL, a.n % b.n }; break;
@@ -99,110 +155,50 @@ doop(int *op, int **opp, Val *val, Val **valp)
(*valp)--;
}
-static Val
-match(Val vstr, Val vregx)
-{
- intmax_t d;
- char *anchreg, *ret, *p;
- char buf1[intlen], buf2[intlen], *str, *regx;
- regoff_t len;
- regex_t re;
- regmatch_t matches[2];
-
- str = valstr(vstr, buf1, sizeof(buf1));
- regx = valstr(vregx, buf2, sizeof(buf2));
-
- anchreg = enmalloc(3, strlen(regx) + 2);
- snprintf(anchreg, strlen(regx) + 2, "^%s", regx);
-
- enregcomp(3, &re, anchreg, 0);
- free(anchreg);
-
- if (regexec(&re, str, 2, matches, 0)) {
- regfree(&re);
- return (Val){ (re.re_nsub ? "" : NULL), 0 };
- }
-
- if (re.re_nsub) {
- regfree(&re);
- len = matches[1].rm_eo - matches[1].rm_so + 1;
- ret = enmalloc(3, len);
- strlcpy(ret, str + matches[1].rm_so, len);
- d = strtoimax(ret, &p, 10);
- if (*ret && !*p) {
- free(ret);
- return (Val){ NULL, d };
- }
- return (Val){ ret, 0 };
- }
- regfree(&re);
- return (Val){ NULL, matches[0].rm_eo - matches[0].rm_so };
-}
-
-
-
+/* retrn the type of the next token, s
+ * if it is a value, place the value in v for use by parser
+ */
static int
-valcmp(Val a, Val b)
-{
- char buf1[intlen], buf2[intlen], *astr, *bstr;
-
- astr = valstr(a, buf1, sizeof(buf1));
- bstr = valstr(b, buf2, sizeof(buf2));
-
- return strcmp(astr, bstr);
-}
-
-static char *
-valstr(Val val, char *buf, size_t bufsiz)
-{
- if (val.s)
- return val.s;
- snprintf(buf, bufsiz, "%"PRIdMAX, val.n);
- return buf;
-}
-
-static int
-lex(char *p)
+lex(char *s, Val *v)
{
intmax_t d;
- char *q, *ops = "|&=><+-*/%():";
+ char *p, *ops = "|&=><+-*/%():";
/* clean integer */
- d = strtoimax(p, &q, 10);
- if (*p && !*q) {
- lastval = (Val){ NULL, d };
+ d = strtoimax(s, &p, 10);
+ if (*s && !*p) {
+ *v = (Val){ NULL, d };
return VAL;
}
/* one-char operand */
- if (*p && !*(p + 1) && strchr(ops, *p))
- return *p;
+ if (*s && !s[1] && strchr(ops, *s))
+ return *s;
/* two-char operand */
- if (*p && *(p + 1) == '=' && !*(p + 2)) {
- switch (*p) {
- case '>':
- return GE;
- case '<':
- return LE;
- case '!':
- return NE;
- }
- }
+ if (!strcmp(s, ">=")) return GE;
+ if (!strcmp(s, "<=")) return LE;
+ if (!strcmp(s, "!=")) return NE;
/* nothing matched, treat as string */
- lastval = (Val){ p, 0 };
+ *v = (Val){ s, 0 };
return VAL;
}
+/* using shunting-yard to convert from infix to rpn
+ * https://en.wikipedia.org/wiki/Shunting-yard_algorithm
+ * instead of creating rpn output to evaluate later, evaluate it immediately as
+ * it is created
+ * vals is the value stack, valp points to one past last value on the stack
+ * ops is the operator stack, opp points to one past last op on the stack
+ */
static int
parse(char **expr, int exprlen)
{
- Val val[exprlen], *valp = val;
- int op[exprlen], *opp = op;
- int i, type, lasttype = 0;
- char prec[] = {
- [ 0 ] = 0, [VAL] = 0,
+ Val vals[exprlen], *valp = vals;
+ int ops[exprlen], *opp = ops;
+ int i, lasttype = 0;
+ char prec[] = { /* precedence of operators */
['|'] = 1,
['&'] = 2,
['='] = 3, ['>'] = 3, [GE] = 3, ['<'] = 3, [LE] = 3, [NE] = 3,
@@ -212,11 +208,12 @@ parse(char **expr, int exprlen)
};
for (i = 0; i < exprlen; i++) {
- type = lex(expr[i]);
+ Val v;
+ int type = lex(expr[i], &v);
switch (type) {
case VAL:
- *valp++ = lastval;
+ *valp++ = v;
break;
case '(':
*opp++ = '(';
@@ -224,31 +221,30 @@ parse(char **expr, int exprlen)
case ')':
if (lasttype == '(')
enprintf(2, "syntax error: empty ( )\n");
- while (opp > op && opp[-1] != '(')
- doop(op, &opp, val, &valp);
- if (opp == op)
+ while (opp > ops && opp[-1] != '(')
+ doop(ops, &opp, vals, &valp);
+ if (opp == ops)
enprintf(2, "syntax error: extra )\n");
opp--;
break;
- default :
+ default: /* operator */
if (prec[lasttype])
enprintf(2, "syntax error: extra operator\n");
- while (opp > op && prec[opp[-1]] >= prec[type])
- doop(op, &opp, val, &valp);
+ while (opp > ops && prec[opp[-1]] >= prec[type])
+ doop(ops, &opp, vals, &valp);
*opp++ = type;
break;
}
lasttype = type;
}
- while (opp > op)
- doop(op, &opp, val, &valp);
+ while (opp > ops)
+ doop(ops, &opp, vals, &valp);
- if (valp == val)
+ if (valp == vals)
enprintf(2, "syntax error: missing expression\n");
- if (valp - val > 1)
+ if (--valp != vals)
enprintf(2, "syntax error: extra expression\n");
- valp--;
if (valp->s)
printf("%s\n", valp->s);
else
@@ -257,10 +253,17 @@ parse(char **expr, int exprlen)
return (valp->s && *valp->s) || valp->n;
}
+/* the only way to get usage() is if the user didn't supply -- and expression
+ * begins with a -
+ * expr(1p): "... the conforming application must employ the -- construct ...
+ * if there is any chance the first operand might be a negative integer (or any
+ * string with a leading minus"
+ */
static void
usage(void)
{
- eprintf("usage: %s expression\n", argv0);
+ enprintf(3, "usage: %s [--] expression\n"
+ "note : the -- is mandatory if expression begins with a -\n", argv0);
}
int
@@ -269,11 +272,11 @@ main(int argc, char *argv[])
intmax_t n = INTMAX_MIN;
/* Get the maximum number of digits (+ sign) */
- for (intlen = (n < 0); n; n /= 10, ++intlen);
+ for (intlen = (n < 0); n; n /= 10, ++intlen)
+ ;
ARGBEGIN {
- default:
- usage();
+ default: usage();
} ARGEND;
return !parse(argv, argc);