Re-organize the lexer - iris - small scheme interpreter

commit 957be47ca3edb0e5d2365360a479d91190db0fc2
parent 60d4c2eb9824194b7c1b4a4dda537080f6e6ad25
Author: sin <sin@2f30.org>
Date:   Sun, 11 May 2014 14:39:38 +0100

Re-organize the lexer

Still some things left to do.

Diffstat:
M lexer.c  | 203 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
M lexer.h  | 19 +++++++------------
M repl.c  | 45 +++++++++------------------------------------

3 files changed, 114 insertions(+), 153 deletions(-)
diff --git a/lexer.c b/lexer.c
@@ -1,5 +1,6 @@
 /* See LICENSE file for copyright and license details. */
 #include <ctype.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "lexer.h"
@@ -23,14 +24,14 @@ enum state {
 	State_Illegal_Input,
 };
 
-struct lexerctx {
-	enum state state;
-	int ready;
-	const char *s;
-	const char *e;
-};
+#define MAXTOKSIZE 256
 
-int
+static enum state state;
+static char *s;
+static char *e;
+static char buf[MAXTOKSIZE];
+
+static int
 delim(int c)
 {
 	return (c == ' ' || c == '\t' || c == '\n' ||
@@ -48,185 +49,192 @@ extractstring(struct tok *t)
 	return strndup(t->s, t->e - t->s);
 }
 
-void
-freelexer(struct lexerctx *ctx)
-{
-	free(ctx);
-}
-
 struct tok
-gettok(struct lexerctx *ctx, const char *buf, size_t len)
+gettok(FILE *in)
 {
 	struct tok tok;
 	char *tmp;
+	int c;
 
-	if (ctx->ready == 0) {
-		ctx->s = buf;
-		ctx->e = buf;
-		ctx->ready = 1;
-	}
-	ctx->state = State_Se;
-	while (ctx->e < &buf[len]) {
-		switch (ctx->state) {
+again:
+	state = State_Se;
+	s = e = buf;
+	while ((c = getc(in)) != EOF) {
+		*e = c;
+		switch (state) {
 		case State_Se:
-			if (*ctx->e == ' ' || *ctx->e == '\t' ||
-			    *ctx->e == '\n')
-				break;
-			if (isalpha(*ctx->e) != 0)
-				ctx->state = State_Identifier;
-			else if (*ctx->e == '#')
-				ctx->state = State_Probable_Boolean;
-			else if (isdigit(*ctx->e) != 0)
-				ctx->state = State_Number;
-			else if (*ctx->e == '-' || *ctx->e == '+')
-				ctx->state = State_Signed_Number;
-			else if (*ctx->e == '"')
-				ctx->state = State_Probable_String;
-			else if (*ctx->e == '(')
-				ctx->state = State_Lparen;
-			else if (*ctx->e == ')')
-				ctx->state = State_Rparen;
-			else if (*ctx->e == '\'')
-				ctx->state = State_Quote;
-			else if (*ctx->e == '.')
-				ctx->state = State_Dot;
-			else if (*ctx->e == ';')
-				ctx->state = State_Comment;
+			if (*e == ' ' || *e == '\t' || *e == '\n')
+				goto again;
+			if (isalpha(*e) != 0)
+				state = State_Identifier;
+			else if (*e == '#')
+				state = State_Probable_Boolean;
+			else if (isdigit(*e) != 0)
+				state = State_Number;
+			else if (*e == '-' || *e == '+')
+				state = State_Signed_Number;
+			else if (*e == '"')
+				state = State_Probable_String;
+			else if (*e == '(')
+				state = State_Lparen;
+			else if (*e == ')')
+				state = State_Rparen;
+			else if (*e == '\'')
+				state = State_Quote;
+			else if (*e == '.')
+				state = State_Dot;
+			else if (*e == ';')
+				state = State_Comment;
 			else
-				ctx->state = State_Illegal_Input;
-			if (ctx->state != State_Se)
-				ctx->s = ctx->e;
+				state = State_Illegal_Input;
 			break;
 		case State_Identifier:
-			if (delim(*ctx->e) != 0) {
+			if (delim(*e) != 0) {
 				tok.type = Identifier;
-				tok.s = ctx->s;
-				tok.e = ctx->e;
+				tok.s = s;
+				tok.e = e;
+				ungetc(*e, in);
 				return tok;
 			}
-			if (isalnum(*ctx->e) == 0) {
+			if (isalnum(*e) == 0) {
 				tok.type = Error;
 				tok.s = "malformed identifier";
 				tok.e = NULL;
+				ungetc(*e, in);
 				return tok;
 			}
 			break;
 		case State_Probable_Boolean:
-			if (*ctx->e == 'f' || *ctx->e == 't')
-				ctx->state = State_Boolean;
-			else if (*ctx->e == '\\')
-				ctx->state = State_Probable_Character;
+			if (*e == 'f' || *e == 't')
+				state = State_Boolean;
+			else if (*e == '\\')
+				state = State_Probable_Character;
 			else {
 				tok.type = Error;
 				tok.s = "not a boolean or a character";
 				tok.e = NULL;
+				ungetc(*e, in);
 				return tok;
 			}
 			break;
 		case State_Boolean:
-			if (delim(*ctx->e) != 0) {
+			if (delim(*e) != 0) {
 				tok.type = Boolean;
-				tok.s = ctx->s;
-				tok.e = ctx->e;
+				tok.s = s;
+				tok.e = e;
+				ungetc(*e, in);
 				return tok;
 			}
 			tok.type = Error;
 			tok.s = "missing delimiter after boolean";
 			tok.e = NULL;
+			ungetc(*e, in);
 			return tok;
 		case State_Number:
-			if (delim(*ctx->e) != 0) {
+			if (delim(*e) != 0) {
 				tok.type = Number;
-				tok.s = ctx->s;
-				tok.e = ctx->e;
+				tok.s = s;
+				tok.e = e;
+				ungetc(*e, in);
 				return tok;
 			}
-			if (isdigit(*ctx->e) == 0) {
+			if (isdigit(*e) == 0) {
 				tok.type = Error;
 				tok.s = "not a number";
 				tok.e = NULL;
+				ungetc(*e, in);
 				return tok;
 			}
 			break;
 		case State_Signed_Number:
-			if (isdigit(*ctx->e) == 0) {
+			if (isdigit(*e) == 0) {
 				tok.type = Error;
 				tok.s = "not a number";
 				tok.e = NULL;
+				ungetc(*e, in);
 				return tok;
 			}
-			ctx->state = State_Number;
+			state = State_Number;
 			break;
 		case State_Probable_Character:
-			if (isalpha(*ctx->e) != 0) {
-				ctx->state = State_Character;
+			if (isalpha(*e) != 0) {
+				state = State_Character;
 			} else {
 				tok.type = Error;
 				tok.s = "expected character constant";
 				tok.e = NULL;
+				ungetc(*e, in);
 				return tok;
 			}
 			break;
 		case State_Character:
-			if (delim(*ctx->e) != 0) {
+			if (delim(*e) != 0) {
 				tok.type = Character;
-				tok.s = ctx->s;
-				tok.e = ctx->e;
+				tok.s = s;
+				tok.e = e;
+				ungetc(*e, in);
 				return tok;
 			}
 			tok.type = Error;
 			tok.s = "missing delimiter after character constant";
 			tok.e = NULL;
+			ungetc(*e, in);
 			return tok;
-			break;
 		case State_Probable_String:
-			if (*ctx->e == '"')
-				ctx->state = String;
+			if (*e == '"')
+				state = String;
 			break;
 		case State_String:
 			tok.type = String;
-			tok.s = ctx->s;
-			tok.e = ctx->e;
+			tok.s = s;
+			tok.e = e;
+			ungetc(*e, in);
 			return tok;
 		case State_Lparen:
 			tok.type = Lparen;
-			tok.s = ctx->s;
-			tok.e = ctx->e;
+			tok.s = s;
+			tok.e = e;
+			ungetc(*e, in);
 			return tok;
 		case State_Rparen:
 			tok.type = Rparen;
-			tok.s = ctx->s;
-			tok.e = ctx->e;
+			tok.s = s;
+			tok.e = e;
+			ungetc(*e, in);
 			return tok;
 		case State_Quote:
 			tok.type = Quote;
-			tok.s = ctx->s;
-			tok.e = ctx->e;
+			tok.s = s;
+			tok.e = e;
+			ungetc(*e, in);
 			return tok;
 		case State_Dot:
 			tok.type = Dot;
-			tok.s = ctx->s;
-			tok.e = ctx->e;
+			tok.s = s;
+			tok.e = e;
+			ungetc(*e, in);
 			return tok;
 		case State_Comment:
-			tmp = strchr(ctx->s, '\n');
+			tmp = strchr(s, '\n');
 			if (tmp) {
-				ctx->state = State_Se;
-				ctx->s = tmp;
+				state = State_Se;
+				s = tmp;
 			}
 			break;
 		case State_Illegal_Input:
 			tok.type = Error;
 			tok.s = "illegal input";
 			tok.e = NULL;
+			ungetc(*e, in);
 			return tok;
 		default:
 			tok.type = Error;
 			tok.s = "internal lex error";
 			tok.e = NULL;
+			ungetc(*e, in);
 			return tok;
 		}
-		ctx->e++;
+		e++;
 	}
 	tok.type = Eof;
 	tok.s = "reached the end-of-file";
@@ -234,24 +242,9 @@ gettok(struct lexerctx *ctx, const char *buf, size_t len)
 	return tok;
 }
 
-struct lexerctx *
+int
 initlexer(void)
 {
-	struct lexerctx *ctx;
-
-	ctx = malloc(sizeof(*ctx));
-	if (!ctx)
-		return NULL;
-	ctx->state = State_Se;
-	ctx->ready = 0;
-	ctx->s = NULL;
-	ctx->e = NULL;
-	return ctx;
-}
-
-void
-resetlexer(struct lexerctx *ctx)
-{
-	ctx->ready = 0;
-	ctx->state = State_Se;
+	state = State_Se;
+	return 0;
 }
diff --git a/lexer.h b/lexer.h
@@ -1,5 +1,5 @@
 /* See LICENSE file for copyright and license details. */
-typedef enum {
+enum toktype {
 	Error = -2,
 	Eof = -1,
 	Identifier,
@@ -11,19 +11,14 @@ typedef enum {
 	Rparen,
 	Quote,
 	Dot
-} toktype;
+};
 
 struct tok {
-	toktype type;
-	const char *s;
-	const char *e;
+	enum toktype type;
+	char *s;
+	char *e;
 };
 
-struct lexerctx;
-
-int delim(int);
 char *extractstring(struct tok *);
-void freelexer(struct lexerctx *ctx);
-struct tok gettok(struct lexerctx *, const char *, size_t);
-struct lexerctx *initlexer(void);
-void resetlexer(struct lexerctx *);
+struct tok gettok(FILE *fp);
+int initlexer(void);
diff --git a/repl.c b/repl.c
@@ -1,52 +1,25 @@
 /* See LICENSE file for copyright and license details. */
 #include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
 #include "lexer.h"
 #include "debug.h"
-#include "util.h"
 
 static void
-lexertest(char *buf)
+lexertest(FILE *in)
 {
 	struct tok tok;
-	struct lexerctx *ctx;
 
-	ctx = initlexer();
-	if (ctx) {
-		resetlexer(ctx);
-		do {
-			tok = gettok(ctx, buf, strlen(buf));
-			printtok(&tok);
-			putchar('\n');
-		} while (tok.type != Eof && tok.type != Error);
-		freelexer(ctx);
-	}
+	initlexer();
+	do {
+		tok = gettok(in);
+		printtok(&tok);
+		putchar('\n');
+		fflush(stdout);
+	} while (tok.type != Eof && tok.type != Error);
 }
 
 int
 main(void)
 {
-	char *buf = NULL;
-	size_t sz = 0;
-	int interactive = isatty(fileno(stdin));
-
-	if (interactive == 1)
-		puts("Welcome to iris, use ^C to quit");
-	do {
-		if (interactive == 1) {
-			printf("> ");
-			fflush(stdout);
-		}
-		if (afgets(&buf, &sz, stdin))
-			lexertest(buf);
-		if (ferror(stdin)) {
-			fprintf(stderr, "input error\n");
-			return EXIT_FAILURE;
-		}
-		fflush(stdout);
-	} while (interactive == 1);
-	free(buf);
+	lexertest(stdin);
 	return 0;
 }

	iris small scheme interpreter
	git clone git://git.2f30.org/iris
	Log | Files | Refs | LICENSE

M	lexer.c	|	203	++++++++++++++++++++++++++++++++++++++-----------------------------------------
M	lexer.h	|	19	+++++++------------
M	repl.c	|	45	+++++++++------------------------------------