commit cd0b771cbbb56096335c1f2d8a0c953d46b0d430
parent f83d7bc647a99405e919ba27416a56d8b2d0617a
Author: Wolfgang Corcoran-Mathe <first.lord.of.teal@gmail.com>
Date: Mon, 20 Apr 2015 11:23:20 +0100
Add join(1)
Diffstat:
M | LICENSE | | | 1 | + |
M | Makefile | | | 1 | + |
M | README | | | 1 | + |
M | TODO | | | 1 | - |
A | join.1 | | | 105 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | join.c | | | 554 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
6 files changed, 662 insertions(+), 1 deletion(-)
diff --git a/LICENSE b/LICENSE
@@ -58,3 +58,4 @@ Authors/contributors include:
© 2015 Tai Chi Minh Ralph Eastwood <tcmreastwood@gmail.com>
© 2015 Quentin Rameau <quinq@quinq.eu.org>
© 2015 Dionysis Grigoropoulos <info@erethon.com>
+© 2015 Wolfgang Corcoran-Mathe <first.lord.of.teal@gmail.com>
diff --git a/Makefile b/Makefile
@@ -99,6 +99,7 @@ BIN =\
fold\
grep\
head\
+ join\
hostname\
kill\
link\
diff --git a/README b/README
@@ -40,6 +40,7 @@ The following tools are implemented:
=* o grep .
=*|o head .
=*|x hostname .
+=* o join .
=*|o kill .
=*|o link .
=*|o ln .
diff --git a/TODO b/TODO
@@ -10,7 +10,6 @@ diff
ed
getconf
install
-join
od
patch
pathchk
diff --git a/join.1 b/join.1
@@ -0,0 +1,105 @@
+.Dd April 18, 2015
+.Dt JOIN 1
+.Os sbase
+.Sh NAME
+.Nm join
+.Nd relational database operator
+.Sh SYNOPSIS
+.Nm
+.Op Fl 1 Ar field
+.Op Fl 2 Ar field
+.Op Fl o Ar list
+.Op Fl e Ar string
+.Op Fl a Ar fileno | Fl v Ar fileno
+.Op Fl t Ar delim
+.Ar file1 file2
+.Sh DESCRIPTION
+.Nm
+lines from
+.Ar file1
+and
+.Ar file2
+on a matching field. If one of the input files is '-', standard input
+is read for that file.
+.Pp
+Files are read sequentially and are assumed to be sorted on the join
+field.
+.Nm
+does not check the order of input, and joining two unsorted files will
+produce unexpected output.
+.Pp
+By default, input lines are matched on the first blank-separated
+field; output lines are space-separated and consist of the join field
+followed by the remaining fields from
+.Ar file1 Ns ,
+then the remaining fields from
+.Ar file2 Ns .
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl 1 Ar field
+Join on the
+.Ar field Ns eth
+field of file 1.
+.It Fl 2 Ar field
+Join on the
+.Ar field Ns eth
+field of file 2.
+.It Fl a Ar fileno
+Print unpairable lines from file
+.Ar fileno
+in addition to normal output.
+.It Fl e Ar string
+When used with
+.Fl o Ns ,
+replace empty fields in the output list with
+.Ar string Ns .
+.It Fl o Ar list
+Format output according to the string
+.Ar list Ns .
+Each element of
+.Ar list
+may be either
+.Ar fileno.field
+or 0 (representing the join field).
+Elements in
+.Ar list
+may be separated by blanks or commas. For example,
+.Bd -literal -offset indent
+join -o "0 2.1 1.3"
+.Ed
+.Pp
+would print the join field, the first field of
+.Ar file2 Ns ,
+then the third field of
+.Ar file1 Ns .
+.Pp
+Only paired lines are formatted with the
+.Fl o
+option. Unpairable lines (selected with
+.Fl a
+or
+.Fl v Ns )
+are printed raw.
+.It Fl t Ar delim
+Use the arbitrary string
+.Ar delim
+as field delimiter for both input and output.
+.It Fl v Ar fileno
+Print unpairable lines from file
+.Ar fileno
+instead of normal output.
+.El
+.Sh STANDARDS
+The
+.Nm
+utility is compliant with the
+.St -p1003.1-2013
+specification with the following exeption:
+.Bl -bullet -offset indent
+.It
+Unpairable lines ignore formatting specified with
+.Fl o Ns .
+.El
+.Pp
+The possibility of specifying multibyte delimiters of arbitrary
+length is an extension to the specification.
diff --git a/join.c b/join.c
@@ -0,0 +1,554 @@
+#include <ctype.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "arg.h"
+#include "text.h"
+#include "utf.h"
+#include "util.h"
+
+enum {
+ INIT = 1,
+ GROW = 2,
+};
+
+enum {
+ EXPAND = 0,
+ RESET = 1,
+};
+
+enum { FIELD_ERROR = -2, };
+
+struct field {
+ char *s;
+ size_t len;
+};
+
+struct line {
+ char *text;
+ size_t nf;
+ size_t maxf;
+ struct field *fields;
+};
+
+struct spec {
+ size_t fileno;
+ size_t fldno;
+};
+
+struct outlist {
+ size_t ns;
+ size_t maxs;
+ struct spec **specs;
+};
+
+struct span {
+ size_t nl;
+ size_t maxl;
+ struct line **lines;
+};
+
+static char *sep = NULL;
+static char *replace = NULL;
+static const char defaultofs = ' ';
+static const int jfield = 1; /* POSIX default join field */
+static int unpairsa = 0, unpairsb = 0;
+static int oflag = 0;
+static int pairs = 1;
+static size_t seplen;
+static struct outlist output;
+
+char *argv0;
+
+
+static void
+usage(void)
+{
+ eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] "
+ "[-a | -v fileno] [-t delim] file1 file2\n", argv0);
+}
+
+static void
+prfield(struct field *fp)
+{
+ if (fwrite(fp->s, 1, fp->len, stdout) != fp->len)
+ eprintf("fwrite:");
+}
+
+static void
+prsep(void)
+{
+ if (sep)
+ fwrite(sep, 1, seplen, stdout);
+ else
+ putchar(defaultofs);
+}
+
+static void
+swaplines(struct line *la, struct line *lb)
+{
+ struct line tmp;
+
+ tmp = *la;
+ *la = *lb;
+ *lb = tmp;
+}
+
+static void
+prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb)
+{
+ struct spec *sp;
+ struct field *joinfield;
+ size_t i;
+
+ if (jfa >= la->nf || jfb >= lb->nf)
+ return;
+
+ joinfield = &la->fields[jfa];
+
+ if (oflag) {
+ for (i = 0; i < output.ns; i++) {
+ sp = output.specs[i];
+
+ if (sp->fileno == 1) {
+ if (sp->fldno < la->nf)
+ prfield(&la->fields[sp->fldno]);
+ else if (replace)
+ fputs(replace, stdout);
+ } else if (sp->fileno == 2) {
+ if (sp->fldno < lb->nf)
+ prfield(&lb->fields[sp->fldno]);
+ else if (replace)
+ fputs(replace, stdout);
+ } else if (sp->fileno == 0) {
+ prfield(joinfield);
+ }
+
+ if (i < output.ns - 1)
+ prsep();
+ }
+ } else {
+ prfield(joinfield);
+ prsep();
+
+ for (i = 0; i < la->nf; i++) {
+ if (i != jfa) {
+ prfield(&la->fields[i]);
+ prsep();
+ }
+ }
+ for (i = 0; i < lb->nf; i++) {
+ if (i != jfb) {
+ prfield(&lb->fields[i]);
+ if (i < la->nf - 1)
+ prsep();
+ }
+ }
+ }
+
+ putchar('\n');
+}
+
+static void
+prline(struct line *lp)
+{
+ size_t len = strlen(lp->text);
+
+ if (fwrite(lp->text, 1, len, stdout) != len)
+ eprintf("fwrite:");
+
+ putchar('\n');
+}
+
+static int
+linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb)
+{
+ int status;
+
+ /* return FIELD_ERROR if both lines are short */
+ if (jfa >= la->nf) {
+ status = jfb >= lb->nf ? FIELD_ERROR : -1;
+ } else if (jfb >= lb->nf) {
+ status = 1;
+ } else {
+ status = memcmp(la->fields[jfa].s, lb->fields[jfb].s,
+ MAX (la->fields[jfa].len, lb->fields[jfb].len));
+ if (status > 0)
+ status = 1;
+ else if (status < 0)
+ status = -1;
+ }
+
+ return status;
+}
+
+static void
+addfield(struct line *lp, char *sp, size_t len)
+{
+ if (lp->nf >= lp->maxf) {
+ lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
+ sizeof(struct field));
+ lp->maxf *= GROW;
+ }
+ lp->fields[lp->nf].s = sp;
+ lp->fields[lp->nf].len = len;
+ lp->nf++;
+}
+
+static void
+prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb)
+{
+ size_t i, j;
+
+ for (i = 0; i < (spa->nl - 1); i++)
+ for (j = 0; j < (spb->nl - 1); j++)
+ prjoin(spa->lines[i], spb->lines[j], jfa, jfb);
+}
+
+static struct line *
+makeline(char *s, size_t len)
+{
+ struct line *lp;
+ char *sp, *beg, *end;
+ size_t i;
+ int eol = 0;
+
+ if (s[len-1] == '\n')
+ s[len-1] = '\0';
+
+ lp = ereallocarray(NULL, INIT, sizeof(struct line));
+ lp->text = s;
+ lp->fields = ereallocarray(NULL, INIT, sizeof(struct field));
+ lp->nf = 0;
+ lp->maxf = INIT;
+
+ for (sp = lp->text; isblank(*sp); sp++)
+ ;
+
+ while (!eol) {
+ beg = sp;
+
+ if (sep) {
+ if (!(end = utfutf(sp, sep)))
+ eol = 1;
+
+ if (!eol) {
+ addfield(lp, beg, end - beg);
+ for (i = 0; i < seplen; i++)
+ end++;
+ }
+ } else {
+ for (end = sp; !(isblank(*end)); end++) {
+ if (*end == '\0') {
+ eol = 1;
+ break;
+ }
+ }
+
+ if (!eol)
+ addfield(lp, beg, end - beg);
+ while (isblank(*++end))
+ ;
+ }
+
+ if (eol)
+ addfield(lp, beg, strlen(sp));
+
+ sp = end;
+ }
+
+ return lp;
+}
+
+static int
+addtospan(struct span *sp, FILE *fp, int reset)
+{
+ char *newl = NULL;
+ size_t len, size = 0;
+
+ if ((len = getline(&newl, &size, fp)) == -1) {
+ if (ferror(fp))
+ eprintf("getline:");
+ else
+ return 0;
+ }
+
+ if (reset)
+ sp->nl = 0;
+
+ if (sp->nl >= sp->maxl) {
+ sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
+ sizeof(struct line *));
+ sp->maxl *= GROW;
+ }
+
+ sp->lines[sp->nl] = makeline(newl, len);
+ sp->nl++;
+ return 1;
+}
+
+static void
+initspan(struct span *sp)
+{
+ sp->nl = 0;
+ sp->maxl = INIT;
+ sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));;
+}
+
+static void
+freespan(struct span *sp)
+{
+ size_t i;
+
+ for (i = 0; i < sp->nl; i++) {
+ free(sp->lines[i]->fields);
+ free(sp->lines[i]->text);
+ }
+
+ free(sp->lines);
+}
+
+static void
+initolist(struct outlist *olp)
+{
+ olp->ns = 0;
+ olp->maxs = 1;
+ olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *));
+}
+
+static void
+addspec(struct outlist *olp, struct spec *sp)
+{
+ if (olp->ns >= olp->maxs) {
+ olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs),
+ sizeof(struct spec *));
+ olp->maxs *= GROW;
+ }
+ olp->specs[olp->ns] = sp;
+ olp->ns++;
+}
+
+static struct spec *
+makespec(char *s)
+{
+ struct spec *sp;
+ int fileno;
+ size_t fldno;
+
+ switch (s[0]) {
+ case '0': /* join field */
+ fileno = 0;
+ fldno = 0;
+ break;
+ case '1': case '2':
+ if (sscanf(s, "%d.%zu", &fileno, &fldno) != 2)
+ eprintf("\"%s\": invalid format\n", s);
+ fldno--; /* ugly */
+ break;
+ default:
+ eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]);
+ break;
+ }
+
+ sp = ereallocarray(NULL, INIT, sizeof(struct spec));
+ sp->fileno = fileno;
+ sp->fldno = fldno;
+ return sp;
+}
+
+static void
+makeolist(struct outlist *olp, char *s)
+{
+ char *item, *sp;
+ sp = s;
+
+ while (sp) {
+ item = sp;
+ sp = strpbrk(sp, ", \t");
+ if (sp)
+ *sp++ = '\0';
+ addspec(olp, makespec(item));
+ }
+}
+
+static void
+freespecs(struct outlist *olp)
+{
+ size_t i;
+
+ for (i = 0; i < olp->ns; i++)
+ free(olp->specs[i]);
+}
+
+static void
+join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
+{
+ struct span spa, spb;
+ int cmp, eofa, eofb;
+
+ initspan(&spa);
+ initspan(&spb);
+ cmp = eofa = eofb = 0;
+
+ addtospan(&spa, fa, RESET);
+ addtospan(&spb, fb, RESET);
+
+ while (spa.nl && spb.nl) {
+ if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
+ if (unpairsa)
+ prline(spa.lines[0]);
+ if (!addtospan(&spa, fa, RESET)) {
+ if (unpairsb) { /* a is EOF'd; print the rest of b */
+ do
+ prline(spb.lines[0]);
+ while (addtospan(&spb, fb, RESET));
+ }
+ eofa = eofb = 1;
+ } else {
+ continue;
+ }
+ } else if (cmp > 0) {
+ if (unpairsb)
+ prline(spb.lines[0]);
+ if (!addtospan(&spb, fb, RESET)) {
+ if (unpairsa) { /* b is EOF'd; print the rest of a */
+ do
+ prline(spa.lines[0]);
+ while (addtospan(&spa, fa, RESET));
+ }
+ eofa = eofb = 1;
+ } else {
+ continue;
+ }
+ } else if (cmp == 0) {
+ /* read all consecutive matching lines from a */
+ do {
+ if (!addtospan(&spa, fa, EXPAND)) {
+ eofa = 1;
+ spa.nl++;
+ break;
+ }
+ } while (linecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0);
+
+ /* read all consecutive matching lines from b */
+ do {
+ if (!addtospan(&spb, fb, EXPAND)) {
+ eofb = 1;
+ spb.nl++;
+ break;
+ }
+ } while (linecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0);
+
+ if (pairs)
+ prspanjoin(&spa, &spb, jfa, jfb);
+
+ } else { /* FIELD_ERROR: both lines lacked join fields */
+ if (unpairsa)
+ prline(spa.lines[0]);
+ if (unpairsb)
+ prline(spb.lines[0]);
+ eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
+ eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
+ if (!eofa && !eofb)
+ continue;
+ }
+
+ if (eofa) {
+ spa.nl = 0;
+ } else {
+ swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */
+ spa.nl = 1;
+ }
+
+ if (eofb) {
+ spb.nl = 0;
+ } else {
+ swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */
+ spb.nl = 1;
+ }
+ }
+ freespan(&spa);
+ freespan(&spb);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ size_t jf[2] = { jfield, jfield, };
+ FILE *fp[2];
+ int n;
+ char *fno;
+
+ ARGBEGIN {
+ case '1':
+ jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
+ break;
+ case '2':
+ jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
+ break;
+ case 'a':
+ fno = EARGF(usage());
+ if (strcmp(fno, "1") == 0)
+ unpairsa = 1;
+ else if (strcmp(fno, "2") == 0)
+ unpairsb = 1;
+ else
+ usage();
+ break;
+ case 'e':
+ replace = EARGF(usage());
+ break;
+ case 'o':
+ oflag = 1;
+ initolist(&output);
+ makeolist(&output, EARGF(usage()));
+ break;
+ case 't':
+ sep = EARGF(usage());
+ break;
+ case 'v':
+ pairs = 0;
+ fno = EARGF(usage());
+ if (strcmp(fno, "1") == 0)
+ unpairsa = 1;
+ else if (strcmp(fno, "2") == 0)
+ unpairsb = 1;
+ else
+ usage();
+ break;
+ default:
+ usage();
+ } ARGEND;
+
+ if (sep)
+ seplen = unescape(sep);
+
+ if (argc != 2)
+ usage();
+
+ for (n = 0; n < 2; n++) {
+ if (argv[n][0] == '-' && !argv[n][1]) {
+ argv[n] = "<stdin>";
+ fp[n] = stdin;
+ } else if (!(fp[n] = fopen(argv[n], "r"))) {
+ eprintf("fopen %s:", argv[n]);
+ }
+ }
+
+ jf[0]--;
+ jf[1]--;
+
+ join(fp[0], fp[1], jf[0], jf[1]);
+
+ if (oflag)
+ freespecs(&output);
+
+ enfshut(2, fp[0], argv[0]);
+ if (fp[0] != fp[1])
+ enfshut(2, fp[1], argv[1]);
+ enfshut(2, stdout, "<stdout>");
+ exit(0);
+}