sbase

suckless unix tools
git clone git://git.2f30.org/sbase
Log | Files | Refs | README | LICENSE

commit 733b33f1c770a9081c41685d931d37f8e99b33f8
parent dc70eb797610565650ed2b8d91b1ec74d0ca1197
Author: FRIGN <dev@frign.de>
Date:   Thu, 22 Jan 2015 12:32:50 +0100

Add UTF-8-delimiter-support to cut(1)

Now you can specify a multibyte-delimiter to cut, which should
definitely be possible for the end-user (Fuck POSIX).
Looking at GNU coreutils' cut(1)[0], which basically ignores the difference
between characters and bytes, ignores the -n option, and is bloated as hell,
one has to wonder why coreutils is still the default. This is insane!
Things like this personally keep me motivated to make sbase better
every day.

[0]: http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=blob;f=src/cut.c;hb=HEAD
     NSFW! You have been warned.

Diffstat:
M README | 2 +-
M cut.1  | 4 ++--
M cut.c  | 52 +++++++++++++++++++++++++++++++++++++---------------
3 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/README b/README @@ -22,7 +22,7 @@ The following tools are implemented ('*' == finished, '#' == UTF-8 support, =* comm yes none = cp no -H, -i, -L =* cron non-posix none - * cut yes none +#* cut yes none = date yes none = dirname yes none = du no -H, -L, -x diff --git a/cut.1 b/cut.1 @@ -1,4 +1,4 @@ -.Dd January 18, 2015 +.Dd January 22, 2015 .Dt CUT 1 sbase\-VERSION .Sh NAME .Nm cut @@ -67,4 +67,4 @@ utility is compliant with the specification. .Pp The possibility of separating numbers and ranges with a space -is an extension to that specification. +and specifying multibyte delimiters is an extension to that specification. diff --git a/cut.c b/cut.c @@ -4,6 +4,7 @@ #include <string.h> #include "text.h" +#include "utf.h" #include "util.h" typedef struct Range { @@ -11,11 +12,12 @@ typedef struct Range { struct Range *next; } Range; -static Range *list = NULL; -static char mode = 0; -static char delim = '\t'; -static int nflag = 0; -static int sflag = 0; +static Range *list = NULL; +static char mode = 0; +static Rune delim = '\t'; +static size_t delimlen = 1; +static int nflag = 0; +static int sflag = 0; static void insert(Range *r) @@ -70,10 +72,11 @@ static size_t seek(const char *s, size_t pos, size_t *prev, size_t count) { const char *t; - size_t n = pos - *prev; + size_t n = pos - *prev, i; + Rune r; if (mode == 'b') { - if ((t = memchr(s, 0, n))) + if ((t = memchr(s, '\0', n))) return t - s; if (nflag) while (n && !UTF8_POINT(s[n])) @@ -85,11 +88,18 @@ seek(const char *s, size_t pos, size_t *prev, size_t count) if (UTF8_POINT(*t) && !--n) break; } else { - for (t = (count < 2) ? s : s + 1; n && *t; t++) - if (*t == delim && !--n && count) + for (t = (count < delimlen + 1) ? 
s : s + delimlen; n && *t; ) { + for (i = 1; t[i]; i++) + if (fullrune(t, i)) + break; + charntorune(&r, t, i); + if (r == delim && !--n && count) break; + t += i; + } } *prev = pos; + return t - s; } @@ -106,20 +116,22 @@ cut(FILE *fp) while ((len = getline(&buf, &size, fp)) != -1) { if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; - if (mode == 'f' && !strchr(buf, delim)) { + if (mode == 'f' && !utfrune(buf, delim)) { if (!sflag) puts(buf); continue; } for (i = 0, p = 1, s = buf, r = list; r; r = r->next, s += n) { - s += seek(s, r->min, &p, i++); + s += seek(s, r->min, &p, i); + i += (mode == 'f') ? delimlen : 1; if (!*s) break; if (!r->max) { fputs(s, stdout); break; } - n = seek(s, r->max + 1, &p, i++); + n = seek(s, r->max + 1, &p, i); + i += (mode == 'f') ? delimlen : 1; if (fwrite(s, 1, n, stdout) != n) eprintf("write error:"); } @@ -139,16 +151,27 @@ int main(int argc, char *argv[]) { FILE *fp; + int i; + char *m, *d; ARGBEGIN { case 'b': case 'c': case 'f': mode = ARGC(); - parselist(ARGF()); + m = ARGF(); + if (!m) + usage(); + parselist(m); break; case 'd': - delim = *ARGF(); + if(!(d = ARGF())) + usage(); + for (i = 1; i <= strlen(d); i++) + if (fullrune(d, i)) + break; + charntorune(&delim, d, i); + delimlen = i; break; case 'n': nflag = 1; @@ -162,7 +185,6 @@ main(int argc, char *argv[]) if (!mode) usage(); - if (!argc) cut(stdin); else for (; argc--; argv++) {