sbase

suckless unix tools
git clone git://git.2f30.org/sbase
Log | Files | Refs | README | LICENSE

commit b3a63a60e4c23daf63d155c22d29cbe3f1399b6f
parent 8b3a9c197109fdfdd399c050e9cca038bfeee00b
Author: Adria Garriga <rhaps0dy@installgentoo.com>
Date:   Tue, 15 Jul 2014 00:49:42 +0200

Improved tr

- Added support for character ranges ( a-z )
- Added support for complementary charset ( -c ), only in delete mode
- Added support for octal escape sequences
- Unicode now only works when there are no octal escape sequences;
  otherwise the behavior is not predictable at first sight.
- tr now supports null characters in the input
- Does not yet have support for character classes ( [:upper:] )

Diffstat:
Mtr.1 | 13+++++++++++--
Mtr.c | 340+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
2 files changed, 273 insertions(+), 80 deletions(-)

diff --git a/tr.1 b/tr.1 @@ -3,7 +3,7 @@ tr \- translate characters .SH SYNOPSIS .B tr -.RB [ \-d ] +.RB [ \-d ] [ \-c ] .RB set1 .P .B tr @@ -13,6 +13,9 @@ tr \- translate characters .TP .B \-d For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error. +.B \-c +Complement: inverts the specified character set, that is, all characters not specified belong to it. +It only works in conjunction with \-d, because order doesn't make much sense with translation. .SH DESCRIPTION .B tr reads input from stdin replacing every character in @@ -50,9 +53,15 @@ If set1 is longer than set2 .B tr will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored. .B +Escape sequences, whether character or octal, are written by preceding the token with a "\\". An octal escape may have up to three digits; +reading stops at the first non-octal character or after three digits have been read. +.B +Use "A-B" for ordered sets from A to B. +.B .SH NOTES .B tr -is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]). +is Unicode-aware, but only if you don't specify characters in octal (for example \\012), because otherwise the behavior is not predictable. It does not support character +classes. 
.SH SEE ALSO .IR sed(1) .IR awk(1) diff --git a/tr.c b/tr.c @@ -3,7 +3,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/mman.h> #include <locale.h> #include <wchar.h> #include "text.h" @@ -12,135 +11,316 @@ static void usage(void) { - eprintf("usage: %s [-d] set1 [set2]\n", argv0); + eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0); +} + +static int dflag, cflag; +static wchar_t mappings[0x110000]; + +struct wset_state { + char *s; /* current character */ + wchar_t rfirst, rlast; /* first and last in range */ + wchar_t prev; /* previous returned character */ + int prev_was_range; /* was the previous character part of a c-c range? */ +}; + +struct set_state { + char *s, rfirst, rlast, prev; + int prev_was_octal; /* was the previous returned character written in octal? */ +}; + +static void +set_state_defaults(struct set_state *s) +{ + s->rfirst = 1; + s->rlast = 0; + s->prev_was_octal = 1; } static void -handleescapes(char *s) +wset_state_defaults(struct wset_state *s) { + s->rfirst = 1; + s->rlast = 0; + s->prev_was_range = 1; +} + +/* sets *s to the char that was intended to be written. + * returns how many bytes the s pointer has to advance to skip the + * escape sequence if it was an octal, always zero otherwise. 
*/ +static int +resolve_escape(char *s) +{ + int i; + unsigned char c; + switch(*s) { case 'n': *s = '\n'; - break; + return 0; case 't': *s = '\t'; - break; - case '\\': - *s = '\\'; - break; + return 0; case 'r': *s = '\r'; - break; + return 0; case 'f': *s = '\f'; - break; + return 0; case 'a': *s = '\a'; - break; + return 0; case 'b': *s = '\b'; - break; + return 0; case 'v': *s = '\v'; - break; + return 0; + case '\\': + *s = '\\'; + return 0; + case '\0': + eprintf("stray '\\' at end of input:"); + default: ; } + + if(*s<'0' || *s>'7') + eprintf("invalid character after '\\':"); + for(i=0, c=0; s[i]>='0' && s[i]<='7' && i<3; i++) { + c <<= 3; + c += s[i]-'0'; + } + if(*s>'3' && i==3) + eprintf("octal byte cannot be bigger than 377:"); + *s = c; + return i; } +#define embtowc(a, b) mbtowc(a, b, 4) + static int xmbtowc(wchar_t *unicodep, const char *s) { int rv; - rv = mbtowc(unicodep, s, 4); + rv = embtowc(unicodep, s); if (rv < 0) - eprintf("mbtowc:"); + eprintf("mbtowc: invalid input sequence:"); return rv; } -static void -parsemapping(const char *set1, const char *set2, wchar_t *mappings) +static int +has_octal_escapes(const char *s) { - char *s1, *s2; - wchar_t runeleft; - wchar_t runeright; - int leftbytes; - int rightbytes; - - s1 = (char *)set1; - if(set2) - s2 = (char *)set2; - else - s2 = (char *)set1; - - while(*s1) { - if(*s1 == '\\') - handleescapes(++s1); - leftbytes = xmbtowc(&runeleft, s1); - s1 += leftbytes; - if(*s2 == '\\') - handleescapes(++s2); - if(*s2 != '\0') { - rightbytes = xmbtowc(&runeright, s2); - s2 += rightbytes; + while(*s) + if(*s++ == '\\' && *s >= '0' && *s <= '7') + return 1; + return 0; +} + +static char +get_next_char(struct set_state *s) +{ + char c; + int nchars; + +start: + if(s->rfirst <= s->rlast) { + c = s->rfirst; + s->rfirst++; + return c; + } + + if(*s->s == '-' && !s->prev_was_octal) { + s->s++; + if(!*s->s) + return '-'; + if(*s->s == '\\' && (nchars = resolve_escape(++(s->s)))) + goto char_is_octal; + s->rlast = 
*(s->s)++; + if(!s->rlast) + return '\0'; + s->prev_was_octal = 1; + s->rfirst = ++(s->prev); + goto start; + } + if(*s->s == '\\' && (nchars = resolve_escape(++(s->s)))) + goto char_is_octal; + + s->prev_was_octal = 0; + c = *(s->s)++; + s->prev = c; + return c; + +char_is_octal: + s->prev_was_octal = 1; + c = *s->s; + s->s += nchars; + return c; +} + +static wchar_t +get_next_wchar(struct wset_state *s) +{ +start: + if(s->rfirst <= s->rlast) { + s->prev = s->rfirst; + s->rfirst++; + return s->prev; + } + + if(*s->s == '-' && !s->prev_was_range) { + s->s++; + if(!*s->s) + return '-'; + if(*s->s == '\\') + resolve_escape(++(s->s)); + s->s += xmbtowc(&s->rlast, s->s); + if(!s->rlast) + return '\0'; + s->rfirst = ++(s->prev); + s->prev_was_range = 1; + goto start; + } + + if(*s->s == '\\') + resolve_escape(++(s->s)); + s->s += xmbtowc(&s->prev, s->s); + s->prev_was_range = 0; + return s->prev; +} + +static int +is_mapping_wide(const char *set1, const char *set2) +{ + struct set_state ss1, ss2; + struct wset_state wss1, wss2; + wchar_t wc1, wc2, last_wc2; + + if(has_octal_escapes(set1)) { + set_state_defaults(&ss1); + ss1.s = (char *) set1; + if(set2) { + set_state_defaults(&ss2); + ss2.s = (char *) set2; + /* if the character returned is from an octal triplet, it might be null + and still need to continue */ + while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal ) { + if(!(wc2 = (unsigned char) get_next_char(&ss2))) + wc2 = last_wc2; + mappings[wc1] = wc2; + last_wc2 = wc2; + } + } else { + while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal) + mappings[wc1] = 1; } - mappings[runeleft] = runeright; + return 0; + } else { + wset_state_defaults(&wss1); + wss1.s = (char *) set1; + if(set2) { + wset_state_defaults(&wss2); + wss2.s = (char *) set2; + while((wc1 = get_next_wchar(&wss1))) { + if(!(wc2 = get_next_wchar(&wss2))) + wc2 = last_wc2; + mappings[wc1] = wc2; + last_wc2 = wc2; + } + } else { + while((wc1 = 
get_next_wchar(&wss1))) + mappings[wc1] = 1; + } + return 1; } + return 0; /* unreachable */ } static void -maptonull(const wchar_t *mappings, char *in) +wmap_null(char *in, ssize_t nbytes) { - const char *s; - wchar_t runeleft; - int leftbytes = 0; + char *s; + wchar_t rune; + int parsed_bytes = 0; s = in; - while(*s) { - leftbytes = xmbtowc(&runeleft, s); - if(!mappings[runeleft]) - putwchar(runeleft); - s += leftbytes; + while(nbytes) { + parsed_bytes = embtowc(&rune, s); + if(parsed_bytes < 0) { + rune = *s; + parsed_bytes = 1; + } + if(((!mappings[rune])&1) ^ cflag) + putwchar(rune); + s += parsed_bytes; + nbytes -= parsed_bytes; } } static void -maptoset(const wchar_t *mappings, char *in) +wmap_set(char *in, ssize_t nbytes) { - const char *s; - wchar_t runeleft; - int leftbytes = 0; + char *s; + wchar_t rune; + int parsed_bytes = 0; s = in; - while(*s) { - leftbytes = xmbtowc(&runeleft, s); - if(!mappings[runeleft]) - putwchar(runeleft); + while(nbytes) { + parsed_bytes = embtowc(&rune, s); + if(parsed_bytes < 0) { + rune = *s; + parsed_bytes = 1; + } + if(!mappings[rune]) + putwchar(rune); else - putwchar(mappings[runeleft]); - s += leftbytes; + putwchar(mappings[rune]); + nbytes -= parsed_bytes; + s += parsed_bytes; } } +static void +map_null(char *in, ssize_t nbytes) +{ + char *s; + + for(s=in; nbytes; s++, nbytes--) + if(((!mappings[(unsigned char)*s])&1) ^ cflag) + putchar(*s); +} + +static void +map_set(char *in, ssize_t nbytes) +{ + char *s; + + for(s=in; nbytes; s++, nbytes--) + if(!mappings[(unsigned char)*s]) + putchar(*s); + else + putchar(mappings[(unsigned char)*s]); +} + int main(int argc, char *argv[]) { - wchar_t *mappings; char *buf = NULL; size_t size = 0; - void (*mapfunc)(const wchar_t*, char*); - int dflag = 0; + ssize_t nbytes; + void (*mapfunc)(char*, ssize_t); setlocale(LC_ALL, ""); - - mappings = mmap(NULL, 0x110000 * sizeof(wchar_t), - PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); - if (mappings == MAP_FAILED) - 
eprintf("mmap:"); + dflag = cflag = 0; ARGBEGIN { case 'd': dflag = 1; break; + case 'c': + cflag = 1; + break; default: usage(); } ARGEND; @@ -148,25 +328,29 @@ main(int argc, char *argv[]) if(argc == 0) usage(); - if(dflag || argc == 1) { + if(dflag) { if(argc != 1) usage(); - parsemapping(argv[0], NULL, mappings); - mapfunc = maptonull; + if(is_mapping_wide(argv[0], NULL)) + mapfunc = wmap_null; + else + mapfunc = map_null; + } else if(cflag) { + usage(); + } else if(argc == 2) { + if(is_mapping_wide(argv[0], argv[1])) + mapfunc = wmap_set; + else + mapfunc = map_set; } else { - if(argc != 2) - usage(); - parsemapping(argv[0], argv[1], mappings); - mapfunc = maptoset; + usage(); } - while(agetline(&buf, &size, stdin) != -1) - mapfunc(mappings, buf); + while((nbytes = agetline(&buf, &size, stdin)) != -1) + mapfunc(buf, nbytes); free(buf); if(ferror(stdin)) eprintf("<stdin>: read error:"); - munmap(mappings, 0x110000 * sizeof(wchar_t)); - return EXIT_SUCCESS; }