sort.c (9405B)
1 /* See LICENSE file for copyright and license details. */ 2 #include <ctype.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "queue.h" 8 #include "text.h" 9 #include "utf.h" 10 #include "util.h" 11 12 struct keydef { 13 int start_column; 14 int end_column; 15 int start_char; 16 int end_char; 17 int flags; 18 TAILQ_ENTRY(keydef) entry; 19 }; 20 21 struct column { 22 struct line line; 23 size_t cap; 24 }; 25 26 enum { 27 MOD_N = 1 << 0, 28 MOD_STARTB = 1 << 1, 29 MOD_ENDB = 1 << 2, 30 MOD_R = 1 << 3, 31 MOD_D = 1 << 4, 32 MOD_F = 1 << 5, 33 MOD_I = 1 << 6, 34 }; 35 36 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead); 37 38 static int Cflag = 0, cflag = 0, uflag = 0; 39 static char *fieldsep = NULL; 40 static size_t fieldseplen = 0; 41 static struct column col1, col2; 42 43 static void 44 skipblank(struct line *a) 45 { 46 while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) { 47 a->data++; 48 a->len--; 49 } 50 } 51 52 static void 53 skipnonblank(struct line *a) 54 { 55 while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' && 56 *(a->data) != '\t')) { 57 a->data++; 58 a->len--; 59 } 60 } 61 62 static void 63 skipcolumn(struct line *a, int skip_to_next_col) 64 { 65 char *s; 66 67 if (fieldsep) { 68 if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) { 69 if (skip_to_next_col) { 70 s += fieldseplen; 71 a->data = s; 72 a->len = a->len - (s - a->data); 73 } 74 } else { 75 a->data += a->len - 1; 76 a->len = 1; 77 } 78 } else { 79 skipblank(a); 80 skipnonblank(a); 81 } 82 } 83 84 static void 85 columns(struct line *line, const struct keydef *kd, struct column *col) 86 { 87 Rune r; 88 struct line start, end; 89 size_t utflen, rlen; 90 int i; 91 92 start.data = line->data; 93 start.len = line->len; 94 for (i = 1; i < kd->start_column; i++) 95 skipcolumn(&start, 1); 96 if (kd->flags & MOD_STARTB) 97 skipblank(&start); 98 for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) { 99 rlen = chartorune(&r, start.data); 100 start.data += rlen; 101 start.len -= rlen; 102 utflen++; 103 } 104 105 end.data = line->data; 106 end.len = line->len; 107 if (kd->end_column) { 108 for (i = 1; i < kd->end_column; i++) 109 skipcolumn(&end, 1); 110 if (kd->flags & MOD_ENDB) 111 skipblank(&end); 112 if (kd->end_char) { 113 for (utflen = 0; end.len > 1 && utflen < kd->end_char;) { 114 rlen = chartorune(&r, end.data); 115 end.data += rlen; 116 end.len -= rlen; 117 utflen++; 118 } 119 } else { 120 skipcolumn(&end, 0); 121 } 122 } else { 123 end.data += end.len - 1; 124 end.len = 1; 125 } 126 col->line.len = MAX(0, end.data - start.data); 127 if (!(col->line.data) || col->cap < col->line.len + 1) { 128 free(col->line.data); 129 col->line.data = emalloc(col->line.len + 1); 130 } 131 memcpy(col->line.data, start.data, col->line.len); 132 col->line.data[col->line.len] = '\0'; 133 } 134 135 static int 136 skipmodcmp(struct line *a, struct line *b, int flags) 137 { 138 Rune r1, r2; 139 size_t offa = 0, offb = 0; 140 141 do { 142 offa += chartorune(&r1, a->data + offa); 143 offb += chartorune(&r2, b->data + offb); 144 145 if (flags & MOD_D && flags & MOD_I) { 146 while (offa < a->len && ((!isblankrune(r1) && 147 !isalnumrune(r1)) || (!isprintrune(r1)))) 148 offa += chartorune(&r1, a->data + offa); 149 while (offb < b->len && ((!isblankrune(r2) && 150 !isalnumrune(r2)) || (!isprintrune(r2)))) 151 offb += chartorune(&r2, b->data + offb); 152 } 153 else if (flags & MOD_D) { 154 while (offa < a->len && !isblankrune(r1) && 155 !isalnumrune(r1)) 156 offa += chartorune(&r1, a->data + offa); 157 while (offb < b->len && !isblankrune(r2) && 158 !isalnumrune(r2)) 159 offb += chartorune(&r2, b->data + offb); 160 } 161 else if (flags & MOD_I) { 162 while (offa < a->len && !isprintrune(r1)) 163 offa += chartorune(&r1, a->data + offa); 164 while (offb < b->len && !isprintrune(r2)) 165 offb += chartorune(&r2, b->data + offb); 166 } 167 if (flags & MOD_F) { 168 r1 = toupperrune(r1); 169 r2 = toupperrune(r2); 170 } 171 } while (r1 && r1 == r2); 172 173 return r1 - r2; 174 } 175 176 static int 177 slinecmp(struct line *a, struct line *b) 178 { 179 int res = 0; 180 double x, y; 181 struct keydef *kd; 182 183 TAILQ_FOREACH(kd, &kdhead, entry) { 184 columns(a, kd, &col1); 185 columns(b, kd, &col2); 186 187 /* if -u is given, don't use default key definition 188 * unless it is the only one */ 189 if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) && 190 TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) { 191 res = 0; 192 } else if (kd->flags & MOD_N) { 193 x = strtod(col1.line.data, NULL); 194 y = strtod(col2.line.data, NULL); 195 res = (x < y) ? -1 : (x > y); 196 } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) { 197 res = skipmodcmp(&col1.line, &col2.line, kd->flags); 198 } else { 199 res = linecmp(&col1.line, &col2.line); 200 } 201 202 if (kd->flags & MOD_R) 203 res = -res; 204 if (res) 205 break; 206 } 207 208 return res; 209 } 210 211 static int 212 check(FILE *fp, const char *fname) 213 { 214 static struct line prev, cur, tmp; 215 static size_t prevsize, cursize, tmpsize; 216 ssize_t len; 217 218 if (!prev.data) { 219 if ((len = getline(&prev.data, &prevsize, fp)) < 0) 220 eprintf("getline:"); 221 prev.len = len; 222 } 223 while ((len = getline(&cur.data, &cursize, fp)) > 0) { 224 cur.len = len; 225 if (uflag > slinecmp(&cur, &prev)) { 226 if (!Cflag) { 227 weprintf("disorder %s: ", fname); 228 fwrite(cur.data, 1, cur.len, stderr); 229 } 230 return 1; 231 } 232 tmp = cur; 233 tmpsize = cursize; 234 cur = prev; 235 cursize = prevsize; 236 prev = tmp; 237 prevsize = tmpsize; 238 } 239 240 return 0; 241 } 242 243 static int 244 parse_flags(char **s, int *flags, int bflag) 245 { 246 while (isalpha((int)**s)) { 247 switch (*((*s)++)) { 248 case 'b': 249 *flags |= bflag; 250 break; 251 case 'd': 252 *flags |= MOD_D; 253 break; 254 case 'f': 255 *flags |= MOD_F; 256 break; 257 case 'i': 258 *flags |= MOD_I; 259 break; 260 case 'n': 261 *flags |= MOD_N; 262 break; 263 case 'r': 264 *flags |= MOD_R; 265 break; 266 default: 267 return -1; 268 } 269 } 270 271 return 0; 272 } 273 274 static void 275 addkeydef(char *kdstr, int flags) 276 { 277 struct keydef *kd; 278 279 kd = enmalloc(2, sizeof(*kd)); 280 281 /* parse key definition kdstr with format 282 * start_column[.start_char][flags][,end_column[.end_char][flags]] 283 */ 284 kd->start_column = 1; 285 kd->start_char = 1; 286 kd->end_column = 0; /* 0 means end of line */ 287 kd->end_char = 0; /* 0 means end of column */ 288 kd->flags = flags; 289 290 if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1) 291 enprintf(2, "invalid start column in key definition\n"); 292 293 if (*kdstr == '.') { 294 if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1) 295 enprintf(2, "invalid start character in key " 296 "definition\n"); 297 } 298 if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0) 299 enprintf(2, "invalid start flags in key definition\n"); 300 301 if (*kdstr == ',') { 302 if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0) 303 enprintf(2, "invalid end column in key definition\n"); 304 if (*kdstr == '.') { 305 if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0) 306 enprintf(2, "invalid end character in key " 307 "definition\n"); 308 } 309 if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0) 310 enprintf(2, "invalid end flags in key definition\n"); 311 } 312 313 if (*kdstr != '\0') 314 enprintf(2, "invalid key definition\n"); 315 316 TAILQ_INSERT_TAIL(&kdhead, kd, entry); 317 } 318 319 static void 320 usage(void) 321 { 322 enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] " 323 "[-k def]... [file ...]\n", argv0); 324 } 325 326 int 327 main(int argc, char *argv[]) 328 { 329 FILE *fp, *ofp = stdout; 330 struct linebuf linebuf = EMPTY_LINEBUF; 331 size_t i; 332 int global_flags = 0, ret = 0; 333 char *outfile = NULL; 334 335 ARGBEGIN { 336 case 'C': 337 Cflag = 1; 338 break; 339 case 'b': 340 global_flags |= MOD_STARTB | MOD_ENDB; 341 break; 342 case 'c': 343 cflag = 1; 344 break; 345 case 'd': 346 global_flags |= MOD_D; 347 break; 348 case 'f': 349 global_flags |= MOD_F; 350 break; 351 case 'i': 352 global_flags |= MOD_I; 353 break; 354 case 'k': 355 addkeydef(EARGF(usage()), global_flags); 356 break; 357 case 'm': 358 /* more or less for free, but for performance-reasons, 359 * we should keep this flag in mind and maybe some later 360 * day implement it properly so we don't run out of memory 361 * while merging large sorted files. 362 */ 363 break; 364 case 'n': 365 global_flags |= MOD_N; 366 break; 367 case 'o': 368 outfile = EARGF(usage()); 369 break; 370 case 'r': 371 global_flags |= MOD_R; 372 break; 373 case 't': 374 fieldsep = EARGF(usage()); 375 if (!*fieldsep) 376 eprintf("empty delimiter\n"); 377 fieldseplen = unescape(fieldsep); 378 break; 379 case 'u': 380 uflag = 1; 381 break; 382 default: 383 usage(); 384 } ARGEND 385 386 /* -b shall only apply to custom key definitions */ 387 if (TAILQ_EMPTY(&kdhead) && global_flags) 388 addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB)); 389 addkeydef("1", global_flags & MOD_R); 390 391 if (!argc) { 392 if (Cflag || cflag) { 393 if (check(stdin, "<stdin>") && !ret) 394 ret = 1; 395 } else { 396 getlines(stdin, &linebuf); 397 } 398 } else for (; *argv; argc--, argv++) { 399 if (!strcmp(*argv, "-")) { 400 *argv = "<stdin>"; 401 fp = stdin; 402 } else if (!(fp = fopen(*argv, "r"))) { 403 enprintf(2, "fopen %s:", *argv); 404 continue; 405 } 406 if (Cflag || cflag) { 407 if (check(fp, *argv) && !ret) 408 ret = 1; 409 } else { 410 getlines(fp, &linebuf); 411 } 412 if (fp != stdin && fshut(fp, *argv)) 413 ret = 2; 414 } 415 416 if (!Cflag && !cflag) { 417 if (outfile && !(ofp = fopen(outfile, "w"))) 418 eprintf("fopen %s:", outfile); 419 420 qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines), 421 (int (*)(const void *, const void *))slinecmp); 422 423 for (i = 0; i < linebuf.nlines; i++) { 424 if (!uflag || i == 0 || 425 slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) { 426 fwrite(linebuf.lines[i].data, 1, 427 linebuf.lines[i].len, ofp); 428 } 429 } 430 } 431 432 if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") | 433 fshut(stderr, "<stderr>")) 434 ret = 2; 435 436 return ret; 437 }