toybox/toys/posix/cut.c
<<
>>
Prefs
   1/* cut.c - print selected ranges from a file
   2 *
   3 * Copyright 2016 Rob Landley <rob@landley.net>
   4 *
   5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
   6 *
   7 * Deviations from posix: added -DF. We can only accept 512 selections, and
   8 * "-" counts as start to end. Using spaces to separate a comma-separated list
   9 * is silly and inconsistent with dd, ps, cp, and mount.
  10 *
  11 * todo: -n, -s with -c
  12
  13USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
  14
  15config CUT
  16  bool "cut"
  17  default y
  18  help
  19    usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...]
  20
  21    Print selected parts of lines from each FILE to standard output.
  22
  23    Each selection LIST is comma separated, either numbers (counting from 1)
  24    or dash separated ranges (inclusive, with X- meaning to end of line and -X
  25    from start). By default selection ranges are sorted and collated, use -D
  26    to prevent that.
  27
  28    -b  select bytes
  29    -c  select UTF-8 characters
  30    -C  select unicode columns
  31    -d  use DELIM (default is TAB for -f, run of whitespace for -F)
  32    -D  Don't sort/collate selections or match -fF lines without delimiter
  33    -f  select fields (words) separated by single DELIM character
  34    -F  select fields separated by DELIM regex
  35    -O  output delimiter (default one space for -F, input delim for -f)
  36    -s  skip lines without delimiters
  37*/
  38#define FOR_cut
  39#include "toys.h"
  40
// Per-command state. The first three members are filled in by option
// parsing, so their order has to stay in step with the option string.
GLOBALS(
  char *d; // -d: input delimiter (character set for -f, regex for -F)
  char *O; // -O: string written between output selections
  struct arg_list *select[5]; // we treat them the same, so loop through
                              // (indexed in "CFfcb" order, see cut_main)

  int pairs;   // number of [start, end] pairs currently stored in toybuf
  regex_t reg; // TT.d compiled by cut_main() when -F is in effect
)
  49
  50// Return number of bytes to start of first column fitting in columns
  51// invalid sequences are skipped/ignored
  52int unicolumns(char *start, unsigned columns)
  53{
  54  int i, j = 0;
  55  wchar_t wc;
  56  char *s = start, *ss = start;
  57
  58  // Skip start, rounding down if we hit a multicolumn char
  59  while (j<columns && (i = utf8towc(&wc, s, 4))) {
  60    if (i<0) s++;
  61    else {
  62      s += i;
  63      if (0<(i = wcwidth(wc))) {
  64        if ((j += i)>columns) break;
  65        ss = s;
  66      }
  67    }
  68  }
  69
  70  return ss-start;
  71}
  72
// Apply selections to an input line, producing output
//
// Passed to loopfiles_lines() by cut_main(). pline points at the line buffer
// (a NULL pline is ignored), len is its length in bytes including any
// trailing newline. The newline is overwritten with a NUL terminator, each
// [start, end] pair in toybuf is resolved to a byte span of the line and
// written to stdout, and one newline is written at the end.
static void cut_line(char **pline, long len)
{
  unsigned *pairs = (void *)toybuf;
  char *line;
  int i, j;

  if (!pline) return;
  line = *pline;
  // Strip the trailing newline so the line is NUL terminated at len
  if (len && line[len-1]=='\n') line[--len] = 0;

  // Loop through selections
  for (i=0; i<TT.pairs; i++) {
    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
    char *s = line, *ss;

    // input: start/end position, count=difference between them
    // output: s = start of string, count = bytes to output

    // Selections are 1-based; 0 (no start given) already means "from start"
    if (start) start--;
    // Positions are compared against the byte length whatever the unit is
    if (start>=len) continue;
    // A missing end, or one past the end of the line, means "to end of line"
    if (!end || end>len) end = len;
    count = end-start;

    // Find start and end of output string for the relevant selection type
    if (toys.optflags&FLAG_b) s += start; // -b: positions are byte offsets
    else if (toys.optflags&FLAG_C) {
      // -C: positions are display columns
      // crunch_str() currently assumes that combining characters get
      // escaped, to provide an unambiguous visual representation.
      // This assumes the input string is null terminated.
      //if (start) crunch_str(&s, start, 0, 0, 0);
      //if (!*s) continue;
      //start = s-line;
      //ss = s;
      //crunch_str(&ss, count, 0, 0, 0);
      //count = ss-s;

      s += unicolumns(s, start);
      count = unicolumns(s, end-start);
    } else if (toys.optflags&FLAG_c) {
      // -c: positions are characters as decoded by utf8towc()
      wchar_t wc;
      char *sss;

      // Find start
      // A negative utf8towc() result is not counted as a character and is
      // stepped over one byte at a time; a zero result also advances one byte.
      ss = line+len;
      while (start && s<ss) {
        if (0<=(j = utf8towc(&wc, s, len))) start--;
        s += (j<1) ? 1 : j;
      }
      if (s == ss) continue;

      // Find end
      // From here on "end" is reused as the number of characters still to take
      end = count;
      sss = s;
      while (end && sss<ss) {
        if (0<=(j = utf8towc(&wc, sss, len))) end--;
        sss += (j<1) ? 1 : j;
      }
      count = sss-s;
    } else {
      // -f / -F: positions are fields
      regmatch_t match;

      // Loop through skipping appropriate number of fields
      // Pass j=0 skips "start" delimiters to reach the first selected field,
      // pass j=1 skips "count" more to find where the selection stops.
      for (j = 0; j<2; j++) {
        ss = s;
        if (j) start = count;
        else end = start; // remember the requested skip count for the check below
        while (*ss && start) {
          if (toys.optflags&FLAG_f) {
            // -f: any character contained in TT.d is a delimiter
            if (!strchr(TT.d, *ss++)) continue;
            // On the final delimiter of pass 1 step back onto it, so the
            // delimiter itself is not part of the output
            if (!--start && j) ss--;
          } else {
            // -F: no further match means the rest of the line is one field
            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
              ss = line+len;
              continue;
            }
            if (!match.rm_eo) break; // zero length match == no delimiter
            // Stop before the final delimiter of pass 1, otherwise move past it
            ss += (!--start && j) ? match.rm_so : match.rm_eo;
          }
        }
        // Pass 0 ran into the end of the line: leave the loop with j still 0
        if (!j && !*(s = ss)) break;
      }

      // If we never encountered even one separator, print whole line (posix!)
      // ("end" still holds the requested skip count; an unchanged "start"
      // means pass 0 consumed no delimiter at all.)
      // NOTE(review): this is only reachable when pass 0 hit the end of the
      // line, so a selection beginning at field 1 (start == 0) never gets
      // here and -s/-D are not applied for it. Confirm that is intended.
      if (!j && end == start) {
        if (toys.optflags&FLAG_D) break;  // -D: emit nothing but the newline
        if (toys.optflags&FLAG_s) return; // -s: drop the line, newline included
        fwrite(line, len, 1, stdout);
        break;
      } else if (!*s) continue; // selected field starts at end of line
      count = ss-s;
    }
    // The output delimiter goes before every selection except the first
    if (i && TT.O) fputs(TT.O, stdout);
    fwrite(s, count, 1, stdout);
  }
  xputc('\n');
}
 170
 171static int compar(unsigned *a, unsigned *b)
 172{
 173  if (*a<*b) return -1;
 174  if (*a>*b) return 1;
 175  if (a[1]<b[1]) return -1;
 176  if (a[1]>b[1]) return 1;
 177
 178  return 0;
 179}
 180
 181// parse A or A-B or A- or -B
 182static char *get_range(void *data, char *str, int len)
 183{
 184  char *end = str;
 185  unsigned *pairs = (void *)toybuf, i;
 186
 187  // Using toybuf[] to store ranges means we can have 512 selections max.
 188  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
 189  pairs += 2*TT.pairs++;
 190
 191  pairs[1] = UINT_MAX;
 192  for (i = 0; ;i++) {
 193    if (i==2) return end;
 194    if (isdigit(*end)) {
 195      long long ll = estrtol(end, &end, 10);
 196
 197      if (ll<1 || ll>UINT_MAX || errno) return end;
 198      pairs[i] = ll;
 199    }
 200    if (*end++ != '-') break;
 201  }
 202  if (!i) pairs[1] = pairs[0];
 203  if ((end-str)<len) return end;
 204  if (pairs[0]>pairs[1]) return str;
 205
 206  // No error
 207  return 0;
 208}
 209
 210void cut_main(void)
 211{
 212  int i;
 213  char buf[8];
 214
 215  // Parse command line arguments
 216  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
 217    error_exit("-s needs -Ff");
 218  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
 219    error_exit("-d needs -Ff");
 220  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
 221  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
 222  if (!TT.O) {
 223    if (toys.optflags&FLAG_F) TT.O = " ";
 224    else if (toys.optflags&FLAG_f) TT.O = TT.d;
 225  }
 226
 227  // Parse ranges, which are attached to a selection type (only one can be set)
 228  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
 229    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
 230    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
 231  }
 232  if (!TT.pairs) error_exit("no selections");
 233
 234  // Sort and collate selections
 235  if (!(toys.optflags&FLAG_D)) {
 236    int from, to;
 237    unsigned *pairs = (void *)toybuf;
 238
 239    qsort(toybuf, TT.pairs, 8, (void *)compar);
 240    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
 241      if (pairs[from] > pairs[to+1]) {
 242        to += 2;
 243        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
 244      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
 245    }
 246    TT.pairs = (to/2)+1;
 247  }
 248
 249  // For each argument, loop through lines of file and call cut_line() on each
 250  loopfiles_lines(toys.optargs, cut_line);
 251}
 252