// toybox/toys/posix/cut.c
/* cut.c - print selected ranges from a file
 *
 * Copyright 2016 Rob Landley <rob@landley.net>
 *
 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
 *
 * Deviations from posix: added -DF. We can only accept 512 selections, and
 * "-" counts as start to end. Using spaces to separate a comma-separated list
 * is silly and inconsistent with dd, ps, cp, and mount.
 *
 * todo: -n, -s with -c

USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))

config CUT
  bool "cut"
  default y
  help
    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]

    Print selected parts of lines from each FILE to standard output.

    Each selection LIST is comma separated, either numbers (counting from 1)
    or dash separated ranges (inclusive, with X- meaning to end of line and -X
    from start). By default selection ranges are sorted and collated, use -D
    to prevent that.

    -b  Select bytes
    -c  Select UTF-8 characters
    -C  Select unicode columns
    -d  Use DELIM (default is TAB for -f, run of whitespace for -F)
    -D  Don't sort/collate selections or match -fF lines without delimiter
    -f  Select fields (words) separated by single DELIM character
    -F  Select fields separated by DELIM regex
    -O  Output delimiter (default one space for -F, input delim for -f)
    -s  Skip lines without delimiters
*/
  38#define FOR_cut
  39#include "toys.h"
  40
  41GLOBALS(
  42  char *d, *O;
  43  struct arg_list *select[5]; // we treat them the same, so loop through
  44
  45  int pairs;
  46  regex_t reg;
  47)
  48
  49
  50// Apply selections to an input line, producing output
  51static void cut_line(char **pline, long len)
  52{
  53  unsigned *pairs = (void *)toybuf;
  54  char *line;
  55  int i, j;
  56
  57  if (!pline) return;
  58  line = *pline;
  59  if (len && line[len-1]=='\n') line[--len] = 0;
  60
  61  // Loop through selections
  62  for (i=0; i<TT.pairs; i++) {
  63    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
  64    char *s = line, *ss;
  65
  66    // input: start/end position, count=difference between them
  67    // output: s = start of string, len = bytes to output
  68
  69    if (start) start--;
  70    if (start>=len) continue;
  71    if (!end || end>len) end = len;
  72    count = end-start;
  73
  74    // Find start and end of output string for the relevant selection type
  75    if (toys.optflags&FLAG_b) s += start;
  76    else if (toys.optflags&FLAG_C) {
  77      // crunch_str() currently assumes that combining characters get
  78      // escaped, to provide an unambiguous visual representation.
  79      // This assumes the input string is null terminated.
  80      if (start) crunch_str(&s, start, 0, 0, 0);
  81      if (!*s) continue;
  82      start = s-line;
  83      ss = s;
  84      crunch_str(&ss, count, 0, 0, 0);
  85      count = ss-s;
  86
  87    } else if (toys.optflags&FLAG_c) {
  88      wchar_t wc;
  89      char *sss;
  90
  91      // Find start
  92      ss = line+len;
  93      while (start && s<ss) {
  94        if (0<=(j = utf8towc(&wc, s, len))) start--;
  95        s += (j<1) ? 1 : j;
  96      }
  97      if (s == ss) continue;
  98
  99      // Find end
 100      end = count;
 101      sss = s;
 102      while (end && sss<ss) {
 103        if (0<=(j = utf8towc(&wc, sss, len))) end--;
 104        sss += (j<1) ? 1 : j;
 105      }
 106      count = sss-s;
 107    } else {
 108      regmatch_t match;
 109
 110      // Loop through skipping appropriate number of fields
 111      for (j = 0; j<2; j++) {
 112        ss = s;
 113        if (j) start = count;
 114        else end = start;
 115        while (*ss && start) {
 116          if (toys.optflags&FLAG_f) {
 117            if (!strchr(TT.d, *ss++)) continue;
 118            if (!--start && j) ss--;
 119          } else {
 120            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
 121              ss = line+len;
 122              continue;
 123            }
 124            if (!match.rm_eo) break; // zero length match == no delimiter
 125            ss += (!--start && j) ? match.rm_so : match.rm_eo;
 126          }
 127        }
 128        if (!j && !*(s = ss)) break;
 129      }
 130
 131      // If we never encountered even one separator, print whole line (posix!)
 132      if (!j && end == start) {
 133        if (toys.optflags&FLAG_D) break;
 134        if (toys.optflags&FLAG_s) return;
 135        fwrite(line, len, 1, stdout);
 136        break;
 137      } else if (!*s) continue;
 138      count = ss-s;
 139    }
 140    if (i && TT.O) fputs(TT.O, stdout);
 141    fwrite(s, count, 1, stdout);
 142  }
 143  xputc('\n');
 144}
 145
 146static int compar(unsigned *a, unsigned *b)
 147{
 148  if (*a<*b) return -1;
 149  if (*a>*b) return 1;
 150  if (a[1]<b[1]) return -1;
 151  if (a[1]>b[1]) return 1;
 152
 153  return 0;
 154}
 155
 156// parse A or A-B or A- or -B
 157static char *get_range(void *data, char *str, int len)
 158{
 159  char *end = str;
 160  unsigned *pairs = (void *)toybuf, i;
 161
 162  // Using toybuf[] to store ranges means we can have 512 selections max.
 163  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
 164  pairs += 2*TT.pairs++;
 165
 166  pairs[1] = UINT_MAX;
 167  for (i = 0; ;i++) {
 168    if (i==2) return end;
 169    if (isdigit(*end)) {
 170      long long ll = estrtol(end, &end, 10);
 171
 172      if (ll<1 || ll>UINT_MAX || errno) return end;
 173      pairs[i] = ll;
 174    }
 175    if (*end++ != '-') break;
 176  }
 177  if (!i) pairs[1] = pairs[0];
 178  if ((end-str)<len) return end;
 179  if (pairs[0]>pairs[1]) return str;
 180
 181  // No error
 182  return 0;
 183}
 184
 185void cut_main(void)
 186{
 187  int i;
 188  char buf[8];
 189
 190  // Parse command line arguments
 191  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
 192    error_exit("-s needs -Ff");
 193  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
 194    error_exit("-d needs -Ff");
 195  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
 196  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
 197  if (!TT.O) {
 198    if (toys.optflags&FLAG_F) TT.O = " ";
 199    else if (toys.optflags&FLAG_f) TT.O = TT.d;
 200  }
 201
 202  // Parse ranges, which are attached to a selection type (only one can be set)
 203  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
 204    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
 205    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
 206  }
 207  if (!TT.pairs) error_exit("no selections");
 208
 209  // Sort and collate selections
 210  if (!(toys.optflags&FLAG_D)) {
 211    int from, to;
 212    unsigned *pairs = (void *)toybuf;
 213
 214    qsort(toybuf, TT.pairs, 8, (void *)compar);
 215    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
 216      if (pairs[from] > pairs[to+1]) {
 217        to += 2;
 218        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
 219      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
 220    }
 221    TT.pairs = (to/2)+1;
 222  }
 223
 224  // For each argument, loop through lines of file and call cut_line() on each
 225  loopfiles_lines(toys.optargs, cut_line);
 226}
 227