toybox/toys/posix/cut.c
<<
>>
Prefs
/* cut.c - print selected ranges from a file
 *
 * Copyright 2016 Rob Landley <rob@landley.net>
 *
 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
 *
 * Deviations from posix: added -DF. We can only accept 512 selections, and
 * "-" counts as start to end. Using spaces to separate a comma-separated list
 * is silly and inconsistent with dd, ps, cp, and mount.
 *
 * todo: -n, -s with -c

USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbfF]", TOYFLAG_USR|TOYFLAG_BIN))

config CUT
  bool "cut"
  default y
  help
    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]

    Print selected parts of lines from each FILE to standard output.

    Each selection LIST is comma separated, either numbers (counting from 1)
    or dash separated ranges (inclusive, with X- meaning to end of line and -X
    from start). By default selection ranges are sorted and collated, use -D
    to prevent that.

    -b  Select bytes
    -c  Select UTF-8 characters
    -C  Select unicode columns
    -d  Use DELIM (default is TAB for -f, run of whitespace for -F)
    -D  Don't sort/collate selections or match -fF lines without delimiter
    -f  Select fields (words) separated by single DELIM character
    -F  Select fields separated by DELIM regex
    -O  Output delimiter (default one space for -F, input delim for -f)
    -s  Skip lines without delimiters
*/
#define FOR_cut
#include "toys.h"

  41GLOBALS(
  42  char *d, *O;
  43  struct arg_list *select[5]; // we treat them the same, so loop through
  44
  45  unsigned line;
  46  int pairs;
  47  regex_t reg;
  48)
  49
  50// Apply selections to an input line, producing output
  51static void cut_line(char **pline, long len)
  52{
  53  unsigned *pairs = (void *)toybuf, wc;
  54  char *line;
  55  int i, j, k;
  56
  57  if (!pline) return;
  58  line = *pline;
  59  if (len && line[len-1]=='\n') line[--len] = 0;
  60  TT.line++;
  61
  62  // Loop through selections
  63  for (i=0; i<TT.pairs; i++) {
  64    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
  65    char *s = line, *ss, *sss;
  66
  67    // when the delimiter is \n output lines.
  68    if (*TT.d == '\n') {
  69      if (TT.line<start || TT.line>end) {
  70        if (i+1 == TT.pairs) return;
  71        continue;
  72      }
  73      goto write_line;
  74    }
  75
  76    // input: start/end position, count=difference between them
  77    // output: s = start of string, len = bytes to output
  78
  79    if (start) start--;
  80    if (start>=len) continue;
  81    if (!end || end>len) end = len;
  82    count = end-start;
  83
  84    // Find start and end of output string for the relevant selection type
  85    if (FLAG(b)) {
  86      if (!FLAG(n)) s += start;
  87      else {
  88        if (end>len) end = len;
  89        for (sss = ss = s; (k = (ss-line))<end;) {
  90          if (0>(j = utf8towc(&wc, ss, len))) ss++;
  91          else {
  92            if (((ss += j)-line)<=end) sss = ss;
  93            if ((ss-line)<=start) s = ss;
  94          }
  95        }
  96        if (!(count = sss-s)) continue;
  97      }
  98    } else if (FLAG(C)) {
  99      // crunch_str() currently assumes that combining characters get
 100      // escaped, to provide an unambiguous visual representation.
 101      // This assumes the input string is null terminated.
 102      if (start) crunch_str(&s, start, 0, 0, 0);
 103      if (!*s) continue;
 104      start = s-line;
 105      ss = s;
 106      crunch_str(&ss, count, 0, 0, 0);
 107      count = ss-s;
 108
 109    } else if (FLAG(c)) {
 110
 111      // Find start
 112      ss = line+len;
 113      while (start && s<ss) {
 114        if (0<=(j = utf8towc(&wc, s, len))) start--;
 115        s += (j<1) ? 1 : j;
 116      }
 117      if (s == ss) continue;
 118
 119      // Find end
 120      end = count;
 121      sss = s;
 122      while (end && sss<ss) {
 123        if (0<=(j = utf8towc(&wc, sss, len))) end--;
 124        sss += (j<1) ? 1 : j;
 125      }
 126      count = sss-s;
 127    } else {
 128      regmatch_t match;
 129
 130      // Loop through skipping appropriate number of fields
 131      for (j = 0; j<2; j++) {
 132        ss = s;
 133        if (j) start = count;
 134        else end = start;
 135        while (*ss && start) {
 136          if (FLAG(f)) {
 137            if (!strchr(TT.d, *ss++)) continue;
 138            if (!--start && j) ss--;
 139          } else {
 140            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
 141              ss = line+len;
 142              continue;
 143            }
 144            if (!match.rm_eo) break; // zero length match == no delimiter
 145            ss += (!--start && j) ? match.rm_so : match.rm_eo;
 146          }
 147        }
 148        if (!j && !*(s = ss)) break;
 149      }
 150
 151      // If we never encountered even one separator, print whole line (posix!)
 152      if (!j && end == start) {
 153        if (FLAG(D)) break;
 154        if (FLAG(s)) return;
 155write_line:
 156        fwrite(line, len, 1, stdout);
 157        break;
 158      } else if (!*s) continue;
 159      count = ss-s;
 160    }
 161    if (i && TT.O) fputs(TT.O, stdout);
 162    fwrite(s, count, 1, stdout);
 163  }
 164  xputc('\n');
 165}
 166
 167static int compar(unsigned *a, unsigned *b)
 168{
 169  if (*a<*b) return -1;
 170  if (*a>*b) return 1;
 171  if (a[1]<b[1]) return -1;
 172  if (a[1]>b[1]) return 1;
 173
 174  return 0;
 175}
 176
 177// parse A or A-B or A- or -B
 178static char *get_range(void *data, char *str, int len)
 179{
 180  char *end = str;
 181  unsigned *pairs = (void *)toybuf, i;
 182
 183  // Using toybuf[] to store ranges means we can have 512 selections max.
 184  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
 185  pairs += 2*TT.pairs++;
 186
 187  pairs[1] = UINT_MAX;
 188  for (i = 0; ;i++) {
 189    if (i==2) return end;
 190    if (isdigit(*end)) {
 191      long long ll = estrtol(end, &end, 10);
 192
 193      if (ll<1 || ll>UINT_MAX || errno) return end;
 194      pairs[i] = ll;
 195    }
 196    if (*end++ != '-') break;
 197  }
 198  if (!i) pairs[1] = pairs[0];
 199  if ((end-str)<len) return end;
 200  if (pairs[0]>pairs[1]) return str;
 201
 202  // No error
 203  return 0;
 204}
 205
 206void cut_main(void)
 207{
 208  int i;
 209  char buf[8];
 210
 211  // Parse command line arguments
 212  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
 213    error_exit("-s needs -Ff");
 214  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
 215    error_exit("-d needs -Ff");
 216  if (!TT.d) TT.d = (FLAG(F)) ? "[[:space:]][[:space:]]*" : "\t";
 217  if (FLAG(F)) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
 218  if (!TT.O) {
 219    if (FLAG(F)) TT.O = " ";
 220    else if (FLAG(f)) TT.O = TT.d;
 221  }
 222
 223  // Parse ranges, which are attached to a selection type (only one can be set)
 224  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
 225    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
 226    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
 227  }
 228  if (!TT.pairs) error_exit("no selections");
 229
 230  // Sort and collate selections
 231  if (!FLAG(D)) {
 232    int from, to;
 233    unsigned *pairs = (void *)toybuf;
 234
 235    qsort(toybuf, TT.pairs, 8, (void *)compar);
 236    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
 237      if (pairs[from] > pairs[to+1]) {
 238        to += 2;
 239        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
 240      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
 241    }
 242    TT.pairs = (to/2)+1;
 243  }
 244
 245  // For each argument, loop through lines of file and call cut_line() on each
 246  loopfiles_lines(toys.optargs, cut_line);
 247}
 248