toybox/toys/posix/cut.c
<<
>>
Prefs
   1/* cut.c - print selected ranges from a file
   2 *
   3 * Copyright 2016 Rob Landley <rob@landley.net>
   4 *
   5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
   6 *
   7 * Deviations from posix: added -DF. We can only accept 512 selections, and
   8 * "-" counts as start to end. Using spaces to separate a comma-separated list
   9 * is silly and inconsistent with dd, ps, cp, and mount.
  10 *
  11 * todo: -n, -s with -c
  12
  13USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
  14
  15config CUT
  16  bool "cut"
  17  default y
  18  help
  19    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
  20
  21    Print selected parts of lines from each FILE to standard output.
  22
  23    Each selection LIST is comma separated, either numbers (counting from 1)
  24    or dash separated ranges (inclusive, with X- meaning to end of line and -X
  25    from start). By default selection ranges are sorted and collated, use -D
  26    to prevent that.
  27
  28    -b  Select bytes
  29    -c  Select UTF-8 characters
  30    -C  Select unicode columns
  31    -d  Use DELIM (default is TAB for -f, run of whitespace for -F)
  32    -D  Don't sort/collate selections or match -fF lines without delimiter
  33    -f  Select fields (words) separated by single DELIM character
  34    -F  Select fields separated by DELIM regex
  35    -O  Output delimiter (default one space for -F, input delim for -f)
  36    -s  Skip lines without delimiters
  37*/
  38#define FOR_cut
  39#include "toys.h"
  40
  41GLOBALS(
  42  char *d, *O;
  43  struct arg_list *select[5]; // we treat them the same, so loop through
  44
  45  int pairs;
  46  regex_t reg;
  47)
  48
  49// Return number of bytes to start of first column fitting in columns
  50// invalid sequences are skipped/ignored
  51int unicolumns(char *start, unsigned columns)
  52{
  53  int i, j = 0;
  54  wchar_t wc;
  55  char *s = start, *ss = start;
  56
  57  // Skip start, rounding down if we hit a multicolumn char
  58  while (j<columns && (i = utf8towc(&wc, s, 4))) {
  59    if (i<0) s++;
  60    else {
  61      s += i;
  62      if (0<(i = wcwidth(wc))) {
  63        if ((j += i)>columns) break;
  64        ss = s;
  65      }
  66    }
  67  }
  68
  69  return ss-start;
  70}
  71
  72// Apply selections to an input line, producing output
  73static void cut_line(char **pline, long len)
  74{
  75  unsigned *pairs = (void *)toybuf;
  76  char *line;
  77  int i, j;
  78
  79  if (!pline) return;
  80  line = *pline;
  81  if (len && line[len-1]=='\n') line[--len] = 0;
  82
  83  // Loop through selections
  84  for (i=0; i<TT.pairs; i++) {
  85    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
  86    char *s = line, *ss;
  87
  88    // input: start/end position, count=difference between them
  89    // output: s = start of string, len = bytes to output
  90
  91    if (start) start--;
  92    if (start>=len) continue;
  93    if (!end || end>len) end = len;
  94    count = end-start;
  95
  96    // Find start and end of output string for the relevant selection type
  97    if (toys.optflags&FLAG_b) s += start;
  98    else if (toys.optflags&FLAG_C) {
  99      // crunch_str() currently assumes that combining characters get
 100      // escaped, to provide an unambiguous visual representation.
 101      // This assumes the input string is null terminated.
 102      //if (start) crunch_str(&s, start, 0, 0, 0);
 103      //if (!*s) continue;
 104      //start = s-line;
 105      //ss = s;
 106      //crunch_str(&ss, count, 0, 0, 0);
 107      //count = ss-s;
 108
 109      s += unicolumns(s, start);
 110      count = unicolumns(s, end-start);
 111    } else if (toys.optflags&FLAG_c) {
 112      wchar_t wc;
 113      char *sss;
 114
 115      // Find start
 116      ss = line+len;
 117      while (start && s<ss) {
 118        if (0<=(j = utf8towc(&wc, s, len))) start--;
 119        s += (j<1) ? 1 : j;
 120      }
 121      if (s == ss) continue;
 122
 123      // Find end
 124      end = count;
 125      sss = s;
 126      while (end && sss<ss) {
 127        if (0<=(j = utf8towc(&wc, sss, len))) end--;
 128        sss += (j<1) ? 1 : j;
 129      }
 130      count = sss-s;
 131    } else {
 132      regmatch_t match;
 133
 134      // Loop through skipping appropriate number of fields
 135      for (j = 0; j<2; j++) {
 136        ss = s;
 137        if (j) start = count;
 138        else end = start;
 139        while (*ss && start) {
 140          if (toys.optflags&FLAG_f) {
 141            if (!strchr(TT.d, *ss++)) continue;
 142            if (!--start && j) ss--;
 143          } else {
 144            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
 145              ss = line+len;
 146              continue;
 147            }
 148            if (!match.rm_eo) break; // zero length match == no delimiter
 149            ss += (!--start && j) ? match.rm_so : match.rm_eo;
 150          }
 151        }
 152        if (!j && !*(s = ss)) break;
 153      }
 154
 155      // If we never encountered even one separator, print whole line (posix!)
 156      if (!j && end == start) {
 157        if (toys.optflags&FLAG_D) break;
 158        if (toys.optflags&FLAG_s) return;
 159        fwrite(line, len, 1, stdout);
 160        break;
 161      } else if (!*s) continue;
 162      count = ss-s;
 163    }
 164    if (i && TT.O) fputs(TT.O, stdout);
 165    fwrite(s, count, 1, stdout);
 166  }
 167  xputc('\n');
 168}
 169
 170static int compar(unsigned *a, unsigned *b)
 171{
 172  if (*a<*b) return -1;
 173  if (*a>*b) return 1;
 174  if (a[1]<b[1]) return -1;
 175  if (a[1]>b[1]) return 1;
 176
 177  return 0;
 178}
 179
 180// parse A or A-B or A- or -B
 181static char *get_range(void *data, char *str, int len)
 182{
 183  char *end = str;
 184  unsigned *pairs = (void *)toybuf, i;
 185
 186  // Using toybuf[] to store ranges means we can have 512 selections max.
 187  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
 188  pairs += 2*TT.pairs++;
 189
 190  pairs[1] = UINT_MAX;
 191  for (i = 0; ;i++) {
 192    if (i==2) return end;
 193    if (isdigit(*end)) {
 194      long long ll = estrtol(end, &end, 10);
 195
 196      if (ll<1 || ll>UINT_MAX || errno) return end;
 197      pairs[i] = ll;
 198    }
 199    if (*end++ != '-') break;
 200  }
 201  if (!i) pairs[1] = pairs[0];
 202  if ((end-str)<len) return end;
 203  if (pairs[0]>pairs[1]) return str;
 204
 205  // No error
 206  return 0;
 207}
 208
 209void cut_main(void)
 210{
 211  int i;
 212  char buf[8];
 213
 214  // Parse command line arguments
 215  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
 216    error_exit("-s needs -Ff");
 217  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
 218    error_exit("-d needs -Ff");
 219  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
 220  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
 221  if (!TT.O) {
 222    if (toys.optflags&FLAG_F) TT.O = " ";
 223    else if (toys.optflags&FLAG_f) TT.O = TT.d;
 224  }
 225
 226  // Parse ranges, which are attached to a selection type (only one can be set)
 227  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
 228    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
 229    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
 230  }
 231  if (!TT.pairs) error_exit("no selections");
 232
 233  // Sort and collate selections
 234  if (!(toys.optflags&FLAG_D)) {
 235    int from, to;
 236    unsigned *pairs = (void *)toybuf;
 237
 238    qsort(toybuf, TT.pairs, 8, (void *)compar);
 239    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
 240      if (pairs[from] > pairs[to+1]) {
 241        to += 2;
 242        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
 243      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
 244    }
 245    TT.pairs = (to/2)+1;
 246  }
 247
 248  // For each argument, loop through lines of file and call cut_line() on each
 249  loopfiles_lines(toys.optargs, cut_line);
 250}
 251