toybox/toys/posix/cut.c
<<
>>
Prefs
   1/* cut.c - print selected ranges from a file
   2 *
   3 * Copyright 2016 Rob Landley <rob@landley.net>
   4 *
   5 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
   6 *
   7 * Deviations from posix: added -DF. We can only accept 512 selections, and
   8 * "-" counts as start to end. Using spaces to separate a comma-separated list
   9 * is silly and inconsistent with dd, ps, cp, and mount.
  10 *
  11 * todo: -n, -s with -c
  12
  13USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
  14
  15config CUT
  16  bool "cut"
  17  default y
  18  help
   19    usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
  20
  21    Print selected parts of lines from each FILE to standard output.
  22
  23    Each selection LIST is comma separated, either numbers (counting from 1)
  24    or dash separated ranges (inclusive, with X- meaning to end of line and -X
  25    from start). By default selection ranges are sorted and collated, use -D
  26    to prevent that.
  27
  28    -b  select bytes
  29    -c  select UTF-8 characters
  30    -C  select unicode columns
  31    -d  use DELIM (default is TAB for -f, run of whitespace for -F)
  32    -D  Don't sort/collate selections
  33    -f  select fields (words) separated by single DELIM character
  34    -F  select fields separated by DELIM regex
  35    -O  output delimiter (default one space for -F, input delim for -f)
  36    -s  skip lines without delimiters
  37*/
  38#define FOR_cut
  39#include "toys.h"
  40
  41GLOBALS(
  42  char *d;
  43  char *O;
  44  struct arg_list *select[5]; // we treat them the same, so loop through
  45
  46  int pairs;
  47  regex_t reg;
  48)
  49
  50// Return number of bytes to start of first column fitting in columns
  51// invalid sequences are skipped/ignored
  52int unicolumns(char *start, unsigned columns)
  53{
  54  int i, j = 0;
  55  wchar_t wc;
  56  char *s = start, *ss = start;
  57
  58  // Skip start, rounding down if we hit a multicolumn char
  59  while (j<columns && (i = utf8towc(&wc, s, 4))) {
  60    if (i<0) s++;
  61    else {
  62      s += i;
  63      if (0<(i = wcwidth(wc))) {
  64        if ((j += i)>columns) break;
  65        ss = s;
  66      }
  67    }
  68  }
  69
  70  return ss-start;
  71}
  72
  73
  74// Apply selections to an input line, producing output
  75static void cut_line(char **pline, long len)
  76{
  77  unsigned *pairs = (void *)toybuf;
  78  char *line = *pline;
  79  int i, j;
  80
  81  if (len && line[len-1]=='\n') line[--len] = 0;
  82
  83  // Loop through selections
  84  for (i=0; i<TT.pairs; i++) {
  85    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
  86    char *s = line, *ss;
  87
  88    // input: start/end position, count=difference between them
  89    // output: s = start of string, len = bytes to output
  90
  91    if (start) start--;
  92    if (start>=len) continue;
  93    if (!end || end>len) end = len;
  94    count = end-start;
  95
  96    // Find start and end of output string for the relevant selection type
  97    if (toys.optflags&FLAG_b) s += start;
  98    else if (toys.optflags&FLAG_C) {
  99      // crunch_str() currently assumes that combining characters get
 100      // escaped, to provide an unambiguous visual representation.
 101      // This assumes the input string is null terminated.
 102      //if (start) crunch_str(&s, start, 0, 0, 0);
 103      //if (!*s) continue;
 104      //start = s-line;
 105      //ss = s;
 106      //crunch_str(&ss, count, 0, 0, 0);
 107      //count = ss-s;
 108
 109      s += unicolumns(s, start);
 110      count = unicolumns(s, end-start);
 111    } else if (toys.optflags&FLAG_c) {
 112      wchar_t wc;
 113      char *sss;
 114
 115      // Find start
 116      ss = line+len;
 117      while (start && s<ss) {
 118        if (0<=(j = utf8towc(&wc, s, len))) start--;
 119        s += (j<1) ? 1 : j;
 120      }
 121      if (s == ss) continue;
 122
 123      // Find end
 124      end = count;
 125      sss = s;
 126      while (end && sss<ss) {
 127        if (0<=(j = utf8towc(&wc, sss, len))) end--;
 128        sss += (j<1) ? 1 : j;
 129      }
 130      count = sss-s;
 131    } else {
 132      regmatch_t match;
 133
 134      // Loop through skipping appropriate number of fields
 135      for (j = 0; j<2; j++) {
 136        ss = s;
 137        if (j) start = count;
 138        else end = start;
 139        while (*ss && start) {
 140          if (toys.optflags&FLAG_f) {
 141            if (!strchr(TT.d, *ss++)) continue;
 142            if (!--start && j) ss--;
 143          } else {
 144            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
 145              ss = line+len;
 146              continue;
 147            }
 148            if (!match.rm_eo) break; // zero length match == no delimiter
 149            ss += (!--start && j) ? match.rm_so : match.rm_eo;
 150          }
 151        }
 152        if (!j && !*(s = ss)) break;
 153      }
 154
 155      // If we never encountered even one separator, print whole line (posix!)
 156      if (!j && end == start) {
 157        if (toys.optflags&FLAG_s) return;
 158        fwrite(line, len, 1, stdout);
 159        break;
 160      } else if (!*s) continue;
 161      count = ss-s;
 162    }
 163    if (i && TT.O) fputs(TT.O, stdout);
 164    fwrite(s, count, 1, stdout);
 165  }
 166  xputc('\n');
 167}
 168
 169static int compar(unsigned *a, unsigned *b)
 170{
 171  if (*a<*b) return -1;
 172  if (*a>*b) return 1;
 173  if (a[1]<b[1]) return -1;
 174  if (a[1]>b[1]) return 1;
 175
 176  return 0;
 177}
 178
 179// parse A or A-B or A- or -B
 180static char *get_range(void *data, char *str, int len)
 181{
 182  char *end = str;
 183  unsigned *pairs = (void *)toybuf, i;
 184
 185  // Using toybuf[] to store ranges means we can have 512 selections max.
 186  if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
 187  pairs += 2*TT.pairs++;
 188
 189  pairs[1] = UINT_MAX;
 190  for (i = 0; ;i++) {
 191    if (i==2) return end;
 192    if (isdigit(*end)) {
 193      long long ll = estrtol(end, &end, 10);
 194
 195      if (ll<1 || ll>UINT_MAX || errno) return end;
 196      pairs[i] = ll;
 197    }
 198    if (*end++ != '-') break;
 199  }
 200  if (!i) pairs[1] = pairs[0];
 201  if ((end-str)<len) return end;
 202  if (pairs[0]>pairs[1]) return str;
 203
 204  // No error
 205  return 0;
 206}
 207
 208void cut_main(void)
 209{
 210  int i;
 211  char buf[8];
 212
 213  // Parse command line arguments
 214  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
 215    error_exit("-s needs -Ff");
 216  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
 217    error_exit("-d needs -Ff");
 218  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
 219  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
 220  if (!TT.O) {
 221    if (toys.optflags&FLAG_F) TT.O = " ";
 222    else if (toys.optflags&FLAG_f) TT.O = TT.d;
 223  }
 224
 225  // Parse ranges, which are attached to a selection type (only one can be set)
 226  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
 227    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
 228    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
 229  }
 230  if (!TT.pairs) error_exit("no selections");
 231
 232  // Sort and collate selections
 233  if (!(toys.optflags&FLAG_D)) {
 234    int from, to;
 235    unsigned *pairs = (void *)toybuf;
 236
 237    qsort(toybuf, TT.pairs, 8, (void *)compar);
 238    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
 239      if (pairs[from] > pairs[to+1]) {
 240        to += 2;
 241        memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
 242      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
 243    }
 244    TT.pairs = (to/2)+1;
 245  }
 246
 247  // For each argument, loop through lines of file and call cut_line() on each
 248  loopfiles_lines(toys.optargs, cut_line);
 249}
 250