toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13
  14USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  15
  16config SED
  17  bool "sed"
  18  default y
  19  help
  20    usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  21
  22    Stream editor. Apply one or more editing SCRIPTs to each line of input
  23    (from FILE or stdin) producing output (by default to stdout).
  24
  25    -e  Add SCRIPT to list
  26    -f  Add contents of SCRIPT_FILE to list
  27    -i  Edit each file in place (-iEXT keeps backup file with extension EXT)
  28    -n  No default output (use the p command to output matched lines)
  29    -r  Use extended regular expression syntax
  30    -E  POSIX alias for -r
  31    -s  Treat input files separately (implied by -i)
  32    -z  Use \0 rather than \n as the input line separator
  33
  34    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  35    semicolons. All -e SCRIPTs are concatenated together as if separated
  36    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  37    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  38
  39    Each COMMAND may be preceded by an address which limits the command to
  40    apply only to the specified line(s). Commands without an address apply to
  41    every line. Addresses are of the form:
  42
  43      [ADDRESS[,ADDRESS]]COMMAND
  44
  45    The ADDRESS may be a decimal line number (starting at 1), a /regular
  46    expression/ within a pair of forward slashes, or the character "$" which
  47    matches the last line of input. (In -s or -i mode this matches the last
  48    line of each file, otherwise just the last line of the last file.) A single
  49    address matches one line, a pair of comma separated addresses match
  50    everything from the first address to the second address (inclusive). If
  51    both addresses are regular expressions, more than one range of lines in
  52    each file can match. The second address can be +N to end N lines later.
  53
  54    REGULAR EXPRESSIONS in sed are started and ended by the same character
  55    (traditionally / but anything except a backslash or a newline works).
  56    Backslashes may be used to escape the delimiter if it occurs in the
  57    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  58    and unicode). An empty regex repeats the previous one. ADDRESS regexes
  59    (above) require the first delimiter to be escaped with a backslash when
  60    it isn't a forward slash (to distinguish it from the COMMANDs below).
  61
  62    Sed mostly operates on individual lines one at a time. It reads each line,
  63    processes it, and either writes it to the output or discards it before
  64    reading the next line. Sed can remember one additional line in a separate
  65    buffer (using the h, H, g, G, and x commands), and can read the next line
  66    of input early (using the n and N command), but other than that command
  67    scripts operate on individual lines of text.
  68
  69    Each COMMAND starts with a single character. The following commands take
  70    no arguments:
  71
  72      {  Start a new command block, continuing until a corresponding "}".
  73         Command blocks may nest. If the block has an address, commands within
  74         the block are only run for lines within the block's address range.
  75
  76      }  End command block (this command cannot have an address)
  77
  78      d  Delete this line and move on to the next one
  79         (ignores remaining COMMANDs)
  80
  81      D  Delete one line of input and restart command SCRIPT (same as "d"
  82         unless you've glued lines together with "N" or similar)
  83
  84      g  Get remembered line (overwriting current line)
  85
  86      G  Get remembered line (appending to current line)
  87
  88      h  Remember this line (overwriting remembered line)
  89
  90      H  Remember this line (appending to remembered line, if any)
  91
  92      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  93         nonprintable characters, wrapping lines to terminal width with a
  94         backslash, and appending $ to actual end of line.
  95
  96      n  Print default output and read next line, replacing current line
  97         (If no next line available, quit processing script)
  98
  99      N  Append next line of input to this line, separated by a newline
 100         (This advances the line counter for address matching and "=", if no
 101         next line available quit processing script without default output)
 102
 103      p  Print this line
 104
 105      P  Print this line up to first newline (from "N")
 106
 107      q  Quit (print default output, no more commands processed or lines read)
 108
 109      x  Exchange this line with remembered line (overwrite in both directions)
 110
 111      =  Print the current line number (followed by a newline)
 112
    The following commands (may) take an argument. The "text" arguments (to
    the "a", "c", and "i" commands) may end with an unescaped "\" to append
    the next line (for which leading whitespace is not skipped), and also
    treat ";" as a literal character (use "\;" instead).
 117
 118      a [text]   Append text to output before attempting to read next line
 119
 120      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 121
 122      c [text]   Delete line, output text at end of matching address range
 123                 (ignores remaining COMMANDs)
 124
 125      i [text]   Print text
 126
 127      r [file]   Append contents of file to output before attempting to read
 128                 next line.
 129
 130      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 131                 The first character after the "s" (anything but newline or
 132                 backslash) is the delimiter, escape with \ to use normally.
 133
 134                 The replacement text may contain "&" to substitute the matched
 135                 text (escape it with backslash for a literal &), or \1 through
 136                 \9 to substitute a parenthetical subexpression in the regex.
 137                 You can also use the normal backslash escapes such as \n and
 138                 a backslash at the end of the line appends the next line.
 139
 140                 The flags are:
 141
 142                 [0-9]    A number, substitute only that occurrence of pattern
 143                 g        Global, substitute all occurrences of pattern
 144                 i        Ignore case when matching
 145                 p        Print the line if match was found and replaced
 146                 w [file] Write (append) line to file if match replaced
 147
 148      t [label]  Test, jump to :label only if an "s" command found a match in
 149                 this line since last test (replacing with same text counts)
 150
 151      T [label]  Test false, jump only if "s" hasn't found a match.
 152
 153      w [file]   Write (append) line to file
 154
 155      y/old/new/ Change each character in 'old' to corresponding character
 156                 in 'new' (with standard backslash escapes, delimiter can be
 157                 any repeated character except \ or \n)
 158
 159      : [label]  Labeled target for jump commands
 160
 161      #  Comment, ignore rest of this line of SCRIPT
 162
 163    Deviations from POSIX: allow extended regular expressions with -r,
 164    editing in place with -i, separate with -s, NUL-separated input with -z,
 165    printf escapes in text, line continuations, semicolons after all commands,
    2-address anywhere an address is allowed, "T" command, multiline
    continuations for [aci], \; to end [aci] argument before end of line.
 168*/
 169
 170#define FOR_sed
 171#include "toys.h"
 172
GLOBALS(
  // Option arguments.
  // NOTE(review): presumably filled in by option parsing in an order tied to
  // the NEWTOY() option string ("e*f*i:;...") - confirm before reordering.
  char *i;                 // -i argument; do_sed_file() uses it as backup suffix
  struct arg_list *f, *e;  // -f / -e arguments (per help: script files / scripts)

  // processed pattern list
  // (sed_line() and do_sed_file() walk it by casting to struct sedcmd *)
  struct double_list *pattern;

  // nextline: input line held back one call by sed_line() (lookahead).
  // remember: the "remembered line" used by g/G/h/H/x; parse_pattern() also
  //           borrows it as scratch for the s/// regex text while parsing.
  char *nextline, *remember;
  // restart:   command to resume at on the next line after n/N, stored +1 so
  //            it is never NULL (see sed_line()).
  // lastregex: most recent regex returned by get_regex(), reused for empty
  //            regexes.
  void *restart, *lastregex;
  // nextlen/rememberlen: byte lengths of nextline/remember (nextlen doubles as
  //            the open "{" depth counter while parsing).
  // count:     current input line number (reset per file under -i).
  long nextlen, rememberlen, count;
  // fdout: descriptor emit() writes to.
  // noeol: set when the last emit() did not end with a newline, so the next
  //        emit() writes the pending "\n" first.
  int fdout, noeol;
  unsigned xx;             // cached wrap width for the "l" command
  char delim;              // input line delimiter handed to do_lines()
)
 187
 188// Linked list of parsed sed commands. Offset fields indicate location where
 189// regex or string starts, ala offset+(char *)struct, because we remalloc()
 190// these to expand them for multiline inputs, and pointers would have to be
 191// individually adjusted.
 192
 193struct sedcmd {
 194  struct sedcmd *next, *prev;
 195
 196  // Begin and end of each match
 197  long lmatch[2]; // line number of match
 198  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 199  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 200  unsigned not, hit;
 201  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 202  char c; // action
 203};
 204
 205// Write out line with potential embedded NUL, handling eol/noeol
 206static int emit(char *line, long len, int eol)
 207{
 208  int l, old = line[len];
 209
 210  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 211  TT.noeol = !eol;
 212  if (eol) line[len++] = '\n';
 213  if (!len) return 0;
 214  l = writeall(TT.fdout, line, len);
 215  if (eol) line[len-1] = old;
 216  if (l != len) {
 217    if (TT.fdout != 1) perror_msg("short write");
 218
 219    return 1;
 220  }
 221
 222  return 0;
 223}
 224
 225// Extend allocation to include new string, with newline between if newlen<0
 226
 227static char *extend_string(char **old, char *new, int oldlen, int newlen)
 228{
 229  int newline = newlen < 0;
 230  char *s;
 231
 232  if (newline) newlen = -newlen;
 233  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 234  if (newline) s[oldlen++] = '\n';
 235  memcpy(s+oldlen, new, newlen);
 236  s[oldlen+newlen] = 0;
 237
 238  return s+oldlen+newlen+1;
 239}
 240
 241// An empty regex repeats the previous one
 242static void *get_regex(void *command, int offset)
 243{
 244  if (!offset) {
 245    if (!TT.lastregex) error_exit("no previous regex");
 246    return TT.lastregex;
 247  }
 248
 249  return TT.lastregex = offset+(char *)command;
 250}
 251
 252// Apply pattern to line from input file
 253static void sed_line(char **pline, long plen)
 254{
 255  struct append {
 256    struct append *next, *prev;
 257    int file;
 258    char *str;
 259  } *append = 0;
 260  char *line = TT.nextline;
 261  long len = TT.nextlen;
 262  struct sedcmd *command;
 263  int eol = 0, tea = 0;
 264
 265  // Ignore EOF for all files before last unless -i
 266  if (!pline && !FLAG(i)) return;
 267
 268  // Grab next line for deferred processing (EOF detection: we get a NULL
 269  // pline at EOF to flush last line). Note that only end of _last_ input
 270  // file matches $ (unless we're doing -i).
 271  TT.nextline = 0;
 272  TT.nextlen = 0;
 273  if (pline) {
 274    TT.nextline = *pline;
 275    TT.nextlen = plen;
 276    *pline = 0;
 277  }
 278
 279  if (!line || !len) return;
 280  if (line[len-1] == '\n') line[--len] = eol++;
 281  TT.count++;
 282
 283  // The restart-1 is because we added one to make sure it wasn't NULL,
 284  // otherwise N as last command would restart script
 285  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 286  TT.restart = 0;
 287
 288  while (command) {
 289    char *str, c = command->c;
 290
 291    // Have we got a line or regex matching range for this rule?
 292    if (*command->lmatch || *command->rmatch) {
 293      int miss = 0;
 294      long lm;
 295
 296      // In a match that might end?
 297      if (command->hit) {
 298        if (!(lm = command->lmatch[1])) {
 299          if (!command->rmatch[1]) command->hit = 0;
 300          else {
 301            void *rm = get_regex(command, command->rmatch[1]);
 302
 303            // regex match end includes matching line, so defer deactivation
 304            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 305          }
 306        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 307        else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
 308
 309      // Start a new match?
 310      } else {
 311        if (!(lm = *command->lmatch)) {
 312          void *rm = get_regex(command, *command->rmatch);
 313
 314          if (line && !regexec0(rm, line, len, 0, 0, 0))
 315            command->hit = TT.count;
 316        } else if (lm == TT.count || (lm == -1 && !pline))
 317          command->hit = TT.count;
 318
 319        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 320      } 
 321
 322      // Didn't match?
 323      lm = !(command->not^!!command->hit);
 324
 325      // Deferred disable from regex end match
 326      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 327
 328      if (lm) {
 329        // Handle skipping curly bracket command group
 330        if (c == '{') {
 331          int curly = 1;
 332
 333          while (curly) {
 334            command = command->next;
 335            if (command->c == '{') curly++;
 336            if (command->c == '}') curly--;
 337          }
 338        }
 339        command = command->next;
 340        continue;
 341      }
 342    }
 343
 344    // A deleted line can still update line match state for later commands
 345    if (!line) {
 346      command = command->next;
 347      continue;
 348    }
 349
 350    // Process command
 351
 352    if (c=='a' || c=='r') {
 353      struct append *a = xzalloc(sizeof(struct append));
 354      if (command->arg1) a->str = command->arg1+(char *)command;
 355      a->file = c=='r';
 356      dlist_add_nomalloc((void *)&append, (void *)a);
 357    } else if (c=='b' || c=='t' || c=='T') {
 358      int t = tea;
 359
 360      if (c != 'b') tea = 0;
 361      if (c=='b' || t^(c=='T')) {
 362        if (!command->arg1) break;
 363        str = command->arg1+(char *)command;
 364        for (command = (void *)TT.pattern; command; command = command->next)
 365          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 366            break;
 367        if (!command) error_exit("no :%s", str);
 368      }
 369    } else if (c=='c') {
 370      str = command->arg1+(char *)command;
 371      if (!command->hit) emit(str, strlen(str), 1);
 372      free(line);
 373      line = 0;
 374      continue;
 375    } else if (c=='d') {
 376      free(line);
 377      line = 0;
 378      continue;
 379    } else if (c=='D') {
 380      // Delete up to \n or end of buffer
 381      str = line;
 382      while ((str-line)<len) if (*(str++) == '\n') break;
 383      len -= str - line;
 384      memmove(line, str, len);
 385
 386      // if "delete" blanks line, disable further processing
 387      // otherwise trim and restart script
 388      if (!len) {
 389        free(line);
 390        line = 0;
 391      } else {
 392        line[len] = 0;
 393        command = (void *)TT.pattern;
 394      }
 395      continue;
 396    } else if (c=='g') {
 397      free(line);
 398      line = xstrdup(TT.remember);
 399      len = TT.rememberlen;
 400    } else if (c=='G') {
 401      line = xrealloc(line, len+TT.rememberlen+2);
 402      line[len++] = '\n';
 403      memcpy(line+len, TT.remember, TT.rememberlen);
 404      line[len += TT.rememberlen] = 0;
 405    } else if (c=='h') {
 406      free(TT.remember);
 407      TT.remember = xstrdup(line);
 408      TT.rememberlen = len;
 409    } else if (c=='H') {
 410      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 411      TT.remember[TT.rememberlen++] = '\n';
 412      memcpy(TT.remember+TT.rememberlen, line, len);
 413      TT.remember[TT.rememberlen += len] = 0;
 414    } else if (c=='i') {
 415      str = command->arg1+(char *)command;
 416      emit(str, strlen(str), 1);
 417    } else if (c=='l') {
 418      int i, x, off;
 419
 420      if (!TT.xx) {
 421        terminal_size(&TT.xx, 0);
 422        if (!TT.xx) TT.xx = 80;
 423        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 424        if (TT.xx > 4) TT.xx -= 4;
 425      }
 426
 427      for (i = off = 0; i<len; i++) {
 428        if (off >= TT.xx) {
 429          toybuf[off++] = '\\';
 430          emit(toybuf, off, 1);
 431          off = 0;
 432        }
 433        x = stridx("\\\a\b\f\r\t\v", line[i]);
 434        if (x != -1) {
 435          toybuf[off++] = '\\';
 436          toybuf[off++] = "\\abfrtv"[x];
 437        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 438        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 439      }
 440      toybuf[off++] = '$';
 441      emit(toybuf, off, 1);
 442    } else if (c=='n') {
 443      TT.restart = command->next+1;
 444
 445      break;
 446    } else if (c=='N') {
 447      // Can't just grab next line because we could have multiple N and
 448      // we need to actually read ahead to get N;$p EOF detection right.
 449      if (pline) {
 450        TT.restart = command->next+1;
 451        extend_string(&line, TT.nextline, len, -TT.nextlen);
 452        free(TT.nextline);
 453        TT.nextline = line;
 454        TT.nextlen += len + 1;
 455        line = 0;
 456      }
 457
 458      // Pending append goes out right after N
 459      goto done; 
 460    } else if (c=='p' || c=='P') {
 461      char *l = (c=='P') ? strchr(line, '\n') : 0;
 462
 463      if (emit(line, l ? l-line : len, eol)) break;
 464    } else if (c=='q') {
 465      if (pline) *pline = (void *)1;
 466      free(TT.nextline);
 467      TT.nextline = 0;
 468      TT.nextlen = 0;
 469
 470      break;
 471    } else if (c=='s') {
 472      char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
 473      regmatch_t *match = (void *)toybuf;
 474      regex_t *reg = get_regex(command, command->arg1);
 475      int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
 476        mlen, off, newlen;
 477
 478      // Loop finding match in remaining line (up to remaining len)
 479      while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
 480        mflags = REG_NOTBOL;
 481
 482        // Zero length matches don't count immediately after a previous match
 483        mlen = match[0].rm_eo-match[0].rm_so;
 484        if (!mlen && !zmatch) {
 485          if (rline-line == len) break;
 486          l2[l2used++] = *rline++;
 487          zmatch++;
 488          continue;
 489        } else zmatch = 0;
 490
 491        // If we're replacing only a specific match, skip if this isn't it
 492        off = command->sflags>>3;
 493        if (off && off != ++count) {
 494          memcpy(l2+l2used, rline, match[0].rm_eo);
 495          l2used += match[0].rm_eo;
 496          rline += match[0].rm_eo;
 497
 498          continue;
 499        }
 500        // The fact getline() can allocate unbounded amounts of memory is
 501        // a bigger issue, but while we're here check for integer overflow
 502        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 503
 504        // newlen = strlen(new) but with \1 and & and printf escapes
 505        for (off = newlen = 0; new[off]; off++) {
 506          int cc = -1;
 507
 508          if (new[off] == '&') cc = 0;
 509          else if (new[off] == '\\') cc = new[++off] - '0';
 510          if (cc < 0 || cc > 9) {
 511            newlen++;
 512            continue;
 513          }
 514          newlen += match[cc].rm_eo-match[cc].rm_so;
 515        }
 516
 517        // Copy changed data to new string
 518
 519        // Adjust allocation size of new string, copy data we know we'll keep
 520        l2l += newlen-mlen;
 521        if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
 522        if (match[0].rm_so) {
 523          memcpy(l2+l2used, rline, match[0].rm_so);
 524          l2used += match[0].rm_so;
 525        }
 526
 527        // copy in new replacement text
 528        for (off = mlen = 0; new[off]; off++) {
 529          int cc = 0, ll;
 530
 531          if (new[off] == '\\') {
 532            cc = new[++off] - '0';
 533            if (cc<0 || cc>9) {
 534              if (!(l2[l2used+mlen++] = unescape(new[off])))
 535                l2[l2used+mlen-1] = new[off];
 536
 537              continue;
 538            } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
 539          } else if (new[off] != '&') {
 540            l2[l2used+mlen++] = new[off];
 541
 542            continue;
 543          }
 544
 545          if (match[cc].rm_so != -1) {
 546            ll = match[cc].rm_eo-match[cc].rm_so;
 547            memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
 548            mlen += ll;
 549          }
 550        }
 551        l2used += newlen;
 552        rline += match[0].rm_eo;
 553
 554        // Stop after first substitution unless we have flag g
 555        if (!(command->sflags & 2)) break;
 556      }
 557
 558      // If we made any changes, finish off l2 and swap it for line
 559      if (l2) {
 560        // grab trailing unmatched data and null terminator, swap with original
 561        mlen = len-(rline-line);
 562        memcpy(l2+l2used, rline, mlen+1);
 563        len = l2used + mlen;
 564        free(line);
 565        line = l2;
 566      }
 567
 568      if (mflags) {
 569        // flag p
 570        if (command->sflags & 4) emit(line, len, eol);
 571
 572        tea = 1;
 573        if (command->w) goto writenow;
 574      }
 575    } else if (c=='w') {
 576      int fd, noeol;
 577      char *name;
 578
 579writenow:
 580      // Swap out emit() context
 581      fd = TT.fdout;
 582      noeol = TT.noeol;
 583
 584      // We save filehandle and newline status before filename
 585      name = command->w + (char *)command;
 586      memcpy(&TT.fdout, name, 4);
 587      name += 4;
 588      TT.noeol = *(name++);
 589
 590      // write, then save/restore context
 591      if (emit(line, len, eol))
 592        perror_exit("w '%s'", command->arg1+(char *)command);
 593      *(--name) = TT.noeol;
 594      TT.noeol = noeol;
 595      TT.fdout = fd;
 596    } else if (c=='x') {
 597      long swap = TT.rememberlen;
 598
 599      str = TT.remember;
 600      TT.remember = line;
 601      line = str;
 602      TT.rememberlen = len;
 603      len = swap;
 604    } else if (c=='y') {
 605      char *from, *to = (char *)command;
 606      int i, j;
 607
 608      from = to+command->arg1;
 609      to += command->arg2;
 610
 611      for (i = 0; i < len; i++) {
 612        j = stridx(from, line[i]);
 613        if (j != -1) line[i] = to[j];
 614      }
 615    } else if (c=='=') {
 616      sprintf(toybuf, "%ld", TT.count);
 617      if (emit(toybuf, strlen(toybuf), 1)) break;
 618    }
 619
 620    command = command->next;
 621  }
 622
 623  if (line && !FLAG(n)) emit(line, len, eol);
 624
 625done:
 626  if (dlist_terminate(append)) while (append) {
 627    struct append *a = append->next;
 628
 629    if (append->file) {
 630      int fd = open(append->str, O_RDONLY);
 631
 632      // Force newline if noeol pending
 633      if (fd != -1) {
 634        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 635        TT.noeol = 0;
 636        xsendfile(fd, TT.fdout);
 637        close(fd);
 638      }
 639    } else if (append->str) emit(append->str, strlen(append->str), 1);
 640    else emit(line, 0, 0);
 641    free(append);
 642    append = a;
 643  }
 644  free(line);
 645}
 646
 647// Callback called on each input file
 648static void do_sed_file(int fd, char *name)
 649{
 650  char *tmp;
 651
 652  if (FLAG(i)) {
 653    struct sedcmd *command;
 654
 655    if (!fd) return error_msg("-i on stdin");
 656    TT.fdout = copy_tempfile(fd, name, &tmp);
 657    TT.count = 0;
 658    for (command = (void *)TT.pattern; command; command = command->next)
 659      command->hit = 0;
 660  }
 661  do_lines(fd, TT.delim, sed_line);
 662  if (FLAG(i)) {
 663    if (TT.i && *TT.i) {
 664      char *s = xmprintf("%s%s", name, TT.i);
 665
 666      xrename(name, s);
 667      free(s);
 668    }
 669    replace_tempfile(-1, TT.fdout, &tmp);
 670    TT.fdout = 1;
 671    TT.nextline = 0;
 672    TT.nextlen = TT.noeol = 0;
 673  }
 674}
 675
 676// Copy chunk of string between two delimiters, converting printf escapes.
 677// returns processed copy of string (0 if error), *pstr advances to next
 678// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 679// if regxex, ignore delimiter in [ranges]
 680static char *unescape_delimited_string(char **pstr, char *delim)
 681{
 682  char *to, *from, mode = 0, d;
 683
 684  // Grab leading delimiter (if necessary), allocate space for new string
 685  from = *pstr;
 686  if (!delim || !*delim) {
 687    if (!(d = *(from++))) return 0;
 688    if (d == '\\') d = *(from++);
 689    if (!d || d == '\\') return 0;
 690    if (delim) *delim = d;
 691  } else d = *delim;
 692  to = delim = xmalloc(strlen(*pstr)+1);
 693
 694  while (mode || *from != d) {
 695    if (!*from) return 0;
 696
 697    // delimiter in regex character range doesn't count
 698    if (*from == '[') {
 699      if (!mode) {
 700        mode = ']';
 701        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 702      } else if (mode == ']' && strchr(".=:", from[1])) {
 703        *(to++) = *(from++);
 704        mode = *from;
 705      }
 706    } else if (*from == mode) {
 707      if (mode == ']') mode = 0;
 708      else {
 709        *(to++) = *(from++);
 710        mode = ']';
 711      }
 712    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 713    // but the perl build does it, so we need to filter it out.
 714    } else if (mode && *from == '-' && from[-1] == from[1]) {
 715      from+=2;
 716      continue;
 717    } else if (*from == '\\') {
 718      if (!from[1]) return 0;
 719
 720      // Check escaped end delimiter before printf style escapes.
 721      if (from[1] == d) from++;
 722      else if (from[1]=='\\') *(to++) = *(from++);
 723      else {
 724        char c = unescape(from[1]);
 725
 726        if (c) {
 727          *(to++) = c;
 728          from+=2;
 729          continue;
 730        } else if (!mode) *(to++) = *(from++);
 731      }
 732    }
 733    *(to++) = *(from++);
 734  }
 735  *to = 0;
 736  *pstr = from+1;
 737
 738  return delim;
 739}
 740
 741// Translate pattern strings into command structures. Each command structure
 742// is a single allocation (which requires some math and remalloc at times).
 743static void parse_pattern(char **pline, long len)
 744{
 745  struct sedcmd *command = (void *)TT.pattern;
 746  char *line, *reg, c, *errstart;
 747  int i;
 748
 749  line = errstart = pline ? *pline : "";
 750  if (len && line[len-1]=='\n') line[--len] = 0;
 751
 752  // Append this line to previous multiline command? (hit indicates type.)
 753  // During parsing "hit" stores data about line continuations, but in
 754  // sed_line() it means the match range attached to this command
 755  // is active, so processing the continuation must zero it again.
 756  if (command && command->prev->hit) {
 757    // Remove half-finished entry from list so remalloc() doesn't confuse it
 758    TT.pattern = TT.pattern->prev;
 759    command = dlist_pop(&TT.pattern);
 760    c = command->c;
 761    reg = (char *)command;
 762    reg += command->arg1 + strlen(reg + command->arg1);
 763
 764    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 765    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 766    // a unicode character.
 767    if (command->hit < 256) goto resume_s;
 768    else goto resume_a;
 769  }
 770
 771  // Loop through commands in this line.
 772
 773  command = 0;
 774  for (;;) {
 775    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 776
 777    // If there's no more data on this line, return.
 778    for (;;) {
 779      while (isspace(*line) || *line == ';') line++;
 780      if (*line == '#') while (*line && *line != '\n') line++;
 781      else break;
 782    }
 783    if (!*line) return;
 784
 785    // Start by writing data into toybuf.
 786
 787    errstart = line;
 788    memset(toybuf, 0, sizeof(struct sedcmd));
 789    command = (void *)toybuf;
 790    reg = toybuf + sizeof(struct sedcmd);
 791
 792    // Parse address range (if any)
 793    for (i = 0; i < 2; i++) {
 794      if (*line == ',') line++;
 795      else if (i) break;
 796
 797      if (i && *line == '+' && isdigit(line[1])) {
 798        line++;
 799        command->lmatch[i] = -2-strtol(line, &line, 0);
 800      } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 801      else if (*line == '$') {
 802        command->lmatch[i] = -1;
 803        line++;
 804      } else if (*line == '/' || *line == '\\') {
 805        char *s = line;
 806
 807        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 808        if (!*s) command->rmatch[i] = 0;
 809        else {
 810          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
 811          command->rmatch[i] = reg-toybuf;
 812          reg += sizeof(regex_t);
 813        }
 814        free(s);
 815      } else break;
 816    }
 817
 818    while (isspace(*line)) line++;
 819    if (!*line) break;
 820
 821    while (*line == '!') {
 822      command->not = 1;
 823      line++;
 824    }
 825    while (isspace(*line)) line++;
 826
 827    c = command->c = *(line++);
 828    if (strchr("}:", c) && i) break;
 829    if (strchr("aiqr=", c) && i>1) break;
 830
 831    // Allocate memory and copy out of toybuf now that we know how big it is
 832    command = xmemdup(toybuf, reg-toybuf);
 833    reg = (reg-toybuf) + (char *)command;
 834
 835    // Parse arguments by command type
 836    if (c == '{') TT.nextlen++;
 837    else if (c == '}') {
 838      if (!TT.nextlen--) break;
 839    } else if (c == 's') {
 840      char *end, delim = 0;
 841
 842      // s/pattern/replacement/flags
 843
 844      // line continuations use arg1 (back at the start of the function),
 845      // so let's fill out arg2 first (since the regex part can't be multiple
 846      // lines) and swap them back later.
 847
 848      // get pattern (just record, we parse it later)
 849      command->arg2 = reg - (char *)command;
 850      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 851        goto error;
 852
 853      reg += sizeof(regex_t);
 854      command->arg1 = reg-(char *)command;
 855      command->hit = delim;
 856resume_s:
 857      // get replacement - don't replace escapes yet because \1 and \& need
 858      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 859      end = line;
 860      while (*end != command->hit) {
 861        if (!*end) goto error;
 862        if (*end++ == '\\') {
 863          if (!*end || *end == '\n') {
 864            end[-1] = '\n';
 865            break;
 866          }
 867          end++;
 868        }
 869      }
 870
 871      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 872      line = end;
 873      // line continuation? (note: '\n' can't be a valid delim).
 874      if (*line == command->hit) command->hit = 0;
 875      else {
 876        if (!*line) continue;
 877        reg--;
 878        line++;
 879        goto resume_s;
 880      }
 881
 882      // swap arg1/arg2 so they're back in order arguments occur.
 883      i = command->arg1;
 884      command->arg1 = command->arg2;
 885      command->arg2 = i;
 886
 887      // get flags
 888      for (line++; *line; line++) {
 889        long l;
 890
 891        if (isspace(*line) && *line != '\n') continue;
 892
 893        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 894        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 895          command->sflags |= l << 3;
 896          line--;
 897        } else break;
 898      }
 899
 900      // We deferred actually parsing the regex until we had the s///i flag
 901      // allocating the space was done by extend_string() above
 902      if (!*TT.remember) command->arg1 = 0;
 903      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 904        (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
 905      free(TT.remember);
 906      TT.remember = 0;
 907      if (*line == 'w') {
 908        line++;
 909        goto writenow;
 910      }
 911    } else if (c == 'w') {
 912      int fd, delim;
 913      char *cc;
 914
 915      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 916      // eol status, and to retain the filename for error messages, we'd need
 917      // to go up to arg5 just for this. Compromise: dynamically allocate the
 918      // filehandle and eol status.
 919
 920writenow:
 921      while (isspace(*line)) line++;
 922      if (!*line) goto error;
 923      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 924      delim = *cc;
 925      *cc = 0;
 926      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 927      *cc = delim;
 928
 929      command->w = reg - (char *)command;
 930      command = xrealloc(command, command->w+(cc-line)+6);
 931      reg = command->w + (char *)command;
 932
 933      memcpy(reg, &fd, 4);
 934      reg += 4;
 935      *(reg++) = 0;
 936      memcpy(reg, line, delim);
 937      reg += delim;
 938      *(reg++) = 0;
 939
 940      line = cc;
 941      if (delim) line += 2;
 942    } else if (c == 'y') {
 943      char *s, delim = 0;
 944      int len;
 945
 946      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 947      command->arg1 = reg-(char *)command;
 948      len = strlen(s);
 949      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 950      free(s);
 951      command->arg2 = reg-(char *)command;
 952      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 953      if (len != strlen(s)) goto error;
 954      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 955      free(s);
 956    } else if (strchr("abcirtTw:", c)) {
 957      int end;
 958
 959      // trim leading spaces
 960      while (isspace(*line) && *line != '\n') line++;
 961
 962      // Resume logic differs from 's' case because we don't add a newline
 963      // unless it's after something, so we add it on return instead.
 964resume_a:
 965      command->hit = 0;
 966
 967      // btT: end with space or semicolon, aicrw continue to newline.
 968      if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) {
 969        // Argument's optional for btT
 970        if (strchr("btT", c)) continue;
 971        else if (!command->arg1) break;
 972      }
 973
 974      // Extend allocation to include new string. We use offsets instead of
 975      // pointers so realloc() moving stuff doesn't break things. Ok to write
 976      // \n over NUL terminator because call to extend_string() adds it back.
 977      if (!command->arg1) command->arg1 = reg - (char*)command;
 978      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 979      else if (!pline) {
 980        command->arg1 = 0;
 981        continue;
 982      }
 983      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 984
 985      // Recopy data to remove escape sequences and handle line continuation.
 986      if (strchr("aci", c)) {
 987        reg -= end+1;
 988        for (i = end; i; i--) {
 989          if ((*reg++ = *line++)=='\\') {
 990
 991            // escape at end of line: resume if -e escaped literal newline,
 992            // else request callback and resume with next line
 993            if (!--i) {
 994              *--reg = 0;
 995              if (*line) {
 996                line++;
 997                goto resume_a;
 998              }
 999              command->hit = 256;
1000              break;
1001            }
1002            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1003            line++;
1004          }
1005        }
1006        *reg = 0;
1007      } else line += end;
1008
1009    // Commands that take no arguments
1010    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
1011  }
1012
1013error:
1014  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1015}
1016
1017void sed_main(void)
1018{
1019  struct arg_list *al;
1020  char **args = toys.optargs;
1021
1022  if (!FLAG(z)) TT.delim = '\n';
1023
1024  // Lie to autoconf when it asks stupid questions, so configure regexes
1025  // that look for "GNU sed version %f" greater than some old buggy number
1026  // don't fail us for not matching their narrow expectations.
1027  if (FLAG(version)) {
1028    xprintf("This is not GNU sed version 9.0\n");
1029    return;
1030  }
1031
1032  // Handling our own --version means we handle our own --help too.
1033  if (FLAG(help)) help_exit(0);
1034
1035  // Parse pattern into commands.
1036
1037  // If no -e or -f, first argument is the pattern.
1038  if (!TT.e && !TT.f) {
1039    if (!*toys.optargs) error_exit("no pattern");
1040    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1041  }
1042
1043  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1044  // so handle all -e, then all -f. (At least the behavior's consistent.)
1045
1046  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1047  parse_pattern(0, 0);
1048  for (al = TT.f; al; al = al->next)
1049    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1050  dlist_terminate(TT.pattern);
1051  if (TT.nextlen) error_exit("no }");  
1052
1053  TT.fdout = 1;
1054  TT.remember = xstrdup("");
1055
1056  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1057  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1058
1059  // Provide EOF flush at end of cumulative input for non-i mode.
1060  if (!FLAG(i)) {
1061    toys.optflags |= FLAG_i;
1062    sed_line(0, 0);
1063  }
1064
1065  // todo: need to close fd when done for TOYBOX_FREE?
1066}
1067