toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13
  14USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  15
  16config SED
  17  bool "sed"
  18  default y
  19  help
  20    usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  21
  22    Stream editor. Apply one or more editing SCRIPTs to each line of input
  23    (from FILE or stdin) producing output (by default to stdout).
  24
  25    -e  Add SCRIPT to list
  26    -f  Add contents of SCRIPT_FILE to list
  27    -i  Edit each file in place (-iEXT keeps backup file with extension EXT)
  28    -n  No default output (use the p command to output matched lines)
  29    -r  Use extended regular expression syntax
  30    -E  POSIX alias for -r
  31    -s  Treat input files separately (implied by -i)
  32    -z  Use \0 rather than \n as the input line separator
  33
  34    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  35    semicolons. All -e SCRIPTs are concatenated together as if separated
  36    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  37    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  38
  39    Each COMMAND may be preceded by an address which limits the command to
  40    apply only to the specified line(s). Commands without an address apply to
  41    every line. Addresses are of the form:
  42
  43      [ADDRESS[,ADDRESS]]COMMAND
  44
  45    The ADDRESS may be a decimal line number (starting at 1), a /regular
  46    expression/ within a pair of forward slashes, or the character "$" which
  47    matches the last line of input. (In -s or -i mode this matches the last
  48    line of each file, otherwise just the last line of the last file.) A single
  49    address matches one line, a pair of comma separated addresses match
  50    everything from the first address to the second address (inclusive). If
  51    both addresses are regular expressions, more than one range of lines in
  52    each file can match.
  53
  54    REGULAR EXPRESSIONS in sed are started and ended by the same character
  55    (traditionally / but anything except a backslash or a newline works).
  56    Backslashes may be used to escape the delimiter if it occurs in the
  57    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  58    and unicode). An empty regex repeats the previous one. ADDRESS regexes
   59    (above) require the first delimiter to be escaped with a backslash when
  60    it isn't a forward slash (to distinguish it from the COMMANDs below).
  61
  62    Sed mostly operates on individual lines one at a time. It reads each line,
  63    processes it, and either writes it to the output or discards it before
  64    reading the next line. Sed can remember one additional line in a separate
  65    buffer (using the h, H, g, G, and x commands), and can read the next line
  66    of input early (using the n and N command), but other than that command
  67    scripts operate on individual lines of text.
  68
  69    Each COMMAND starts with a single character. The following commands take
  70    no arguments:
  71
  72      {  Start a new command block, continuing until a corresponding "}".
  73         Command blocks may nest. If the block has an address, commands within
  74         the block are only run for lines within the block's address range.
  75
  76      }  End command block (this command cannot have an address)
  77
  78      d  Delete this line and move on to the next one
  79         (ignores remaining COMMANDs)
  80
  81      D  Delete one line of input and restart command SCRIPT (same as "d"
  82         unless you've glued lines together with "N" or similar)
  83
  84      g  Get remembered line (overwriting current line)
  85
  86      G  Get remembered line (appending to current line)
  87
  88      h  Remember this line (overwriting remembered line)
  89
  90      H  Remember this line (appending to remembered line, if any)
  91
  92      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  93         nonprintable characters, wrapping lines to terminal width with a
  94         backslash, and appending $ to actual end of line.
  95
  96      n  Print default output and read next line, replacing current line
  97         (If no next line available, quit processing script)
  98
  99      N  Append next line of input to this line, separated by a newline
 100         (This advances the line counter for address matching and "=", if no
 101         next line available quit processing script without default output)
 102
 103      p  Print this line
 104
 105      P  Print this line up to first newline (from "N")
 106
 107      q  Quit (print default output, no more commands processed or lines read)
 108
 109      x  Exchange this line with remembered line (overwrite in both directions)
 110
 111      =  Print the current line number (followed by a newline)
 112
 113    The following commands (may) take an argument. The "text" arguments (to
  114    the "a", "i", and "c" commands) may end with an unescaped "\" to append
 115    the next line (for which leading whitespace is not skipped), and also
 116    treat ";" as a literal character (use "\;" instead).
 117
 118      a [text]   Append text to output before attempting to read next line
 119
 120      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 121
 122      c [text]   Delete line, output text at end of matching address range
 123                 (ignores remaining COMMANDs)
 124
 125      i [text]   Print text
 126
 127      r [file]   Append contents of file to output before attempting to read
 128                 next line.
 129
 130      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 131                 The first character after the "s" (anything but newline or
 132                 backslash) is the delimiter, escape with \ to use normally.
 133
 134                 The replacement text may contain "&" to substitute the matched
 135                 text (escape it with backslash for a literal &), or \1 through
 136                 \9 to substitute a parenthetical subexpression in the regex.
 137                 You can also use the normal backslash escapes such as \n and
 138                 a backslash at the end of the line appends the next line.
 139
 140                 The flags are:
 141
 142                 [0-9]    A number, substitute only that occurrence of pattern
 143                 g        Global, substitute all occurrences of pattern
 144                 i        Ignore case when matching
 145                 p        Print the line if match was found and replaced
 146                 w [file] Write (append) line to file if match replaced
 147
 148      t [label]  Test, jump to :label only if an "s" command found a match in
 149                 this line since last test (replacing with same text counts)
 150
 151      T [label]  Test false, jump only if "s" hasn't found a match.
 152
 153      w [file]   Write (append) line to file
 154
 155      y/old/new/ Change each character in 'old' to corresponding character
 156                 in 'new' (with standard backslash escapes, delimiter can be
 157                 any repeated character except \ or \n)
 158
 159      : [label]  Labeled target for jump commands
 160
 161      #  Comment, ignore rest of this line of SCRIPT
 162
 163    Deviations from POSIX: allow extended regular expressions with -r,
 164    editing in place with -i, separate with -s, NUL-separated input with -z,
 165    printf escapes in text, line continuations, semicolons after all commands,
 166    2-address anywhere an address is allowed, "T" command, multiline
  167    continuations for [aic], \; to end [aic] argument before end of line.
 168*/
 169
 170#define FOR_sed
 171#include "toys.h"
 172
 173GLOBALS(
 174  char *i;
 175  struct arg_list *f, *e;
 176
 177  // processed pattern list
 178  struct double_list *pattern;
 179
 180  char *nextline, *remember;
 181  void *restart, *lastregex;
 182  long nextlen, rememberlen, count;
 183  int fdout, noeol;
 184  unsigned xx;
 185  char delim;
 186)
 187
 188// Linked list of parsed sed commands. Offset fields indicate location where
 189// regex or string starts, ala offset+(char *)struct, because we remalloc()
 190// these to expand them for multiline inputs, and pointers would have to be
 191// individually adjusted.
 192
 193struct sedcmd {
 194  struct sedcmd *next, *prev;
 195
 196  // Begin and end of each match
 197  long lmatch[2]; // line number of match
 198  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 199  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 200  unsigned not, hit;
 201  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 202  char c; // action
 203};
 204
 205// Write out line with potential embedded NUL, handling eol/noeol
 206static int emit(char *line, long len, int eol)
 207{
 208  int l, old = line[len];
 209
 210  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 211  TT.noeol = !eol;
 212  if (eol) line[len++] = '\n';
 213  if (!len) return 0;
 214  l = writeall(TT.fdout, line, len);
 215  if (eol) line[len-1] = old;
 216  if (l != len) {
 217    perror_msg("short write");
 218
 219    return 1;
 220  }
 221
 222  return 0;
 223}
 224
 225// Extend allocation to include new string, with newline between if newlen<0
 226
 227static char *extend_string(char **old, char *new, int oldlen, int newlen)
 228{
 229  int newline = newlen < 0;
 230  char *s;
 231
 232  if (newline) newlen = -newlen;
 233  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 234  if (newline) s[oldlen++] = '\n';
 235  memcpy(s+oldlen, new, newlen);
 236  s[oldlen+newlen] = 0;
 237
 238  return s+oldlen+newlen+1;
 239}
 240
 241// An empty regex repeats the previous one
 242static void *get_regex(void *trump, int offset)
 243{
 244  if (!offset) {
 245    if (!TT.lastregex) error_exit("no previous regex");
 246    return TT.lastregex;
 247  }
 248
 249  return TT.lastregex = offset+(char *)trump;
 250}
 251
 252// Apply pattern to line from input file
 253static void sed_line(char **pline, long plen)
 254{
 255  struct append {
 256    struct append *next, *prev;
 257    int file;
 258    char *str;
 259  } *append = 0;
 260  char *line = TT.nextline;
 261  long len = TT.nextlen;
 262  struct sedcmd *command;
 263  int eol = 0, tea = 0;
 264
 265  // Ignore EOF for all files before last unless -i
 266  if (!pline && !FLAG(i)) return;
 267
 268  // Grab next line for deferred processing (EOF detection: we get a NULL
 269  // pline at EOF to flush last line). Note that only end of _last_ input
 270  // file matches $ (unless we're doing -i).
 271  TT.nextline = 0;
 272  TT.nextlen = 0;
 273  if (pline) {
 274    TT.nextline = *pline;
 275    TT.nextlen = plen;
 276    *pline = 0;
 277  }
 278
 279  if (!line || !len) return;
 280  if (line[len-1] == '\n') line[--len] = eol++;
 281  TT.count++;
 282
 283  // The restart-1 is because we added one to make sure it wasn't NULL,
 284  // otherwise N as last command would restart script
 285  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 286  TT.restart = 0;
 287
 288  while (command) {
 289    char *str, c = command->c;
 290
 291    // Have we got a line or regex matching range for this rule?
 292    if (*command->lmatch || *command->rmatch) {
 293      int miss = 0;
 294      long lm;
 295
 296      // In a match that might end?
 297      if (command->hit) {
 298        if (!(lm = command->lmatch[1])) {
 299          if (!command->rmatch[1]) command->hit = 0;
 300          else {
 301            void *rm = get_regex(command, command->rmatch[1]);
 302
 303            // regex match end includes matching line, so defer deactivation
 304            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 305          }
 306        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 307
 308      // Start a new match?
 309      } else {
 310        if (!(lm = *command->lmatch)) {
 311          void *rm = get_regex(command, *command->rmatch);
 312
 313          if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
 314        } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
 315
 316        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 317      } 
 318
 319      // Didn't match?
 320      lm = !(command->hit ^ command->not);
 321
 322      // Deferred disable from regex end match
 323      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 324
 325      if (lm) {
 326        // Handle skipping curly bracket command group
 327        if (c == '{') {
 328          int curly = 1;
 329
 330          while (curly) {
 331            command = command->next;
 332            if (command->c == '{') curly++;
 333            if (command->c == '}') curly--;
 334          }
 335        }
 336        command = command->next;
 337        continue;
 338      }
 339    }
 340
 341    // A deleted line can still update line match state for later commands
 342    if (!line) {
 343      command = command->next;
 344      continue;
 345    }
 346
 347    // Process command
 348
 349    if (c=='a' || c=='r') {
 350      struct append *a = xzalloc(sizeof(struct append));
 351      if (command->arg1) a->str = command->arg1+(char *)command;
 352      a->file = c=='r';
 353      dlist_add_nomalloc((void *)&append, (void *)a);
 354    } else if (c=='b' || c=='t' || c=='T') {
 355      int t = tea;
 356
 357      if (c != 'b') tea = 0;
 358      if (c=='b' || t^(c=='T')) {
 359        if (!command->arg1) break;
 360        str = command->arg1+(char *)command;
 361        for (command = (void *)TT.pattern; command; command = command->next)
 362          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 363            break;
 364        if (!command) error_exit("no :%s", str);
 365      }
 366    } else if (c=='c') {
 367      str = command->arg1+(char *)command;
 368      if (!command->hit) emit(str, strlen(str), 1);
 369      free(line);
 370      line = 0;
 371      continue;
 372    } else if (c=='d') {
 373      free(line);
 374      line = 0;
 375      continue;
 376    } else if (c=='D') {
 377      // Delete up to \n or end of buffer
 378      str = line;
 379      while ((str-line)<len) if (*(str++) == '\n') break;
 380      len -= str - line;
 381      memmove(line, str, len);
 382
 383      // if "delete" blanks line, disable further processing
 384      // otherwise trim and restart script
 385      if (!len) {
 386        free(line);
 387        line = 0;
 388      } else {
 389        line[len] = 0;
 390        command = (void *)TT.pattern;
 391      }
 392      continue;
 393    } else if (c=='g') {
 394      free(line);
 395      line = xstrdup(TT.remember);
 396      len = TT.rememberlen;
 397    } else if (c=='G') {
 398      line = xrealloc(line, len+TT.rememberlen+2);
 399      line[len++] = '\n';
 400      memcpy(line+len, TT.remember, TT.rememberlen);
 401      line[len += TT.rememberlen] = 0;
 402    } else if (c=='h') {
 403      free(TT.remember);
 404      TT.remember = xstrdup(line);
 405      TT.rememberlen = len;
 406    } else if (c=='H') {
 407      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 408      TT.remember[TT.rememberlen++] = '\n';
 409      memcpy(TT.remember+TT.rememberlen, line, len);
 410      TT.remember[TT.rememberlen += len] = 0;
 411    } else if (c=='i') {
 412      str = command->arg1+(char *)command;
 413      emit(str, strlen(str), 1);
 414    } else if (c=='l') {
 415      int i, x, off;
 416
 417      if (!TT.xx) {
 418        terminal_size(&TT.xx, 0);
 419        if (!TT.xx) TT.xx = 80;
 420        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 421        if (TT.xx > 4) TT.xx -= 4;
 422      }
 423
 424      for (i = off = 0; i<len; i++) {
 425        if (off >= TT.xx) {
 426          toybuf[off++] = '\\';
 427          emit(toybuf, off, 1);
 428          off = 0;
 429        }
 430        x = stridx("\\\a\b\f\r\t\v", line[i]);
 431        if (x != -1) {
 432          toybuf[off++] = '\\';
 433          toybuf[off++] = "\\abfrtv"[x];
 434        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 435        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 436      }
 437      toybuf[off++] = '$';
 438      emit(toybuf, off, 1);
 439    } else if (c=='n') {
 440      TT.restart = command->next+1;
 441
 442      break;
 443    } else if (c=='N') {
 444      // Can't just grab next line because we could have multiple N and
 445      // we need to actually read ahead to get N;$p EOF detection right.
 446      if (pline) {
 447        TT.restart = command->next+1;
 448        extend_string(&line, TT.nextline, len, -TT.nextlen);
 449        free(TT.nextline);
 450        TT.nextline = line;
 451        TT.nextlen += len + 1;
 452        line = 0;
 453      }
 454
 455      // Pending append goes out right after N
 456      goto done; 
 457    } else if (c=='p' || c=='P') {
 458      char *l = (c=='P') ? strchr(line, '\n') : 0;
 459
 460      if (emit(line, l ? l-line : len, eol)) break;
 461    } else if (c=='q') {
 462      if (pline) *pline = (void *)1;
 463      free(TT.nextline);
 464      TT.nextline = 0;
 465      TT.nextlen = 0;
 466
 467      break;
 468    } else if (c=='s') {
 469      char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
 470      regmatch_t *match = (void *)toybuf;
 471      regex_t *reg = get_regex(command, command->arg1);
 472      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
 473
 474      // Find match in remaining line (up to remaining len)
 475      while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
 476        mflags = REG_NOTBOL;
 477
 478        // Zero length matches don't count immediately after a previous match
 479        mlen = match[0].rm_eo-match[0].rm_so;
 480        if (!mlen && !zmatch) {
 481          if (!rlen--) break;
 482          rline++;
 483          zmatch++;
 484          continue;
 485        } else zmatch = 0;
 486
 487        // If we're replacing only a specific match, skip if this isn't it
 488        off = command->sflags>>3;
 489        if (off && off != ++count) {
 490          rline += match[0].rm_eo;
 491          rlen -= match[0].rm_eo;
 492
 493          continue;
 494        }
 495        // The fact getline() can allocate unbounded amounts of memory is
 496        // a bigger issue, but while we're here check for integer overflow
 497        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 498
 499        // newlen = strlen(new) but with \1 and & and printf escapes
 500        for (off = newlen = 0; new[off]; off++) {
 501          int cc = -1;
 502
 503          if (new[off] == '&') cc = 0;
 504          else if (new[off] == '\\') cc = new[++off] - '0';
 505          if (cc < 0 || cc > 9) {
 506            newlen++;
 507            continue;
 508          }
 509          newlen += match[cc].rm_eo-match[cc].rm_so;
 510        }
 511
 512        // Allocate new size, copy start/end around match. (Can't extend in
 513        // place because backrefs may refer to text after it's overwritten.)
 514        len += newlen-mlen;
 515        swap = xmalloc(len+1);
 516        rswap = swap+(rline-line)+match[0].rm_so;
 517        memcpy(swap, line, (rline-line)+match[0].rm_so);
 518        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
 519
 520        // copy in new replacement text
 521        for (off = mlen = 0; new[off]; off++) {
 522          int cc = 0, ll;
 523
 524          if (new[off] == '\\') {
 525            cc = new[++off] - '0';
 526            if (cc<0 || cc>9) {
 527              if (!(rswap[mlen++] = unescape(new[off])))
 528                rswap[mlen-1] = new[off];
 529
 530              continue;
 531            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
 532          } else if (new[off] != '&') {
 533            rswap[mlen++] = new[off];
 534
 535            continue;
 536          }
 537
 538          ll = match[cc].rm_eo-match[cc].rm_so;
 539          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
 540          mlen += ll;
 541        }
 542
 543        rline = rswap+newlen;
 544        free(line);
 545        line = swap;
 546
 547        // Stop after first substitution unless we have flag g
 548        if (!(command->sflags & 2)) break;
 549      }
 550
 551      if (mflags) {
 552        // flag p
 553        if (command->sflags & 4) emit(line, len, eol);
 554
 555        tea = 1;
 556        if (command->w) goto writenow;
 557      }
 558    } else if (c=='w') {
 559      int fd, noeol;
 560      char *name;
 561
 562writenow:
 563      // Swap out emit() context
 564      fd = TT.fdout;
 565      noeol = TT.noeol;
 566
 567      // We save filehandle and newline status before filename
 568      name = command->w + (char *)command;
 569      memcpy(&TT.fdout, name, 4);
 570      name += 4;
 571      TT.noeol = *(name++);
 572
 573      // write, then save/restore context
 574      if (emit(line, len, eol))
 575        perror_exit("w '%s'", command->arg1+(char *)command);
 576      *(--name) = TT.noeol;
 577      TT.noeol = noeol;
 578      TT.fdout = fd;
 579    } else if (c=='x') {
 580      long swap = TT.rememberlen;
 581
 582      str = TT.remember;
 583      TT.remember = line;
 584      line = str;
 585      TT.rememberlen = len;
 586      len = swap;
 587    } else if (c=='y') {
 588      char *from, *to = (char *)command;
 589      int i, j;
 590
 591      from = to+command->arg1;
 592      to += command->arg2;
 593
 594      for (i = 0; i < len; i++) {
 595        j = stridx(from, line[i]);
 596        if (j != -1) line[i] = to[j];
 597      }
 598    } else if (c=='=') {
 599      sprintf(toybuf, "%ld", TT.count);
 600      emit(toybuf, strlen(toybuf), 1);
 601    }
 602
 603    command = command->next;
 604  }
 605
 606  if (line && !FLAG(n)) emit(line, len, eol);
 607
 608done:
 609  if (dlist_terminate(append)) while (append) {
 610    struct append *a = append->next;
 611
 612    if (append->file) {
 613      int fd = open(append->str, O_RDONLY);
 614
 615      // Force newline if noeol pending
 616      if (fd != -1) {
 617        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 618        TT.noeol = 0;
 619        xsendfile(fd, TT.fdout);
 620        close(fd);
 621      }
 622    } else if (append->str) emit(append->str, strlen(append->str), 1);
 623    else emit(line, 0, 0);
 624    free(append);
 625    append = a;
 626  }
 627  free(line);
 628}
 629
 630// Callback called on each input file
 631static void do_sed_file(int fd, char *name)
 632{
 633  char *tmp;
 634
 635  if (FLAG(i)) {
 636    struct sedcmd *command;
 637
 638    if (!fd) return error_msg("-i on stdin");
 639    TT.fdout = copy_tempfile(fd, name, &tmp);
 640    TT.count = 0;
 641    for (command = (void *)TT.pattern; command; command = command->next)
 642      command->hit = 0;
 643  }
 644  do_lines(fd, TT.delim, sed_line);
 645  if (FLAG(i)) {
 646    if (TT.i && *TT.i) {
 647      char *s = xmprintf("%s%s", name, TT.i);
 648
 649      xrename(name, s);
 650      free(s);
 651    }
 652    replace_tempfile(-1, TT.fdout, &tmp);
 653    TT.fdout = 1;
 654    TT.nextline = 0;
 655    TT.nextlen = TT.noeol = 0;
 656  }
 657}
 658
 659// Copy chunk of string between two delimiters, converting printf escapes.
 660// returns processed copy of string (0 if error), *pstr advances to next
 661// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 662// if regxex, ignore delimiter in [ranges]
 663static char *unescape_delimited_string(char **pstr, char *delim)
 664{
 665  char *to, *from, mode = 0, d;
 666
 667  // Grab leading delimiter (if necessary), allocate space for new string
 668  from = *pstr;
 669  if (!delim || !*delim) {
 670    if (!(d = *(from++))) return 0;
 671    if (d == '\\') d = *(from++);
 672    if (!d || d == '\\') return 0;
 673    if (delim) *delim = d;
 674  } else d = *delim;
 675  to = delim = xmalloc(strlen(*pstr)+1);
 676
 677  while (mode || *from != d) {
 678    if (!*from) return 0;
 679
 680    // delimiter in regex character range doesn't count
 681    if (*from == '[') {
 682      if (!mode) {
 683        mode = ']';
 684        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 685      } else if (mode == ']' && strchr(".=:", from[1])) {
 686        *(to++) = *(from++);
 687        mode = *from;
 688      }
 689    } else if (*from == mode) {
 690      if (mode == ']') mode = 0;
 691      else {
 692        *(to++) = *(from++);
 693        mode = ']';
 694      }
 695    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 696    // but the perl build does it, so we need to filter it out.
 697    } else if (mode && *from == '-' && from[-1] == from[1]) {
 698      from+=2;
 699      continue;
 700    } else if (*from == '\\') {
 701      if (!from[1]) return 0;
 702
 703      // Check escaped end delimiter before printf style escapes.
 704      if (from[1] == d) from++;
 705      else if (from[1]=='\\') *(to++) = *(from++);
 706      else {
 707        char c = unescape(from[1]);
 708
 709        if (c) {
 710          *(to++) = c;
 711          from+=2;
 712          continue;
 713        } else if (!mode) *(to++) = *(from++);
 714      }
 715    }
 716    *(to++) = *(from++);
 717  }
 718  *to = 0;
 719  *pstr = from+1;
 720
 721  return delim;
 722}
 723
 724// Translate pattern strings into command structures. Each command structure
 725// is a single allocation (which requires some math and remalloc at times).
 726static void parse_pattern(char **pline, long len)
 727{
 728  struct sedcmd *command = (void *)TT.pattern;
 729  char *line, *reg, c, *errstart;
 730  int i;
 731
 732  line = errstart = pline ? *pline : "";
 733  if (len && line[len-1]=='\n') line[--len] = 0;
 734
 735  // Append this line to previous multiline command? (hit indicates type.)
 736  // During parsing "hit" stores data about line continuations, but in
 737  // sed_line() it means the match range attached to this command
 738  // is active, so processing the continuation must zero it again.
 739  if (command && command->prev->hit) {
 740    // Remove half-finished entry from list so remalloc() doesn't confuse it
 741    TT.pattern = TT.pattern->prev;
 742    command = dlist_pop(&TT.pattern);
 743    c = command->c;
 744    reg = (char *)command;
 745    reg += command->arg1 + strlen(reg + command->arg1);
 746
 747    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 748    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 749    // a unicode character.
 750    if (command->hit < 256) goto resume_s;
 751    else goto resume_a;
 752  }
 753
 754  // Loop through commands in this line.
 755
 756  command = 0;
 757  for (;;) {
 758    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 759
 760    // If there's no more data on this line, return.
 761    for (;;) {
 762      while (isspace(*line) || *line == ';') line++;
 763      if (*line == '#') while (*line && *line != '\n') line++;
 764      else break;
 765    }
 766    if (!*line) return;
 767
 768    // We start by writing data into toybuf. Later we'll allocate the
 769    // ex
 770
 771    errstart = line;
 772    memset(toybuf, 0, sizeof(struct sedcmd));
 773    command = (void *)toybuf;
 774    reg = toybuf + sizeof(struct sedcmd);
 775
 776    // Parse address range (if any)
 777    for (i = 0; i < 2; i++) {
 778      if (*line == ',') line++;
 779      else if (i) break;
 780
 781      if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 782      else if (*line == '$') {
 783        command->lmatch[i] = -1;
 784        line++;
 785      } else if (*line == '/' || *line == '\\') {
 786        char *s = line;
 787
 788        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 789        if (!*s) command->rmatch[i] = 0;
 790        else {
 791          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
 792          command->rmatch[i] = reg-toybuf;
 793          reg += sizeof(regex_t);
 794        }
 795        free(s);
 796      } else break;
 797    }
 798
 799    while (isspace(*line)) line++;
 800    if (!*line) break;
 801
 802    while (*line == '!') {
 803      command->not = 1;
 804      line++;
 805    }
 806    while (isspace(*line)) line++;
 807
 808    c = command->c = *(line++);
 809    if (strchr("}:", c) && i) break;
 810    if (strchr("aiqr=", c) && i>1) break;
 811
 812    // Add step to pattern
 813    command = xmemdup(toybuf, reg-toybuf);
 814    reg = (reg-toybuf) + (char *)command;
 815
 816    // Parse arguments by command type
 817    if (c == '{') TT.nextlen++;
 818    else if (c == '}') {
 819      if (!TT.nextlen--) break;
 820    } else if (c == 's') {
 821      char *end, delim = 0;
 822
 823      // s/pattern/replacement/flags
 824
 825      // line continuations use arg1 (back at the start of the function),
 826      // so let's fill out arg2 first (since the regex part can't be multiple
 827      // lines) and swap them back later.
 828
 829      // get pattern (just record, we parse it later)
 830      command->arg2 = reg - (char *)command;
 831      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 832        goto error;
 833
 834      reg += sizeof(regex_t);
 835      command->arg1 = reg-(char *)command;
 836      command->hit = delim;
 837resume_s:
 838      // get replacement - don't replace escapes yet because \1 and \& need
 839      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 840      end = line;
 841      while (*end != command->hit) {
 842        if (!*end) goto error;
 843        if (*end++ == '\\') {
 844          if (!*end || *end == '\n') {
 845            end[-1] = '\n';
 846            break;
 847          }
 848          end++;
 849        }
 850      }
 851
 852      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 853      line = end;
 854      // line continuation? (note: '\n' can't be a valid delim).
 855      if (*line == command->hit) command->hit = 0;
 856      else {
 857        if (!*line) continue;
 858        reg--;
 859        line++;
 860        goto resume_s;
 861      }
 862
 863      // swap arg1/arg2 so they're back in order arguments occur.
 864      i = command->arg1;
 865      command->arg1 = command->arg2;
 866      command->arg2 = i;
 867
 868      // get flags
 869      for (line++; *line; line++) {
 870        long l;
 871
 872        if (isspace(*line) && *line != '\n') continue;
 873
 874        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 875        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 876          command->sflags |= l << 3;
 877          line--;
 878        } else break;
 879      }
 880
 881      // We deferred actually parsing the regex until we had the s///i flag
 882      // allocating the space was done by extend_string() above
 883      if (!*TT.remember) command->arg1 = 0;
 884      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 885        (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
 886      free(TT.remember);
 887      TT.remember = 0;
 888      if (*line == 'w') {
 889        line++;
 890        goto writenow;
 891      }
 892    } else if (c == 'w') {
 893      int fd, delim;
 894      char *cc;
 895
 896      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 897      // eol status, and to retain the filename for error messages, we'd need
 898      // to go up to arg5 just for this. Compromise: dynamically allocate the
 899      // filehandle and eol status.
 900
 901writenow:
 902      while (isspace(*line)) line++;
 903      if (!*line) goto error;
 904      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 905      delim = *cc;
 906      *cc = 0;
 907      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 908      *cc = delim;
 909
 910      command->w = reg - (char *)command;
 911      command = xrealloc(command, command->w+(cc-line)+6);
 912      reg = command->w + (char *)command;
 913
 914      memcpy(reg, &fd, 4);
 915      reg += 4;
 916      *(reg++) = 0;
 917      memcpy(reg, line, delim);
 918      reg += delim;
 919      *(reg++) = 0;
 920
 921      line = cc;
 922      if (delim) line += 2;
 923    } else if (c == 'y') {
 924      char *s, delim = 0;
 925      int len;
 926
 927      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 928      command->arg1 = reg-(char *)command;
 929      len = strlen(s);
 930      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 931      free(s);
 932      command->arg2 = reg-(char *)command;
 933      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 934      if (len != strlen(s)) goto error;
 935      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 936      free(s);
 937    } else if (strchr("abcirtTw:", c)) {
 938      int end;
 939
 940      // trim leading spaces
 941      while (isspace(*line) && *line != '\n') line++;
 942
 943      // Resume logic differs from 's' case because we don't add a newline
 944      // unless it's after something, so we add it on return instead.
 945resume_a:
 946      command->hit = 0;
 947
 948      // btT: end with space or semicolon, aicrw continue to newline.
 949      if (!(end = strcspn(line, strchr(":btT", c) ? "}; \t\r\n\v\f" : "\n"))) {
 950        // Argument's optional for btT
 951        if (strchr("btT", c)) continue;
 952        else if (!command->arg1) break;
 953      }
 954
 955      // Extend allocation to include new string. We use offsets instead of
 956      // pointers so realloc() moving stuff doesn't break things. Ok to write
 957      // \n over NUL terminator because call to extend_string() adds it back.
 958      if (!command->arg1) command->arg1 = reg - (char*)command;
 959      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 960      else if (!pline) {
 961        command->arg1 = 0;
 962        continue;
 963      }
 964      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 965
 966      // Recopy data to remove escape sequences and handle line continuation.
 967      if (strchr("aci", c)) {
 968        reg -= end+1;
 969        for (i = end; i; i--) {
 970          if ((*reg++ = *line++)=='\\') {
 971
 972            // escape at end of line: resume if -e escaped literal newline,
 973            // else request callback and resume with next line
 974            if (!--i) {
 975              *--reg = 0;
 976              if (*line) {
 977                line++;
 978                goto resume_a;
 979              }
 980              command->hit = 256;
 981              break;
 982            }
 983            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 984            line++;
 985          }
 986        }
 987        *reg = 0;
 988      } else line += end;
 989
 990    // Commands that take no arguments
 991    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
 992  }
 993
 994error:
 995  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 996}
 997
 998void sed_main(void)
 999{
1000  struct arg_list *al;
1001  char **args = toys.optargs;
1002
1003  if (!FLAG(z)) TT.delim = '\n';
1004
1005  // Lie to autoconf when it asks stupid questions, so configure regexes
1006  // that look for "GNU sed version %f" greater than some old buggy number
1007  // don't fail us for not matching their narrow expectations.
1008  if (FLAG(version)) {
1009    xprintf("This is not GNU sed version 9.0\n");
1010    return;
1011  }
1012
1013  // Handling our own --version means we handle our own --help too.
1014  if (FLAG(help)) help_exit(0);
1015
1016  // Parse pattern into commands.
1017
1018  // If no -e or -f, first argument is the pattern.
1019  if (!TT.e && !TT.f) {
1020    if (!*toys.optargs) error_exit("no pattern");
1021    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1022  }
1023
1024  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1025  // so handle all -e, then all -f. (At least the behavior's consistent.)
1026
1027  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1028  parse_pattern(0, 0);
1029  for (al = TT.f; al; al = al->next)
1030    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1031  dlist_terminate(TT.pattern);
1032  if (TT.nextlen) error_exit("no }");  
1033
1034  TT.fdout = 1;
1035  TT.remember = xstrdup("");
1036
1037  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1038  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1039
1040  // Provide EOF flush at end of cumulative input for non-i mode.
1041  if (!FLAG(i)) {
1042    toys.optflags |= FLAG_i;
1043    sed_line(0, 0);
1044  }
1045
1046  // todo: need to close fd when done for TOYBOX_FREE?
1047}
1048