toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12
  13USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  14
  15config SED
  16  bool "sed"
  17  default y
  18  help
  19    usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  20
  21    Stream editor. Apply one or more editing SCRIPTs to each line of input
  22    (from FILE or stdin) producing output (by default to stdout).
  23
  24    -e  add SCRIPT to list
  25    -f  add contents of SCRIPT_FILE to list
  26    -i  Edit each file in place
  27    -n  No default output (use the p command to output matched lines)
  28    -r  Use extended regular expression syntax
  29    -E  Alias for -r
  30    -s  Treat input files separately (implied by -i)
  31
  32    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  33    semicolons. All -e SCRIPTs are concatenated together as if separated
  34    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  35    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  36
  37    Each COMMAND may be preceded by an address which limits the command to
  38    apply only to the specified line(s). Commands without an address apply to
  39    every line. Addresses are of the form:
  40
  41      [ADDRESS[,ADDRESS]]COMMAND
  42
  43    The ADDRESS may be a decimal line number (starting at 1), a /regular
  44    expression/ within a pair of forward slashes, or the character "$" which
  45    matches the last line of input. (In -s or -i mode this matches the last
  46    line of each file, otherwise just the last line of the last file.) A single
  47    address matches one line, a pair of comma separated addresses match
  48    everything from the first address to the second address (inclusive). If
  49    both addresses are regular expressions, more than one range of lines in
  50    each file can match.
  51
  52    REGULAR EXPRESSIONS in sed are started and ended by the same character
  53    (traditionally / but anything except a backslash or a newline works).
  54    Backslashes may be used to escape the delimiter if it occurs in the
  55    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  56    and unicode). An empty regex repeats the previous one. ADDRESS regexes
  57    (above) require the first delimiter to be escaped with a backslash when
  58    it isn't a forward slash (to distinguish it from the COMMANDs below).
  59
  60    Sed mostly operates on individual lines one at a time. It reads each line,
  61    processes it, and either writes it to the output or discards it before
  62    reading the next line. Sed can remember one additional line in a separate
  63    buffer (using the h, H, g, G, and x commands), and can read the next line
  64    of input early (using the n and N command), but other than that command
  65    scripts operate on individual lines of text.
  66
  67    Each COMMAND starts with a single character. The following commands take
  68    no arguments:
  69
  70      {  Start a new command block, continuing until a corresponding "}".
  71         Command blocks may nest. If the block has an address, commands within
  72         the block are only run for lines within the block's address range.
  73
  74      }  End command block (this command cannot have an address)
  75
  76      d  Delete this line and move on to the next one
  77         (ignores remaining COMMANDs)
  78
  79      D  Delete one line of input and restart command SCRIPT (same as "d"
  80         unless you've glued lines together with "N" or similar)
  81
  82      g  Get remembered line (overwriting current line)
  83
  84      G  Get remembered line (appending to current line)
  85
  86      h  Remember this line (overwriting remembered line)
  87
  88      H  Remember this line (appending to remembered line, if any)
  89
  90      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  91         nonprintable characters, wrapping lines to terminal width with a
  92         backslash, and appending $ to actual end of line.
  93
  94      n  Print default output and read next line, replacing current line
  95         (If no next line available, quit processing script)
  96
  97      N  Append next line of input to this line, separated by a newline
  98         (This advances the line counter for address matching and "=", if no
  99         next line available quit processing script without default output)
 100
 101      p  Print this line
 102
 103      P  Print this line up to first newline (from "N")
 104
 105      q  Quit (print default output, no more commands processed or lines read)
 106
 107      x  Exchange this line with remembered line (overwrite in both directions)
 108
 109      =  Print the current line number (followed by a newline)
 110
 111    The following commands (may) take an argument. The "text" arguments (to
 112    the "a", "c", and "i" commands) may end with an unescaped "\" to append
 113    the next line (for which leading whitespace is not skipped), and also
 114    treat ";" as a literal character (use "\;" instead).
 115
 116      a [text]   Append text to output before attempting to read next line
 117
 118      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 119
 120      c [text]   Delete line, output text at end of matching address range
 121                 (ignores remaining COMMANDs)
 122
 123      i [text]   Print text
 124
 125      r [file]   Append contents of file to output before attempting to read
 126                 next line.
 127
 128      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 129                 The first character after the "s" (anything but newline or
 130                 backslash) is the delimiter, escape with \ to use normally.
 131
 132                 The replacement text may contain "&" to substitute the matched
 133                 text (escape it with backslash for a literal &), or \1 through
 134                 \9 to substitute a parenthetical subexpression in the regex.
 135                 You can also use the normal backslash escapes such as \n and
 136                 a backslash at the end of the line appends the next line.
 137
 138                 The flags are:
 139
 140                 [0-9]    A number, substitute only that occurrence of pattern
 141                 g        Global, substitute all occurrences of pattern
 142                 i        Ignore case when matching
 143                 p        Print the line if match was found and replaced
 144                 w [file] Write (append) line to file if match replaced
 145
 146      t [label]  Test, jump to :label only if an "s" command found a match in
 147                 this line since last test (replacing with same text counts)
 148
 149      T [label]  Test false, jump only if "s" hasn't found a match.
 150
 151      w [file]   Write (append) line to file
 152
 153      y/old/new/ Change each character in 'old' to corresponding character
 154                 in 'new' (with standard backslash escapes, delimiter can be
 155                 any repeated character except \ or \n)
 156
 157      : [label]  Labeled target for jump commands
 158
 159      #  Comment, ignore rest of this line of SCRIPT
 160
 161    Deviations from posix: allow extended regular expressions with -r,
 162    editing in place with -i, separate with -s, printf escapes in text, line
 163    continuations, semicolons after all commands, 2-address anywhere an
 164    address is allowed, "T" command, multiline continuations for [aci],
 165    \; to end [aci] argument before end of line.
 166*/
 167
 168#define FOR_sed
 169#include "toys.h"
 170
 171GLOBALS(
 172  struct arg_list *f;
 173  struct arg_list *e;
 174
 175  // processed pattern list
 176  struct double_list *pattern;
 177
 178  char *nextline, *remember;
 179  void *restart, *lastregex;
 180  long nextlen, rememberlen, count;
 181  int fdout, noeol;
 182  unsigned xx;
 183)
 184
 185// Linked list of parsed sed commands. Offset fields indicate location where
 186// regex or string starts, ala offset+(char *)struct, because we remalloc()
 187// these to expand them for multiline inputs, and pointers would have to be
 188// individually adjusted.
 189
 190struct sedcmd {
 191  struct sedcmd *next, *prev;
 192
 193  // Begin and end of each match
 194  long lmatch[2]; // line number of match
 195  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 196  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 197  unsigned not, hit;
 198  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 199  char c; // action
 200};
 201
 202// Write out line with potential embedded NUL, handling eol/noeol
 203static int emit(char *line, long len, int eol)
 204{
 205  int l, old = line[len];
 206
 207  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 208  TT.noeol = !eol;
 209  if (eol) line[len++] = '\n';
 210  if (!len) return 0;
 211  l = writeall(TT.fdout, line, len);
 212  if (eol) line[len-1] = old;
 213  if (l != len) {
 214    perror_msg("short write");
 215
 216    return 1;
 217  }
 218
 219  return 0;
 220}
 221
 222// Extend allocation to include new string, with newline between if newlen<0
 223
 224static char *extend_string(char **old, char *new, int oldlen, int newlen)
 225{
 226  int newline = newlen < 0;
 227  char *s;
 228
 229  if (newline) newlen = -newlen;
 230  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 231  if (newline) s[oldlen++] = '\n';
 232  memcpy(s+oldlen, new, newlen);
 233  s[oldlen+newlen] = 0;
 234
 235  return s+oldlen+newlen+1;
 236}
 237
 238// An empty regex repeats the previous one
 239static void *get_regex(void *trump, int offset)
 240{
 241  if (!offset) {
 242    if (!TT.lastregex) error_exit("no previous regex");
 243    return TT.lastregex;
 244  }
 245
 246  return TT.lastregex = offset+(char *)trump;
 247}
 248
 249// Apply pattern to line from input file
 250static void process_line(char **pline, long plen)
 251{
 252  struct append {
 253    struct append *next, *prev;
 254    int file;
 255    char *str;
 256  } *append = 0;
 257  char *line = TT.nextline;
 258  long len = TT.nextlen;
 259  struct sedcmd *command;
 260  int eol = 0, tea = 0;
 261
 262  // Grab next line for deferred processing (EOF detection: we get a NULL
 263  // pline at EOF to flush last line). Note that only end of _last_ input
 264  // file matches $ (unless we're doing -i).
 265  TT.nextline = 0;
 266  TT.nextlen = 0;
 267  if (pline) {
 268    TT.nextline = *pline;
 269    TT.nextlen = plen;
 270    *pline = 0;
 271  }
 272
 273  if (!line || !len) return;
 274  if (line[len-1] == '\n') line[--len] = eol++;
 275  TT.count++;
 276
 277  // The restart-1 is because we added one to make sure it wasn't NULL,
 278  // otherwise N as last command would restart script
 279  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 280  TT.restart = 0;
 281
 282  while (command) {
 283    char *str, c = command->c;
 284
 285    // Have we got a line or regex matching range for this rule?
 286    if (*command->lmatch || *command->rmatch) {
 287      int miss = 0;
 288      long lm;
 289
 290      // In a match that might end?
 291      if (command->hit) {
 292        if (!(lm = command->lmatch[1])) {
 293          if (!command->rmatch[1]) command->hit = 0;
 294          else {
 295            void *rm = get_regex(command, command->rmatch[1]);
 296
 297            // regex match end includes matching line, so defer deactivation
 298            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 299          }
 300        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 301
 302      // Start a new match?
 303      } else {
 304        if (!(lm = *command->lmatch)) {
 305          void *rm = get_regex(command, *command->rmatch);
 306
 307          if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
 308        } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
 309
 310        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 311      } 
 312
 313      // Didn't match?
 314      lm = !(command->hit ^ command->not);
 315
 316      // Deferred disable from regex end match
 317      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 318
 319      if (lm) {
 320        // Handle skipping curly bracket command group
 321        if (c == '{') {
 322          int curly = 1;
 323
 324          while (curly) {
 325            command = command->next;
 326            if (command->c == '{') curly++;
 327            if (command->c == '}') curly--;
 328          }
 329        }
 330        command = command->next;
 331        continue;
 332      }
 333    }
 334
 335    // A deleted line can still update line match state for later commands
 336    if (!line) {
 337      command = command->next;
 338      continue;
 339    }
 340
 341    // Process command
 342
 343    if (c=='a' || c=='r') {
 344      struct append *a = xzalloc(sizeof(struct append));
 345      if (command->arg1) a->str = command->arg1+(char *)command;
 346      a->file = c=='r';
 347      dlist_add_nomalloc((void *)&append, (void *)a);
 348    } else if (c=='b' || c=='t' || c=='T') {
 349      int t = tea;
 350
 351      if (c != 'b') tea = 0;
 352      if (c=='b' || t^(c=='T')) {
 353        if (!command->arg1) break;
 354        str = command->arg1+(char *)command;
 355        for (command = (void *)TT.pattern; command; command = command->next)
 356          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 357            break;
 358        if (!command) error_exit("no :%s", str);
 359      }
 360    } else if (c=='c') {
 361      str = command->arg1+(char *)command;
 362      if (!command->hit) emit(str, strlen(str), 1);
 363      free(line);
 364      line = 0;
 365      continue;
 366    } else if (c=='d') {
 367      free(line);
 368      line = 0;
 369      continue;
 370    } else if (c=='D') {
 371      // Delete up to \n or end of buffer
 372      str = line;
 373      while ((str-line)<len) if (*(str++) == '\n') break;
 374      len -= str - line;
 375      memmove(line, str, len);
 376
 377      // if "delete" blanks line, disable further processing
 378      // otherwise trim and restart script
 379      if (!len) {
 380        free(line);
 381        line = 0;
 382      } else {
 383        line[len] = 0;
 384        command = (void *)TT.pattern;
 385      }
 386      continue;
 387    } else if (c=='g') {
 388      free(line);
 389      line = xstrdup(TT.remember);
 390      len = TT.rememberlen;
 391    } else if (c=='G') {
 392      line = xrealloc(line, len+TT.rememberlen+2);
 393      line[len++] = '\n';
 394      memcpy(line+len, TT.remember, TT.rememberlen);
 395      line[len += TT.rememberlen] = 0;
 396    } else if (c=='h') {
 397      free(TT.remember);
 398      TT.remember = xstrdup(line);
 399      TT.rememberlen = len;
 400    } else if (c=='H') {
 401      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 402      TT.remember[TT.rememberlen++] = '\n';
 403      memcpy(TT.remember+TT.rememberlen, line, len);
 404      TT.remember[TT.rememberlen += len] = 0;
 405    } else if (c=='i') {
 406      str = command->arg1+(char *)command;
 407      emit(str, strlen(str), 1);
 408    } else if (c=='l') {
 409      int i, x, off;
 410
 411      if (!TT.xx) {
 412        terminal_size(&TT.xx, 0);
 413        if (!TT.xx) TT.xx = 80;
 414        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 415        if (TT.xx > 4) TT.xx -= 4;
 416      }
 417
 418      for (i = off = 0; i<len; i++) {
 419        if (off >= TT.xx) {
 420          toybuf[off++] = '\\';
 421          emit(toybuf, off, 1);
 422          off = 0;
 423        }
 424        x = stridx("\\\a\b\f\r\t\v", line[i]);
 425        if (x != -1) {
 426          toybuf[off++] = '\\';
 427          toybuf[off++] = "\\abfrtv"[x];
 428        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 429        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 430      }
 431      toybuf[off++] = '$';
 432      emit(toybuf, off, 1);
 433    } else if (c=='n') {
 434      TT.restart = command->next+1;
 435
 436      break;
 437    } else if (c=='N') {
 438      // Can't just grab next line because we could have multiple N and
 439      // we need to actually read ahead to get N;$p EOF detection right.
 440      if (pline) {
 441        TT.restart = command->next+1;
 442        extend_string(&line, TT.nextline, len, -TT.nextlen);
 443        free(TT.nextline);
 444        TT.nextline = line;
 445        TT.nextlen += len + 1;
 446        line = 0;
 447      }
 448
 449      // Pending append goes out right after N
 450      goto done; 
 451    } else if (c=='p' || c=='P') {
 452      char *l = (c=='P') ? strchr(line, '\n') : 0;
 453
 454      if (emit(line, l ? l-line : len, eol)) break;
 455    } else if (c=='q') {
 456      if (pline) *pline = (void *)1;
 457      free(TT.nextline);
 458      TT.nextline = 0;
 459      TT.nextlen = 0;
 460
 461      break;
 462    } else if (c=='s') {
 463      char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
 464      regmatch_t *match = (void *)toybuf;
 465      regex_t *reg = get_regex(command, command->arg1);
 466      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
 467
 468      // Find match in remaining line (up to remaining len)
 469      while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
 470        mflags = REG_NOTBOL;
 471
 472        // Zero length matches don't count immediately after a previous match
 473        mlen = match[0].rm_eo-match[0].rm_so;
 474        if (!mlen && !zmatch) {
 475          if (!rlen--) break;
 476          rline++;
 477          zmatch++;
 478          continue;
 479        } else zmatch = 0;
 480
 481        // If we're replacing only a specific match, skip if this isn't it
 482        off = command->sflags>>3;
 483        if (off && off != ++count) {
 484          rline += match[0].rm_eo;
 485          rlen -= match[0].rm_eo;
 486
 487          continue;
 488        }
 489        // The fact getline() can allocate unbounded amounts of memory is
 490        // a bigger issue, but while we're here check for integer overflow
 491        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 492
 493        // newlen = strlen(new) but with \1 and & and printf escapes
 494        for (off = newlen = 0; new[off]; off++) {
 495          int cc = -1;
 496
 497          if (new[off] == '&') cc = 0;
 498          else if (new[off] == '\\') cc = new[++off] - '0';
 499          if (cc < 0 || cc > 9) {
 500            newlen++;
 501            continue;
 502          }
 503          newlen += match[cc].rm_eo-match[cc].rm_so;
 504        }
 505
 506        // Allocate new size, copy start/end around match. (Can't extend in
 507        // place because backrefs may refer to text after it's overwritten.)
 508        len += newlen-mlen;
 509        swap = xmalloc(len+1);
 510        rswap = swap+(rline-line)+match[0].rm_so;
 511        memcpy(swap, line, (rline-line)+match[0].rm_so);
 512        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
 513
 514        // copy in new replacement text
 515        for (off = mlen = 0; new[off]; off++) {
 516          int cc = 0, ll;
 517
 518          if (new[off] == '\\') {
 519            cc = new[++off] - '0';
 520            if (cc<0 || cc>9) {
 521              if (!(rswap[mlen++] = unescape(new[off])))
 522                rswap[mlen-1] = new[off];
 523
 524              continue;
 525            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
 526          } else if (new[off] != '&') {
 527            rswap[mlen++] = new[off];
 528
 529            continue;
 530          }
 531
 532          ll = match[cc].rm_eo-match[cc].rm_so;
 533          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
 534          mlen += ll;
 535        }
 536
 537        rline = rswap+newlen;
 538        free(line);
 539        line = swap;
 540
 541        // Stop after first substitution unless we have flag g
 542        if (!(command->sflags & 2)) break;
 543      }
 544
 545      if (mflags) {
 546        // flag p
 547        if (command->sflags & 4) emit(line, len, eol);
 548
 549        tea = 1;
 550        if (command->w) goto writenow;
 551      }
 552    } else if (c=='w') {
 553      int fd, noeol;
 554      char *name;
 555
 556writenow:
 557      // Swap out emit() context
 558      fd = TT.fdout;
 559      noeol = TT.noeol;
 560
 561      // We save filehandle and newline status before filename
 562      name = command->w + (char *)command;
 563      memcpy(&TT.fdout, name, 4);
 564      name += 4;
 565      TT.noeol = *(name++);
 566
 567      // write, then save/restore context
 568      if (emit(line, len, eol))
 569        perror_exit("w '%s'", command->arg1+(char *)command);
 570      *(--name) = TT.noeol;
 571      TT.noeol = noeol;
 572      TT.fdout = fd;
 573    } else if (c=='x') {
 574      long swap = TT.rememberlen;
 575
 576      str = TT.remember;
 577      TT.remember = line;
 578      line = str;
 579      TT.rememberlen = len;
 580      len = swap;
 581    } else if (c=='y') {
 582      char *from, *to = (char *)command;
 583      int i, j;
 584
 585      from = to+command->arg1;
 586      to += command->arg2;
 587
 588      for (i = 0; i < len; i++) {
 589        j = stridx(from, line[i]);
 590        if (j != -1) line[i] = to[j];
 591      }
 592    } else if (c=='=') {
 593      sprintf(toybuf, "%ld", TT.count);
 594      emit(toybuf, strlen(toybuf), 1);
 595    }
 596
 597    command = command->next;
 598  }
 599
 600  if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
 601
 602done:
 603  if (dlist_terminate(append)) while (append) {
 604    struct append *a = append->next;
 605
 606    if (append->file) {
 607      int fd = open(append->str, O_RDONLY);
 608
 609      // Force newline if noeol pending
 610      if (fd != -1) {
 611        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 612        TT.noeol = 0;
 613        xsendfile(fd, TT.fdout);
 614        close(fd);
 615      }
 616    } else if (append->str) emit(append->str, strlen(append->str), 1);
 617    else emit(line, 0, 0);
 618    free(append);
 619    append = a;
 620  }
 621  free(line);
 622}
 623
 624// Callback called on each input file
 625static void do_sed(int fd, char *name)
 626{
 627  int i = toys.optflags & FLAG_i;
 628  char *tmp;
 629
 630  if (i) {
 631    struct sedcmd *command;
 632
 633    if (!fd) {
 634      error_msg("-i on stdin");
 635      return;
 636    }
 637    TT.fdout = copy_tempfile(fd, name, &tmp);
 638    TT.count = 0;
 639    for (command = (void *)TT.pattern; command; command = command->next)
 640      command->hit = 0;
 641  }
 642  do_lines(fd, process_line);
 643  if (i) {
 644    process_line(0, 0);
 645    replace_tempfile(-1, TT.fdout, &tmp);
 646    TT.fdout = 1;
 647    TT.nextline = 0;
 648    TT.nextlen = TT.noeol = 0;
 649  }
 650}
 651
 652// Copy chunk of string between two delimiters, converting printf escapes.
 653// returns processed copy of string (0 if error), *pstr advances to next
 654// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 655// if regxex, ignore delimiter in [ranges]
 656static char *unescape_delimited_string(char **pstr, char *delim)
 657{
 658  char *to, *from, mode = 0, d;
 659
 660  // Grab leading delimiter (if necessary), allocate space for new string
 661  from = *pstr;
 662  if (!delim || !*delim) {
 663    if (!(d = *(from++))) return 0;
 664    if (d == '\\') d = *(from++);
 665    if (!d || d == '\\') return 0;
 666    if (delim) *delim = d;
 667  } else d = *delim;
 668  to = delim = xmalloc(strlen(*pstr)+1);
 669
 670  while (mode || *from != d) {
 671    if (!*from) return 0;
 672
 673    // delimiter in regex character range doesn't count
 674    if (*from == '[') {
 675      if (!mode) {
 676        mode = ']';
 677        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 678      } else if (mode == ']' && strchr(".=:", from[1])) {
 679        *(to++) = *(from++);
 680        mode = *from;
 681      }
 682    } else if (*from == mode) {
 683      if (mode == ']') mode = 0;
 684      else {
 685        *(to++) = *(from++);
 686        mode = ']';
 687      }
 688    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 689    // but the perl build does it, so we need to filter it out.
 690    } else if (mode && *from == '-' && from[-1] == from[1]) {
 691      from+=2;
 692      continue;
 693    } else if (*from == '\\') {
 694      if (!from[1]) return 0;
 695
 696      // Check escaped end delimiter before printf style escapes.
 697      if (from[1] == d) from++;
 698      else if (from[1]=='\\') *(to++) = *(from++);
 699      else {
 700        char c = unescape(from[1]);
 701
 702        if (c) {
 703          *(to++) = c;
 704          from+=2;
 705          continue;
 706        } else if (!mode) *(to++) = *(from++);
 707      }
 708    }
 709    *(to++) = *(from++);
 710  }
 711  *to = 0;
 712  *pstr = from+1;
 713
 714  return delim;
 715}
 716
 717// Translate pattern strings into command structures. Each command structure
 718// is a single allocation (which requires some math and remalloc at times).
 719static void parse_pattern(char **pline, long len)
 720{
 721  struct sedcmd *command = (void *)TT.pattern;
 722  char *line, *reg, c, *errstart;
 723  int i;
 724
 725  line = errstart = pline ? *pline : "";
 726  if (len && line[len-1]=='\n') line[--len] = 0;
 727
 728  // Append this line to previous multiline command? (hit indicates type.)
 729  // During parsing "hit" stores data about line continuations, but in
 730  // process_line() it means the match range attached to this command
 731  // is active, so processing the continuation must zero it again.
 732  if (command && command->prev->hit) {
 733    // Remove half-finished entry from list so remalloc() doesn't confuse it
 734    TT.pattern = TT.pattern->prev;
 735    command = dlist_pop(&TT.pattern);
 736    c = command->c;
 737    reg = (char *)command;
 738    reg += command->arg1 + strlen(reg + command->arg1);
 739
 740    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 741    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 742    // a unicode character.
 743    if (command->hit < 256) goto resume_s;
 744    else goto resume_a;
 745  }
 746
 747  // Loop through commands in this line.
 748
 749  command = 0;
 750  for (;;) {
 751    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 752
 753    // If there's no more data on this line, return.
 754    for (;;) {
 755      while (isspace(*line) || *line == ';') line++;
 756      if (*line == '#') while (*line && *line != '\n') line++;
 757      else break;
 758    }
 759    if (!*line) return;
 760
 761    // We start by writing data into toybuf. Later we'll allocate the
 762    // ex
 763
 764    errstart = line;
 765    memset(toybuf, 0, sizeof(struct sedcmd));
 766    command = (void *)toybuf;
 767    reg = toybuf + sizeof(struct sedcmd);
 768
 769    // Parse address range (if any)
 770    for (i = 0; i < 2; i++) {
 771      if (*line == ',') line++;
 772      else if (i) break;
 773
 774      if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 775      else if (*line == '$') {
 776        command->lmatch[i] = -1;
 777        line++;
 778      } else if (*line == '/' || *line == '\\') {
 779        char *s = line;
 780
 781        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 782        if (!*s) command->rmatch[i] = 0;
 783        else {
 784          xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
 785          command->rmatch[i] = reg-toybuf;
 786          reg += sizeof(regex_t);
 787        }
 788        free(s);
 789      } else break;
 790    }
 791
 792    while (isspace(*line)) line++;
 793    if (!*line) break;
 794
 795    while (*line == '!') {
 796      command->not = 1;
 797      line++;
 798    }
 799    while (isspace(*line)) line++;
 800
 801    c = command->c = *(line++);
 802    if (strchr("}:", c) && i) break;
 803    if (strchr("aiqr=", c) && i>1) break;
 804
 805    // Add step to pattern
 806    command = xmemdup(toybuf, reg-toybuf);
 807    reg = (reg-toybuf) + (char *)command;
 808
 809    // Parse arguments by command type
 810    if (c == '{') TT.nextlen++;
 811    else if (c == '}') {
 812      if (!TT.nextlen--) break;
 813    } else if (c == 's') {
 814      char *end, delim = 0;
 815
 816      // s/pattern/replacement/flags
 817
 818      // line continuations use arg1 (back at the start of the function),
 819      // so let's fill out arg2 first (since the regex part can't be multiple
 820      // lines) and swap them back later.
 821
 822      // get pattern (just record, we parse it later)
 823      command->arg2 = reg - (char *)command;
 824      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 825        goto error;
 826
 827      reg += sizeof(regex_t);
 828      command->arg1 = reg-(char *)command;
 829      command->hit = delim;
 830resume_s:
 831      // get replacement - don't replace escapes yet because \1 and \& need
 832      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 833      end = line;
 834      while (*end != command->hit) {
 835        if (!*end) goto error;
 836        if (*end++ == '\\') {
 837          if (!*end || *end == '\n') {
 838            end[-1] = '\n';
 839            break;
 840          }
 841          end++;
 842        }
 843      }
 844
 845      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 846      line = end;
 847      // line continuation? (note: '\n' can't be a valid delim).
 848      if (*line == command->hit) command->hit = 0;
 849      else {
 850        if (!*line) continue;
 851        reg--;
 852        line++;
 853        goto resume_s;
 854      }
 855
 856      // swap arg1/arg2 so they're back in order arguments occur.
 857      i = command->arg1;
 858      command->arg1 = command->arg2;
 859      command->arg2 = i;
 860
 861      // get flags
 862      for (line++; *line; line++) {
 863        long l;
 864
 865        if (isspace(*line) && *line != '\n') continue;
 866
 867        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 868        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 869          command->sflags |= l << 3;
 870          line--;
 871        } else break;
 872      }
 873
 874      // We deferred actually parsing the regex until we had the s///i flag
 875      // allocating the space was done by extend_string() above
 876      if (!*TT.remember) command->arg1 = 0;
 877      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 878        ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
 879      free(TT.remember);
 880      TT.remember = 0;
 881      if (*line == 'w') {
 882        line++;
 883        goto writenow;
 884      }
 885    } else if (c == 'w') {
 886      int fd, delim;
 887      char *cc;
 888
 889      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 890      // eol status, and to retain the filename for error messages, we'd need
 891      // to go up to arg5 just for this. Compromise: dynamically allocate the
 892      // filehandle and eol status.
 893
 894writenow:
 895      while (isspace(*line)) line++;
 896      if (!*line) goto error;
 897      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 898      delim = *cc;
 899      *cc = 0;
 900      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 901      *cc = delim;
 902
 903      command->w = reg - (char *)command;
 904      command = xrealloc(command, command->w+(cc-line)+6);
 905      reg = command->w + (char *)command;
 906
 907      memcpy(reg, &fd, 4);
 908      reg += 4;
 909      *(reg++) = 0;
 910      memcpy(reg, line, delim);
 911      reg += delim;
 912      *(reg++) = 0;
 913
 914      line = cc;
 915      if (delim) line += 2;
 916    } else if (c == 'y') {
 917      char *s, delim = 0;
 918      int len;
 919
 920      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 921      command->arg1 = reg-(char *)command;
 922      len = strlen(s);
 923      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 924      free(s);
 925      command->arg2 = reg-(char *)command;
 926      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 927      if (len != strlen(s)) goto error;
 928      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 929      free(s);
 930    } else if (strchr("abcirtTw:", c)) {
 931      int end;
 932
 933      // trim leading spaces
 934      while (isspace(*line) && *line != '\n') line++;
 935
 936      // Resume logic differs from 's' case because we don't add a newline
 937      // unless it's after something, so we add it on return instead.
 938resume_a:
 939      command->hit = 0;
 940
 941      // btT: end with space or semicolon, aicrw continue to newline.
 942      if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
 943        // Argument's optional for btT
 944        if (strchr("btT", c)) continue;
 945        else if (!command->arg1) break;
 946      }
 947
 948      // Extend allocation to include new string. We use offsets instead of
 949      // pointers so realloc() moving stuff doesn't break things. Ok to write
 950      // \n over NUL terminator because call to extend_string() adds it back.
 951      if (!command->arg1) command->arg1 = reg - (char*)command;
 952      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 953      else if (!pline) {
 954        command->arg1 = 0;
 955        continue;
 956      }
 957      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 958
 959      // Recopy data to remove escape sequences and handle line continuation.
 960      if (strchr("aci", c)) {
 961        reg -= end+1;
 962        for (i = end; i; i--) {
 963          if ((*reg++ = *line++)=='\\') {
 964
 965            // escape at end of line: resume if -e escaped literal newline,
 966            // else request callback and resume with next line
 967            if (!--i) {
 968              *--reg = 0;
 969              if (*line) {
 970                line++;
 971                goto resume_a;
 972              }
 973              command->hit = 256;
 974              break;
 975            }
 976            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 977            line++;
 978          }
 979        }
 980        *reg = 0;
 981      } else line += end;
 982
 983    // Commands that take no arguments
 984    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
 985  }
 986
 987error:
 988  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 989}
 990
 991void sed_main(void)
 992{
 993  struct arg_list *al;
 994  char **args = toys.optargs;
 995
 996  // Lie to autoconf when it asks stupid questions, so configure regexes
 997  // that look for "GNU sed version %f" greater than some old buggy number
 998  // don't fail us for not matching their narrow expectations.
 999  if (toys.optflags & FLAG_version) {
1000    xprintf("This is not GNU sed version 9.0\n");
1001    return;
1002  }
1003
1004  // Handling our own --version means we handle our own --help too.
1005  if (toys.optflags&FLAG_help) help_exit(0);
1006
1007  // Parse pattern into commands.
1008
1009  // If no -e or -f, first argument is the pattern.
1010  if (!TT.e && !TT.f) {
1011    if (!*toys.optargs) error_exit("no pattern");
1012    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1013  }
1014
1015  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1016  // so handle all -e, then all -f. (At least the behavior's consistent.)
1017
1018  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1019  for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1020  parse_pattern(0, 0);
1021  dlist_terminate(TT.pattern);
1022  if (TT.nextlen) error_exit("no }");  
1023
1024  TT.fdout = 1;
1025  TT.remember = xstrdup("");
1026
1027  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1028  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed);
1029
1030  if (!(toys.optflags & FLAG_i)) process_line(0, 0);
1031
1032  // todo: need to close fd when done for TOYBOX_FREE?
1033}
1034