toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13
  14USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  15
  16config SED
  17  bool "sed"
  18  default y
  19  help
  20    usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  21
  22    Stream editor. Apply one or more editing SCRIPTs to each line of input
  23    (from FILE or stdin) producing output (by default to stdout).
  24
  25    -e  add SCRIPT to list
  26    -f  add contents of SCRIPT_FILE to list
  27    -i  Edit each file in place
  28    -n  No default output (use the p command to output matched lines)
  29    -r  Use extended regular expression syntax
  30    -E  Alias for -r
  31    -s  Treat input files separately (implied by -i)
  32
  33    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  34    semicolons. All -e SCRIPTs are concatenated together as if separated
  35    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  36    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  37
  38    Each COMMAND may be preceded by an address which limits the command to
  39    apply only to the specified line(s). Commands without an address apply to
  40    every line. Addresses are of the form:
  41
  42      [ADDRESS[,ADDRESS]]COMMAND
  43
  44    The ADDRESS may be a decimal line number (starting at 1), a /regular
  45    expression/ within a pair of forward slashes, or the character "$" which
  46    matches the last line of input. (In -s or -i mode this matches the last
  47    line of each file, otherwise just the last line of the last file.) A single
  48    address matches one line, a pair of comma separated addresses match
  49    everything from the first address to the second address (inclusive). If
  50    both addresses are regular expressions, more than one range of lines in
  51    each file can match.
  52
  53    REGULAR EXPRESSIONS in sed are started and ended by the same character
  54    (traditionally / but anything except a backslash or a newline works).
  55    Backslashes may be used to escape the delimiter if it occurs in the
  56    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  57    and unicode). An empty regex repeats the previous one. ADDRESS regexes
   58    (above) require the first delimiter to be escaped with a backslash when
  59    it isn't a forward slash (to distinguish it from the COMMANDs below).
  60
  61    Sed mostly operates on individual lines one at a time. It reads each line,
  62    processes it, and either writes it to the output or discards it before
  63    reading the next line. Sed can remember one additional line in a separate
  64    buffer (using the h, H, g, G, and x commands), and can read the next line
  65    of input early (using the n and N command), but other than that command
  66    scripts operate on individual lines of text.
  67
  68    Each COMMAND starts with a single character. The following commands take
  69    no arguments:
  70
  71      {  Start a new command block, continuing until a corresponding "}".
  72         Command blocks may nest. If the block has an address, commands within
  73         the block are only run for lines within the block's address range.
  74
  75      }  End command block (this command cannot have an address)
  76
  77      d  Delete this line and move on to the next one
  78         (ignores remaining COMMANDs)
  79
  80      D  Delete one line of input and restart command SCRIPT (same as "d"
  81         unless you've glued lines together with "N" or similar)
  82
  83      g  Get remembered line (overwriting current line)
  84
  85      G  Get remembered line (appending to current line)
  86
  87      h  Remember this line (overwriting remembered line)
  88
  89      H  Remember this line (appending to remembered line, if any)
  90
  91      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  92         nonprintable characters, wrapping lines to terminal width with a
  93         backslash, and appending $ to actual end of line.
  94
  95      n  Print default output and read next line, replacing current line
  96         (If no next line available, quit processing script)
  97
  98      N  Append next line of input to this line, separated by a newline
  99         (This advances the line counter for address matching and "=", if no
 100         next line available quit processing script without default output)
 101
 102      p  Print this line
 103
 104      P  Print this line up to first newline (from "N")
 105
 106      q  Quit (print default output, no more commands processed or lines read)
 107
 108      x  Exchange this line with remembered line (overwrite in both directions)
 109
 110      =  Print the current line number (followed by a newline)
 111
 112    The following commands (may) take an argument. The "text" arguments (to
  113    the "a", "c", and "i" commands) may end with an unescaped "\" to append
 114    the next line (for which leading whitespace is not skipped), and also
 115    treat ";" as a literal character (use "\;" instead).
 116
 117      a [text]   Append text to output before attempting to read next line
 118
 119      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 120
 121      c [text]   Delete line, output text at end of matching address range
 122                 (ignores remaining COMMANDs)
 123
 124      i [text]   Print text
 125
 126      r [file]   Append contents of file to output before attempting to read
 127                 next line.
 128
 129      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 130                 The first character after the "s" (anything but newline or
 131                 backslash) is the delimiter, escape with \ to use normally.
 132
 133                 The replacement text may contain "&" to substitute the matched
 134                 text (escape it with backslash for a literal &), or \1 through
 135                 \9 to substitute a parenthetical subexpression in the regex.
 136                 You can also use the normal backslash escapes such as \n and
 137                 a backslash at the end of the line appends the next line.
 138
 139                 The flags are:
 140
 141                 [0-9]    A number, substitute only that occurrence of pattern
 142                 g        Global, substitute all occurrences of pattern
 143                 i        Ignore case when matching
 144                 p        Print the line if match was found and replaced
 145                 w [file] Write (append) line to file if match replaced
 146
 147      t [label]  Test, jump to :label only if an "s" command found a match in
 148                 this line since last test (replacing with same text counts)
 149
 150      T [label]  Test false, jump only if "s" hasn't found a match.
 151
 152      w [file]   Write (append) line to file
 153
 154      y/old/new/ Change each character in 'old' to corresponding character
 155                 in 'new' (with standard backslash escapes, delimiter can be
 156                 any repeated character except \ or \n)
 157
 158      : [label]  Labeled target for jump commands
 159
 160      #  Comment, ignore rest of this line of SCRIPT
 161
 162    Deviations from posix: allow extended regular expressions with -r,
 163    editing in place with -i, separate with -s, printf escapes in text, line
 164    continuations, semicolons after all commands, 2-address anywhere an
 165    address is allowed, "T" command, multiline continuations for [abc],
 166    \; to end [abc] argument before end of line.
 167*/
 168
 169#define FOR_sed
 170#include "toys.h"
 171
 172GLOBALS(
 173  struct arg_list *f;
 174  struct arg_list *e;
 175
 176  // processed pattern list
 177  struct double_list *pattern;
 178
 179  char *nextline, *remember;
 180  void *restart, *lastregex;
 181  long nextlen, rememberlen, count;
 182  int fdout, noeol;
 183  unsigned xx;
 184)
 185
 186// Linked list of parsed sed commands. Offset fields indicate location where
 187// regex or string starts, ala offset+(char *)struct, because we remalloc()
 188// these to expand them for multiline inputs, and pointers would have to be
 189// individually adjusted.
 190
 191struct sedcmd {
 192  struct sedcmd *next, *prev;
 193
 194  // Begin and end of each match
 195  long lmatch[2]; // line number of match
 196  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 197  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 198  unsigned not, hit;
 199  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 200  char c; // action
 201};
 202
 203// Write out line with potential embedded NUL, handling eol/noeol
 204static int emit(char *line, long len, int eol)
 205{
 206  int l, old = line[len];
 207
 208  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 209  TT.noeol = !eol;
 210  if (eol) line[len++] = '\n';
 211  if (!len) return 0;
 212  l = writeall(TT.fdout, line, len);
 213  if (eol) line[len-1] = old;
 214  if (l != len) {
 215    perror_msg("short write");
 216
 217    return 1;
 218  }
 219
 220  return 0;
 221}
 222
 223// Extend allocation to include new string, with newline between if newlen<0
 224
 225static char *extend_string(char **old, char *new, int oldlen, int newlen)
 226{
 227  int newline = newlen < 0;
 228  char *s;
 229
 230  if (newline) newlen = -newlen;
 231  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 232  if (newline) s[oldlen++] = '\n';
 233  memcpy(s+oldlen, new, newlen);
 234  s[oldlen+newlen] = 0;
 235
 236  return s+oldlen+newlen+1;
 237}
 238
 239// An empty regex repeats the previous one
 240static void *get_regex(void *trump, int offset)
 241{
 242  if (!offset) {
 243    if (!TT.lastregex) error_exit("no previous regex");
 244    return TT.lastregex;
 245  }
 246
 247  return TT.lastregex = offset+(char *)trump;
 248}
 249
 250// Apply pattern to line from input file
 251static void sed_line(char **pline, long plen)
 252{
 253  struct append {
 254    struct append *next, *prev;
 255    int file;
 256    char *str;
 257  } *append = 0;
 258  char *line = TT.nextline;
 259  long len = TT.nextlen;
 260  struct sedcmd *command;
 261  int eol = 0, tea = 0;
 262
 263  // Ignore EOF for all files before last unless -i
 264  if (!pline && !(toys.optflags&FLAG_i)) return;
 265
 266  // Grab next line for deferred processing (EOF detection: we get a NULL
 267  // pline at EOF to flush last line). Note that only end of _last_ input
 268  // file matches $ (unless we're doing -i).
 269  TT.nextline = 0;
 270  TT.nextlen = 0;
 271  if (pline) {
 272    TT.nextline = *pline;
 273    TT.nextlen = plen;
 274    *pline = 0;
 275  }
 276
 277  if (!line || !len) return;
 278  if (line[len-1] == '\n') line[--len] = eol++;
 279  TT.count++;
 280
 281  // The restart-1 is because we added one to make sure it wasn't NULL,
 282  // otherwise N as last command would restart script
 283  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 284  TT.restart = 0;
 285
 286  while (command) {
 287    char *str, c = command->c;
 288
 289    // Have we got a line or regex matching range for this rule?
 290    if (*command->lmatch || *command->rmatch) {
 291      int miss = 0;
 292      long lm;
 293
 294      // In a match that might end?
 295      if (command->hit) {
 296        if (!(lm = command->lmatch[1])) {
 297          if (!command->rmatch[1]) command->hit = 0;
 298          else {
 299            void *rm = get_regex(command, command->rmatch[1]);
 300
 301            // regex match end includes matching line, so defer deactivation
 302            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 303          }
 304        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 305
 306      // Start a new match?
 307      } else {
 308        if (!(lm = *command->lmatch)) {
 309          void *rm = get_regex(command, *command->rmatch);
 310
 311          if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
 312        } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
 313
 314        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 315      } 
 316
 317      // Didn't match?
 318      lm = !(command->hit ^ command->not);
 319
 320      // Deferred disable from regex end match
 321      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 322
 323      if (lm) {
 324        // Handle skipping curly bracket command group
 325        if (c == '{') {
 326          int curly = 1;
 327
 328          while (curly) {
 329            command = command->next;
 330            if (command->c == '{') curly++;
 331            if (command->c == '}') curly--;
 332          }
 333        }
 334        command = command->next;
 335        continue;
 336      }
 337    }
 338
 339    // A deleted line can still update line match state for later commands
 340    if (!line) {
 341      command = command->next;
 342      continue;
 343    }
 344
 345    // Process command
 346
 347    if (c=='a' || c=='r') {
 348      struct append *a = xzalloc(sizeof(struct append));
 349      if (command->arg1) a->str = command->arg1+(char *)command;
 350      a->file = c=='r';
 351      dlist_add_nomalloc((void *)&append, (void *)a);
 352    } else if (c=='b' || c=='t' || c=='T') {
 353      int t = tea;
 354
 355      if (c != 'b') tea = 0;
 356      if (c=='b' || t^(c=='T')) {
 357        if (!command->arg1) break;
 358        str = command->arg1+(char *)command;
 359        for (command = (void *)TT.pattern; command; command = command->next)
 360          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 361            break;
 362        if (!command) error_exit("no :%s", str);
 363      }
 364    } else if (c=='c') {
 365      str = command->arg1+(char *)command;
 366      if (!command->hit) emit(str, strlen(str), 1);
 367      free(line);
 368      line = 0;
 369      continue;
 370    } else if (c=='d') {
 371      free(line);
 372      line = 0;
 373      continue;
 374    } else if (c=='D') {
 375      // Delete up to \n or end of buffer
 376      str = line;
 377      while ((str-line)<len) if (*(str++) == '\n') break;
 378      len -= str - line;
 379      memmove(line, str, len);
 380
 381      // if "delete" blanks line, disable further processing
 382      // otherwise trim and restart script
 383      if (!len) {
 384        free(line);
 385        line = 0;
 386      } else {
 387        line[len] = 0;
 388        command = (void *)TT.pattern;
 389      }
 390      continue;
 391    } else if (c=='g') {
 392      free(line);
 393      line = xstrdup(TT.remember);
 394      len = TT.rememberlen;
 395    } else if (c=='G') {
 396      line = xrealloc(line, len+TT.rememberlen+2);
 397      line[len++] = '\n';
 398      memcpy(line+len, TT.remember, TT.rememberlen);
 399      line[len += TT.rememberlen] = 0;
 400    } else if (c=='h') {
 401      free(TT.remember);
 402      TT.remember = xstrdup(line);
 403      TT.rememberlen = len;
 404    } else if (c=='H') {
 405      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 406      TT.remember[TT.rememberlen++] = '\n';
 407      memcpy(TT.remember+TT.rememberlen, line, len);
 408      TT.remember[TT.rememberlen += len] = 0;
 409    } else if (c=='i') {
 410      str = command->arg1+(char *)command;
 411      emit(str, strlen(str), 1);
 412    } else if (c=='l') {
 413      int i, x, off;
 414
 415      if (!TT.xx) {
 416        terminal_size(&TT.xx, 0);
 417        if (!TT.xx) TT.xx = 80;
 418        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 419        if (TT.xx > 4) TT.xx -= 4;
 420      }
 421
 422      for (i = off = 0; i<len; i++) {
 423        if (off >= TT.xx) {
 424          toybuf[off++] = '\\';
 425          emit(toybuf, off, 1);
 426          off = 0;
 427        }
 428        x = stridx("\\\a\b\f\r\t\v", line[i]);
 429        if (x != -1) {
 430          toybuf[off++] = '\\';
 431          toybuf[off++] = "\\abfrtv"[x];
 432        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 433        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 434      }
 435      toybuf[off++] = '$';
 436      emit(toybuf, off, 1);
 437    } else if (c=='n') {
 438      TT.restart = command->next+1;
 439
 440      break;
 441    } else if (c=='N') {
 442      // Can't just grab next line because we could have multiple N and
 443      // we need to actually read ahead to get N;$p EOF detection right.
 444      if (pline) {
 445        TT.restart = command->next+1;
 446        extend_string(&line, TT.nextline, len, -TT.nextlen);
 447        free(TT.nextline);
 448        TT.nextline = line;
 449        TT.nextlen += len + 1;
 450        line = 0;
 451      }
 452
 453      // Pending append goes out right after N
 454      goto done; 
 455    } else if (c=='p' || c=='P') {
 456      char *l = (c=='P') ? strchr(line, '\n') : 0;
 457
 458      if (emit(line, l ? l-line : len, eol)) break;
 459    } else if (c=='q') {
 460      if (pline) *pline = (void *)1;
 461      free(TT.nextline);
 462      TT.nextline = 0;
 463      TT.nextlen = 0;
 464
 465      break;
 466    } else if (c=='s') {
 467      char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
 468      regmatch_t *match = (void *)toybuf;
 469      regex_t *reg = get_regex(command, command->arg1);
 470      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
 471
 472      // Find match in remaining line (up to remaining len)
 473      while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
 474        mflags = REG_NOTBOL;
 475
 476        // Zero length matches don't count immediately after a previous match
 477        mlen = match[0].rm_eo-match[0].rm_so;
 478        if (!mlen && !zmatch) {
 479          if (!rlen--) break;
 480          rline++;
 481          zmatch++;
 482          continue;
 483        } else zmatch = 0;
 484
 485        // If we're replacing only a specific match, skip if this isn't it
 486        off = command->sflags>>3;
 487        if (off && off != ++count) {
 488          rline += match[0].rm_eo;
 489          rlen -= match[0].rm_eo;
 490
 491          continue;
 492        }
 493        // The fact getline() can allocate unbounded amounts of memory is
 494        // a bigger issue, but while we're here check for integer overflow
 495        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 496
 497        // newlen = strlen(new) but with \1 and & and printf escapes
 498        for (off = newlen = 0; new[off]; off++) {
 499          int cc = -1;
 500
 501          if (new[off] == '&') cc = 0;
 502          else if (new[off] == '\\') cc = new[++off] - '0';
 503          if (cc < 0 || cc > 9) {
 504            newlen++;
 505            continue;
 506          }
 507          newlen += match[cc].rm_eo-match[cc].rm_so;
 508        }
 509
 510        // Allocate new size, copy start/end around match. (Can't extend in
 511        // place because backrefs may refer to text after it's overwritten.)
 512        len += newlen-mlen;
 513        swap = xmalloc(len+1);
 514        rswap = swap+(rline-line)+match[0].rm_so;
 515        memcpy(swap, line, (rline-line)+match[0].rm_so);
 516        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
 517
 518        // copy in new replacement text
 519        for (off = mlen = 0; new[off]; off++) {
 520          int cc = 0, ll;
 521
 522          if (new[off] == '\\') {
 523            cc = new[++off] - '0';
 524            if (cc<0 || cc>9) {
 525              if (!(rswap[mlen++] = unescape(new[off])))
 526                rswap[mlen-1] = new[off];
 527
 528              continue;
 529            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
 530          } else if (new[off] != '&') {
 531            rswap[mlen++] = new[off];
 532
 533            continue;
 534          }
 535
 536          ll = match[cc].rm_eo-match[cc].rm_so;
 537          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
 538          mlen += ll;
 539        }
 540
 541        rline = rswap+newlen;
 542        free(line);
 543        line = swap;
 544
 545        // Stop after first substitution unless we have flag g
 546        if (!(command->sflags & 2)) break;
 547      }
 548
 549      if (mflags) {
 550        // flag p
 551        if (command->sflags & 4) emit(line, len, eol);
 552
 553        tea = 1;
 554        if (command->w) goto writenow;
 555      }
 556    } else if (c=='w') {
 557      int fd, noeol;
 558      char *name;
 559
 560writenow:
 561      // Swap out emit() context
 562      fd = TT.fdout;
 563      noeol = TT.noeol;
 564
 565      // We save filehandle and newline status before filename
 566      name = command->w + (char *)command;
 567      memcpy(&TT.fdout, name, 4);
 568      name += 4;
 569      TT.noeol = *(name++);
 570
 571      // write, then save/restore context
 572      if (emit(line, len, eol))
 573        perror_exit("w '%s'", command->arg1+(char *)command);
 574      *(--name) = TT.noeol;
 575      TT.noeol = noeol;
 576      TT.fdout = fd;
 577    } else if (c=='x') {
 578      long swap = TT.rememberlen;
 579
 580      str = TT.remember;
 581      TT.remember = line;
 582      line = str;
 583      TT.rememberlen = len;
 584      len = swap;
 585    } else if (c=='y') {
 586      char *from, *to = (char *)command;
 587      int i, j;
 588
 589      from = to+command->arg1;
 590      to += command->arg2;
 591
 592      for (i = 0; i < len; i++) {
 593        j = stridx(from, line[i]);
 594        if (j != -1) line[i] = to[j];
 595      }
 596    } else if (c=='=') {
 597      sprintf(toybuf, "%ld", TT.count);
 598      emit(toybuf, strlen(toybuf), 1);
 599    }
 600
 601    command = command->next;
 602  }
 603
 604  if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
 605
 606done:
 607  if (dlist_terminate(append)) while (append) {
 608    struct append *a = append->next;
 609
 610    if (append->file) {
 611      int fd = open(append->str, O_RDONLY);
 612
 613      // Force newline if noeol pending
 614      if (fd != -1) {
 615        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 616        TT.noeol = 0;
 617        xsendfile(fd, TT.fdout);
 618        close(fd);
 619      }
 620    } else if (append->str) emit(append->str, strlen(append->str), 1);
 621    else emit(line, 0, 0);
 622    free(append);
 623    append = a;
 624  }
 625  free(line);
 626}
 627
 628// Callback called on each input file
 629static void do_sed_file(int fd, char *name)
 630{
 631  int i = toys.optflags & FLAG_i;
 632  char *tmp;
 633
 634  if (i) {
 635    struct sedcmd *command;
 636
 637    if (!fd) return error_msg("-i on stdin");
 638    TT.fdout = copy_tempfile(fd, name, &tmp);
 639    TT.count = 0;
 640    for (command = (void *)TT.pattern; command; command = command->next)
 641      command->hit = 0;
 642  }
 643  do_lines(fd, sed_line);
 644  if (i) {
 645    replace_tempfile(-1, TT.fdout, &tmp);
 646    TT.fdout = 1;
 647    TT.nextline = 0;
 648    TT.nextlen = TT.noeol = 0;
 649  }
 650}
 651
 652// Copy chunk of string between two delimiters, converting printf escapes.
 653// returns processed copy of string (0 if error), *pstr advances to next
 654// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 655// if regxex, ignore delimiter in [ranges]
 656static char *unescape_delimited_string(char **pstr, char *delim)
 657{
 658  char *to, *from, mode = 0, d;
 659
 660  // Grab leading delimiter (if necessary), allocate space for new string
 661  from = *pstr;
 662  if (!delim || !*delim) {
 663    if (!(d = *(from++))) return 0;
 664    if (d == '\\') d = *(from++);
 665    if (!d || d == '\\') return 0;
 666    if (delim) *delim = d;
 667  } else d = *delim;
 668  to = delim = xmalloc(strlen(*pstr)+1);
 669
 670  while (mode || *from != d) {
 671    if (!*from) return 0;
 672
 673    // delimiter in regex character range doesn't count
 674    if (*from == '[') {
 675      if (!mode) {
 676        mode = ']';
 677        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 678      } else if (mode == ']' && strchr(".=:", from[1])) {
 679        *(to++) = *(from++);
 680        mode = *from;
 681      }
 682    } else if (*from == mode) {
 683      if (mode == ']') mode = 0;
 684      else {
 685        *(to++) = *(from++);
 686        mode = ']';
 687      }
 688    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 689    // but the perl build does it, so we need to filter it out.
 690    } else if (mode && *from == '-' && from[-1] == from[1]) {
 691      from+=2;
 692      continue;
 693    } else if (*from == '\\') {
 694      if (!from[1]) return 0;
 695
 696      // Check escaped end delimiter before printf style escapes.
 697      if (from[1] == d) from++;
 698      else if (from[1]=='\\') *(to++) = *(from++);
 699      else {
 700        char c = unescape(from[1]);
 701
 702        if (c) {
 703          *(to++) = c;
 704          from+=2;
 705          continue;
 706        } else if (!mode) *(to++) = *(from++);
 707      }
 708    }
 709    *(to++) = *(from++);
 710  }
 711  *to = 0;
 712  *pstr = from+1;
 713
 714  return delim;
 715}
 716
 717// Translate pattern strings into command structures. Each command structure
 718// is a single allocation (which requires some math and remalloc at times).
 719static void parse_pattern(char **pline, long len)
 720{
 721  struct sedcmd *command = (void *)TT.pattern;
 722  char *line, *reg, c, *errstart;
 723  int i;
 724
 725  line = errstart = pline ? *pline : "";
 726  if (len && line[len-1]=='\n') line[--len] = 0;
 727
 728  // Append this line to previous multiline command? (hit indicates type.)
 729  // During parsing "hit" stores data about line continuations, but in
 730  // sed_line() it means the match range attached to this command
 731  // is active, so processing the continuation must zero it again.
 732  if (command && command->prev->hit) {
 733    // Remove half-finished entry from list so remalloc() doesn't confuse it
 734    TT.pattern = TT.pattern->prev;
 735    command = dlist_pop(&TT.pattern);
 736    c = command->c;
 737    reg = (char *)command;
 738    reg += command->arg1 + strlen(reg + command->arg1);
 739
 740    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 741    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 742    // a unicode character.
 743    if (command->hit < 256) goto resume_s;
 744    else goto resume_a;
 745  }
 746
 747  // Loop through commands in this line.
 748
 749  command = 0;
 750  for (;;) {
 751    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 752
 753    // If there's no more data on this line, return.
 754    for (;;) {
 755      while (isspace(*line) || *line == ';') line++;
 756      if (*line == '#') while (*line && *line != '\n') line++;
 757      else break;
 758    }
 759    if (!*line) return;
 760
 761    // We start by writing data into toybuf. Later we'll allocate the
 762    // ex
 763
 764    errstart = line;
 765    memset(toybuf, 0, sizeof(struct sedcmd));
 766    command = (void *)toybuf;
 767    reg = toybuf + sizeof(struct sedcmd);
 768
 769    // Parse address range (if any)
 770    for (i = 0; i < 2; i++) {
 771      if (*line == ',') line++;
 772      else if (i) break;
 773
 774      if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 775      else if (*line == '$') {
 776        command->lmatch[i] = -1;
 777        line++;
 778      } else if (*line == '/' || *line == '\\') {
 779        char *s = line;
 780
 781        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 782        if (!*s) command->rmatch[i] = 0;
 783        else {
 784          xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
 785          command->rmatch[i] = reg-toybuf;
 786          reg += sizeof(regex_t);
 787        }
 788        free(s);
 789      } else break;
 790    }
 791
 792    while (isspace(*line)) line++;
 793    if (!*line) break;
 794
 795    while (*line == '!') {
 796      command->not = 1;
 797      line++;
 798    }
 799    while (isspace(*line)) line++;
 800
 801    c = command->c = *(line++);
 802    if (strchr("}:", c) && i) break;
 803    if (strchr("aiqr=", c) && i>1) break;
 804
 805    // Add step to pattern
 806    command = xmemdup(toybuf, reg-toybuf);
 807    reg = (reg-toybuf) + (char *)command;
 808
 809    // Parse arguments by command type
 810    if (c == '{') TT.nextlen++;
 811    else if (c == '}') {
 812      if (!TT.nextlen--) break;
 813    } else if (c == 's') {
 814      char *end, delim = 0;
 815
 816      // s/pattern/replacement/flags
 817
 818      // line continuations use arg1 (back at the start of the function),
 819      // so let's fill out arg2 first (since the regex part can't be multiple
 820      // lines) and swap them back later.
 821
 822      // get pattern (just record, we parse it later)
 823      command->arg2 = reg - (char *)command;
 824      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 825        goto error;
 826
 827      reg += sizeof(regex_t);
 828      command->arg1 = reg-(char *)command;
 829      command->hit = delim;
 830resume_s:
 831      // get replacement - don't replace escapes yet because \1 and \& need
 832      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 833      end = line;
 834      while (*end != command->hit) {
 835        if (!*end) goto error;
 836        if (*end++ == '\\') {
 837          if (!*end || *end == '\n') {
 838            end[-1] = '\n';
 839            break;
 840          }
 841          end++;
 842        }
 843      }
 844
 845      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 846      line = end;
 847      // line continuation? (note: '\n' can't be a valid delim).
 848      if (*line == command->hit) command->hit = 0;
 849      else {
 850        if (!*line) continue;
 851        reg--;
 852        line++;
 853        goto resume_s;
 854      }
 855
 856      // swap arg1/arg2 so they're back in order arguments occur.
 857      i = command->arg1;
 858      command->arg1 = command->arg2;
 859      command->arg2 = i;
 860
 861      // get flags
 862      for (line++; *line; line++) {
 863        long l;
 864
 865        if (isspace(*line) && *line != '\n') continue;
 866
 867        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 868        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 869          command->sflags |= l << 3;
 870          line--;
 871        } else break;
 872      }
 873
 874      // We deferred actually parsing the regex until we had the s///i flag
 875      // allocating the space was done by extend_string() above
 876      if (!*TT.remember) command->arg1 = 0;
 877      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 878        ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
 879      free(TT.remember);
 880      TT.remember = 0;
 881      if (*line == 'w') {
 882        line++;
 883        goto writenow;
 884      }
 885    } else if (c == 'w') {
 886      int fd, delim;
 887      char *cc;
 888
 889      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 890      // eol status, and to retain the filename for error messages, we'd need
 891      // to go up to arg5 just for this. Compromise: dynamically allocate the
 892      // filehandle and eol status.
 893
 894writenow:
 895      while (isspace(*line)) line++;
 896      if (!*line) goto error;
 897      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 898      delim = *cc;
 899      *cc = 0;
 900      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 901      *cc = delim;
 902
 903      command->w = reg - (char *)command;
 904      command = xrealloc(command, command->w+(cc-line)+6);
 905      reg = command->w + (char *)command;
 906
 907      memcpy(reg, &fd, 4);
 908      reg += 4;
 909      *(reg++) = 0;
 910      memcpy(reg, line, delim);
 911      reg += delim;
 912      *(reg++) = 0;
 913
 914      line = cc;
 915      if (delim) line += 2;
 916    } else if (c == 'y') {
 917      char *s, delim = 0;
 918      int len;
 919
 920      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 921      command->arg1 = reg-(char *)command;
 922      len = strlen(s);
 923      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 924      free(s);
 925      command->arg2 = reg-(char *)command;
 926      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 927      if (len != strlen(s)) goto error;
 928      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 929      free(s);
 930    } else if (strchr("abcirtTw:", c)) {
 931      int end;
 932
 933      // trim leading spaces
 934      while (isspace(*line) && *line != '\n') line++;
 935
 936      // Resume logic differs from 's' case because we don't add a newline
 937      // unless it's after something, so we add it on return instead.
 938resume_a:
 939      command->hit = 0;
 940
 941      // btT: end with space or semicolon, aicrw continue to newline.
 942      if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
 943        // Argument's optional for btT
 944        if (strchr("btT", c)) continue;
 945        else if (!command->arg1) break;
 946      }
 947
 948      // Extend allocation to include new string. We use offsets instead of
 949      // pointers so realloc() moving stuff doesn't break things. Ok to write
 950      // \n over NUL terminator because call to extend_string() adds it back.
 951      if (!command->arg1) command->arg1 = reg - (char*)command;
 952      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 953      else if (!pline) {
 954        command->arg1 = 0;
 955        continue;
 956      }
 957      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 958
 959      // Recopy data to remove escape sequences and handle line continuation.
 960      if (strchr("aci", c)) {
 961        reg -= end+1;
 962        for (i = end; i; i--) {
 963          if ((*reg++ = *line++)=='\\') {
 964
 965            // escape at end of line: resume if -e escaped literal newline,
 966            // else request callback and resume with next line
 967            if (!--i) {
 968              *--reg = 0;
 969              if (*line) {
 970                line++;
 971                goto resume_a;
 972              }
 973              command->hit = 256;
 974              break;
 975            }
 976            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 977            line++;
 978          }
 979        }
 980        *reg = 0;
 981      } else line += end;
 982
 983    // Commands that take no arguments
 984    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
 985  }
 986
 987error:
 988  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 989}
 990
 991void sed_main(void)
 992{
 993  struct arg_list *al;
 994  char **args = toys.optargs;
 995
 996  // Lie to autoconf when it asks stupid questions, so configure regexes
 997  // that look for "GNU sed version %f" greater than some old buggy number
 998  // don't fail us for not matching their narrow expectations.
 999  if (toys.optflags & FLAG_version) {
1000    xprintf("This is not GNU sed version 9.0\n");
1001    return;
1002  }
1003
1004  // Handling our own --version means we handle our own --help too.
1005  if (toys.optflags&FLAG_help) help_exit(0);
1006
1007  // Parse pattern into commands.
1008
1009  // If no -e or -f, first argument is the pattern.
1010  if (!TT.e && !TT.f) {
1011    if (!*toys.optargs) error_exit("no pattern");
1012    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1013  }
1014
1015  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1016  // so handle all -e, then all -f. (At least the behavior's consistent.)
1017
1018  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1019  parse_pattern(0, 0);
1020  for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1021  dlist_terminate(TT.pattern);
1022  if (TT.nextlen) error_exit("no }");  
1023
1024  TT.fdout = 1;
1025  TT.remember = xstrdup("");
1026
1027  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1028  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1029
1030  // Provide EOF flush at end of cumulative input for non-i mode.
1031  if (!(toys.optflags & FLAG_i)) {
1032    toys.optflags |= FLAG_i;
1033    sed_line(0, 0);
1034  }
1035
1036  // todo: need to close fd when done for TOYBOX_FREE?
1037}
1038