toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13
  14USE_SED(NEWTOY(sed, "(help)(version)e*f*inEr[+Er]", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  15
  16config SED
  17  bool "sed"
  18  default y
  19  help
  20    usage: sed [-inrE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  21
  22    Stream editor. Apply one or more editing SCRIPTs to each line of input
  23    (from FILE or stdin) producing output (by default to stdout).
  24
  25    -e  add SCRIPT to list
  26    -f  add contents of SCRIPT_FILE to list
  27    -i  Edit each file in place
  28    -n  No default output (use the p command to output matched lines)
  29    -r  Use extended regular expression syntax
  30    -E  Alias for -r
  31    -s  Treat input files separately (implied by -i)
  32
  33    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  34    semicolons. All -e SCRIPTs are concatenated together as if separated
  35    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  36    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  37
  38    Each COMMAND may be preceded by an address which limits the command to
  39    apply only to the specified line(s). Commands without an address apply to
  40    every line. Addresses are of the form:
  41
  42      [ADDRESS[,ADDRESS]]COMMAND
  43
  44    The ADDRESS may be a decimal line number (starting at 1), a /regular
  45    expression/ within a pair of forward slashes, or the character "$" which
  46    matches the last line of input. (In -s or -i mode this matches the last
  47    line of each file, otherwise just the last line of the last file.) A single
  48    address matches one line, a pair of comma separated addresses match
  49    everything from the first address to the second address (inclusive). If
  50    both addresses are regular expressions, more than one range of lines in
  51    each file can match.
  52
  53    REGULAR EXPRESSIONS in sed are started and ended by the same character
  54    (traditionally / but anything except a backslash or a newline works).
  55    Backslashes may be used to escape the delimiter if it occurs in the
  56    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  57    and unicode). An empty regex repeats the previous one. ADDRESS regexes
   58    (above) require the first delimiter to be escaped with a backslash when
  59    it isn't a forward slash (to distinguish it from the COMMANDs below).
  60
  61    Sed mostly operates on individual lines one at a time. It reads each line,
  62    processes it, and either writes it to the output or discards it before
  63    reading the next line. Sed can remember one additional line in a separate
  64    buffer (using the h, H, g, G, and x commands), and can read the next line
  65    of input early (using the n and N command), but other than that command
  66    scripts operate on individual lines of text.
  67
  68    Each COMMAND starts with a single character. The following commands take
  69    no arguments:
  70
  71      {  Start a new command block, continuing until a corresponding "}".
  72         Command blocks may nest. If the block has an address, commands within
  73         the block are only run for lines within the block's address range.
  74
  75      }  End command block (this command cannot have an address)
  76
  77      d  Delete this line and move on to the next one
  78         (ignores remaining COMMANDs)
  79
  80      D  Delete one line of input and restart command SCRIPT (same as "d"
  81         unless you've glued lines together with "N" or similar)
  82
  83      g  Get remembered line (overwriting current line)
  84
  85      G  Get remembered line (appending to current line)
  86
  87      h  Remember this line (overwriting remembered line)
  88
  89      H  Remember this line (appending to remembered line, if any)
  90
  91      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  92         nonprintable characters, wrapping lines to terminal width with a
  93         backslash, and appending $ to actual end of line.
  94
  95      n  Print default output and read next line, replacing current line
  96         (If no next line available, quit processing script)
  97
  98      N  Append next line of input to this line, separated by a newline
  99         (This advances the line counter for address matching and "=", if no
 100         next line available quit processing script without default output)
 101
 102      p  Print this line
 103
 104      P  Print this line up to first newline (from "N")
 105
 106      q  Quit (print default output, no more commands processed or lines read)
 107
 108      x  Exchange this line with remembered line (overwrite in both directions)
 109
 110      =  Print the current line number (followed by a newline)
 111
 112    The following commands (may) take an argument. The "text" arguments (to
 113    the "a", "b", and "c" commands) may end with an unescaped "\" to append
 114    the next line (for which leading whitespace is not skipped), and also
 115    treat ";" as a literal character (use "\;" instead).
 116
 117      a [text]   Append text to output before attempting to read next line
 118
 119      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 120
 121      c [text]   Delete line, output text at end of matching address range
 122                 (ignores remaining COMMANDs)
 123
 124      i [text]   Print text
 125
 126      r [file]   Append contents of file to output before attempting to read
 127                 next line.
 128
 129      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 130                 The first character after the "s" (anything but newline or
 131                 backslash) is the delimiter, escape with \ to use normally.
 132
 133                 The replacement text may contain "&" to substitute the matched
 134                 text (escape it with backslash for a literal &), or \1 through
 135                 \9 to substitute a parenthetical subexpression in the regex.
 136                 You can also use the normal backslash escapes such as \n and
 137                 a backslash at the end of the line appends the next line.
 138
 139                 The flags are:
 140
 141                 [0-9]    A number, substitute only that occurrence of pattern
 142                 g        Global, substitute all occurrences of pattern
 143                 i        Ignore case when matching
 144                 p        Print the line if match was found and replaced
 145                 w [file] Write (append) line to file if match replaced
 146
 147      t [label]  Test, jump to :label only if an "s" command found a match in
 148                 this line since last test (replacing with same text counts)
 149
 150      T [label]  Test false, jump only if "s" hasn't found a match.
 151
 152      w [file]   Write (append) line to file
 153
 154      y/old/new/ Change each character in 'old' to corresponding character
 155                 in 'new' (with standard backslash escapes, delimiter can be
 156                 any repeated character except \ or \n)
 157
 158      : [label]  Labeled target for jump commands
 159
 160      #  Comment, ignore rest of this line of SCRIPT
 161
 162    Deviations from posix: allow extended regular expressions with -r,
 163    editing in place with -i, separate with -s, printf escapes in text, line
 164    continuations, semicolons after all commands, 2-address anywhere an
 165    address is allowed, "T" command, multiline continuations for [abc],
 166    \; to end [abc] argument before end of line.
 167*/
 168
 169#define FOR_sed
 170#include "toys.h"
 171
 172GLOBALS(
 173  struct arg_list *f, *e;
 174
 175  // processed pattern list
 176  struct double_list *pattern;
 177
 178  char *nextline, *remember;
 179  void *restart, *lastregex;
 180  long nextlen, rememberlen, count;
 181  int fdout, noeol;
 182  unsigned xx;
 183)
 184
 185// Linked list of parsed sed commands. Offset fields indicate location where
 186// regex or string starts, ala offset+(char *)struct, because we remalloc()
 187// these to expand them for multiline inputs, and pointers would have to be
 188// individually adjusted.
 189
 190struct sedcmd {
 191  struct sedcmd *next, *prev;
 192
 193  // Begin and end of each match
 194  long lmatch[2]; // line number of match
 195  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 196  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 197  unsigned not, hit;
 198  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 199  char c; // action
 200};
 201
 202// Write out line with potential embedded NUL, handling eol/noeol
 203static int emit(char *line, long len, int eol)
 204{
 205  int l, old = line[len];
 206
 207  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 208  TT.noeol = !eol;
 209  if (eol) line[len++] = '\n';
 210  if (!len) return 0;
 211  l = writeall(TT.fdout, line, len);
 212  if (eol) line[len-1] = old;
 213  if (l != len) {
 214    perror_msg("short write");
 215
 216    return 1;
 217  }
 218
 219  return 0;
 220}
 221
 222// Extend allocation to include new string, with newline between if newlen<0
 223
 224static char *extend_string(char **old, char *new, int oldlen, int newlen)
 225{
 226  int newline = newlen < 0;
 227  char *s;
 228
 229  if (newline) newlen = -newlen;
 230  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 231  if (newline) s[oldlen++] = '\n';
 232  memcpy(s+oldlen, new, newlen);
 233  s[oldlen+newlen] = 0;
 234
 235  return s+oldlen+newlen+1;
 236}
 237
 238// An empty regex repeats the previous one
 239static void *get_regex(void *trump, int offset)
 240{
 241  if (!offset) {
 242    if (!TT.lastregex) error_exit("no previous regex");
 243    return TT.lastregex;
 244  }
 245
 246  return TT.lastregex = offset+(char *)trump;
 247}
 248
 249// Apply pattern to line from input file
 250static void sed_line(char **pline, long plen)
 251{
 252  struct append {
 253    struct append *next, *prev;
 254    int file;
 255    char *str;
 256  } *append = 0;
 257  char *line = TT.nextline;
 258  long len = TT.nextlen;
 259  struct sedcmd *command;
 260  int eol = 0, tea = 0;
 261
 262  // Ignore EOF for all files before last unless -i
 263  if (!pline && !(toys.optflags&FLAG_i)) return;
 264
 265  // Grab next line for deferred processing (EOF detection: we get a NULL
 266  // pline at EOF to flush last line). Note that only end of _last_ input
 267  // file matches $ (unless we're doing -i).
 268  TT.nextline = 0;
 269  TT.nextlen = 0;
 270  if (pline) {
 271    TT.nextline = *pline;
 272    TT.nextlen = plen;
 273    *pline = 0;
 274  }
 275
 276  if (!line || !len) return;
 277  if (line[len-1] == '\n') line[--len] = eol++;
 278  TT.count++;
 279
 280  // The restart-1 is because we added one to make sure it wasn't NULL,
 281  // otherwise N as last command would restart script
 282  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 283  TT.restart = 0;
 284
 285  while (command) {
 286    char *str, c = command->c;
 287
 288    // Have we got a line or regex matching range for this rule?
 289    if (*command->lmatch || *command->rmatch) {
 290      int miss = 0;
 291      long lm;
 292
 293      // In a match that might end?
 294      if (command->hit) {
 295        if (!(lm = command->lmatch[1])) {
 296          if (!command->rmatch[1]) command->hit = 0;
 297          else {
 298            void *rm = get_regex(command, command->rmatch[1]);
 299
 300            // regex match end includes matching line, so defer deactivation
 301            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 302          }
 303        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 304
 305      // Start a new match?
 306      } else {
 307        if (!(lm = *command->lmatch)) {
 308          void *rm = get_regex(command, *command->rmatch);
 309
 310          if (line && !regexec0(rm, line, len, 0, 0, 0)) command->hit++;
 311        } else if (lm == TT.count || (lm == -1 && !pline)) command->hit++;
 312
 313        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 314      } 
 315
 316      // Didn't match?
 317      lm = !(command->hit ^ command->not);
 318
 319      // Deferred disable from regex end match
 320      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 321
 322      if (lm) {
 323        // Handle skipping curly bracket command group
 324        if (c == '{') {
 325          int curly = 1;
 326
 327          while (curly) {
 328            command = command->next;
 329            if (command->c == '{') curly++;
 330            if (command->c == '}') curly--;
 331          }
 332        }
 333        command = command->next;
 334        continue;
 335      }
 336    }
 337
 338    // A deleted line can still update line match state for later commands
 339    if (!line) {
 340      command = command->next;
 341      continue;
 342    }
 343
 344    // Process command
 345
 346    if (c=='a' || c=='r') {
 347      struct append *a = xzalloc(sizeof(struct append));
 348      if (command->arg1) a->str = command->arg1+(char *)command;
 349      a->file = c=='r';
 350      dlist_add_nomalloc((void *)&append, (void *)a);
 351    } else if (c=='b' || c=='t' || c=='T') {
 352      int t = tea;
 353
 354      if (c != 'b') tea = 0;
 355      if (c=='b' || t^(c=='T')) {
 356        if (!command->arg1) break;
 357        str = command->arg1+(char *)command;
 358        for (command = (void *)TT.pattern; command; command = command->next)
 359          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 360            break;
 361        if (!command) error_exit("no :%s", str);
 362      }
 363    } else if (c=='c') {
 364      str = command->arg1+(char *)command;
 365      if (!command->hit) emit(str, strlen(str), 1);
 366      free(line);
 367      line = 0;
 368      continue;
 369    } else if (c=='d') {
 370      free(line);
 371      line = 0;
 372      continue;
 373    } else if (c=='D') {
 374      // Delete up to \n or end of buffer
 375      str = line;
 376      while ((str-line)<len) if (*(str++) == '\n') break;
 377      len -= str - line;
 378      memmove(line, str, len);
 379
 380      // if "delete" blanks line, disable further processing
 381      // otherwise trim and restart script
 382      if (!len) {
 383        free(line);
 384        line = 0;
 385      } else {
 386        line[len] = 0;
 387        command = (void *)TT.pattern;
 388      }
 389      continue;
 390    } else if (c=='g') {
 391      free(line);
 392      line = xstrdup(TT.remember);
 393      len = TT.rememberlen;
 394    } else if (c=='G') {
 395      line = xrealloc(line, len+TT.rememberlen+2);
 396      line[len++] = '\n';
 397      memcpy(line+len, TT.remember, TT.rememberlen);
 398      line[len += TT.rememberlen] = 0;
 399    } else if (c=='h') {
 400      free(TT.remember);
 401      TT.remember = xstrdup(line);
 402      TT.rememberlen = len;
 403    } else if (c=='H') {
 404      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 405      TT.remember[TT.rememberlen++] = '\n';
 406      memcpy(TT.remember+TT.rememberlen, line, len);
 407      TT.remember[TT.rememberlen += len] = 0;
 408    } else if (c=='i') {
 409      str = command->arg1+(char *)command;
 410      emit(str, strlen(str), 1);
 411    } else if (c=='l') {
 412      int i, x, off;
 413
 414      if (!TT.xx) {
 415        terminal_size(&TT.xx, 0);
 416        if (!TT.xx) TT.xx = 80;
 417        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 418        if (TT.xx > 4) TT.xx -= 4;
 419      }
 420
 421      for (i = off = 0; i<len; i++) {
 422        if (off >= TT.xx) {
 423          toybuf[off++] = '\\';
 424          emit(toybuf, off, 1);
 425          off = 0;
 426        }
 427        x = stridx("\\\a\b\f\r\t\v", line[i]);
 428        if (x != -1) {
 429          toybuf[off++] = '\\';
 430          toybuf[off++] = "\\abfrtv"[x];
 431        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 432        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 433      }
 434      toybuf[off++] = '$';
 435      emit(toybuf, off, 1);
 436    } else if (c=='n') {
 437      TT.restart = command->next+1;
 438
 439      break;
 440    } else if (c=='N') {
 441      // Can't just grab next line because we could have multiple N and
 442      // we need to actually read ahead to get N;$p EOF detection right.
 443      if (pline) {
 444        TT.restart = command->next+1;
 445        extend_string(&line, TT.nextline, len, -TT.nextlen);
 446        free(TT.nextline);
 447        TT.nextline = line;
 448        TT.nextlen += len + 1;
 449        line = 0;
 450      }
 451
 452      // Pending append goes out right after N
 453      goto done; 
 454    } else if (c=='p' || c=='P') {
 455      char *l = (c=='P') ? strchr(line, '\n') : 0;
 456
 457      if (emit(line, l ? l-line : len, eol)) break;
 458    } else if (c=='q') {
 459      if (pline) *pline = (void *)1;
 460      free(TT.nextline);
 461      TT.nextline = 0;
 462      TT.nextlen = 0;
 463
 464      break;
 465    } else if (c=='s') {
 466      char *rline = line, *new = command->arg2 + (char *)command, *swap, *rswap;
 467      regmatch_t *match = (void *)toybuf;
 468      regex_t *reg = get_regex(command, command->arg1);
 469      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
 470
 471      // Find match in remaining line (up to remaining len)
 472      while (!regexec0(reg, rline, rlen, 10, match, mflags)) {
 473        mflags = REG_NOTBOL;
 474
 475        // Zero length matches don't count immediately after a previous match
 476        mlen = match[0].rm_eo-match[0].rm_so;
 477        if (!mlen && !zmatch) {
 478          if (!rlen--) break;
 479          rline++;
 480          zmatch++;
 481          continue;
 482        } else zmatch = 0;
 483
 484        // If we're replacing only a specific match, skip if this isn't it
 485        off = command->sflags>>3;
 486        if (off && off != ++count) {
 487          rline += match[0].rm_eo;
 488          rlen -= match[0].rm_eo;
 489
 490          continue;
 491        }
 492        // The fact getline() can allocate unbounded amounts of memory is
 493        // a bigger issue, but while we're here check for integer overflow
 494        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 495
 496        // newlen = strlen(new) but with \1 and & and printf escapes
 497        for (off = newlen = 0; new[off]; off++) {
 498          int cc = -1;
 499
 500          if (new[off] == '&') cc = 0;
 501          else if (new[off] == '\\') cc = new[++off] - '0';
 502          if (cc < 0 || cc > 9) {
 503            newlen++;
 504            continue;
 505          }
 506          newlen += match[cc].rm_eo-match[cc].rm_so;
 507        }
 508
 509        // Allocate new size, copy start/end around match. (Can't extend in
 510        // place because backrefs may refer to text after it's overwritten.)
 511        len += newlen-mlen;
 512        swap = xmalloc(len+1);
 513        rswap = swap+(rline-line)+match[0].rm_so;
 514        memcpy(swap, line, (rline-line)+match[0].rm_so);
 515        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
 516
 517        // copy in new replacement text
 518        for (off = mlen = 0; new[off]; off++) {
 519          int cc = 0, ll;
 520
 521          if (new[off] == '\\') {
 522            cc = new[++off] - '0';
 523            if (cc<0 || cc>9) {
 524              if (!(rswap[mlen++] = unescape(new[off])))
 525                rswap[mlen-1] = new[off];
 526
 527              continue;
 528            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
 529          } else if (new[off] != '&') {
 530            rswap[mlen++] = new[off];
 531
 532            continue;
 533          }
 534
 535          ll = match[cc].rm_eo-match[cc].rm_so;
 536          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
 537          mlen += ll;
 538        }
 539
 540        rline = rswap+newlen;
 541        free(line);
 542        line = swap;
 543
 544        // Stop after first substitution unless we have flag g
 545        if (!(command->sflags & 2)) break;
 546      }
 547
 548      if (mflags) {
 549        // flag p
 550        if (command->sflags & 4) emit(line, len, eol);
 551
 552        tea = 1;
 553        if (command->w) goto writenow;
 554      }
 555    } else if (c=='w') {
 556      int fd, noeol;
 557      char *name;
 558
 559writenow:
 560      // Swap out emit() context
 561      fd = TT.fdout;
 562      noeol = TT.noeol;
 563
 564      // We save filehandle and newline status before filename
 565      name = command->w + (char *)command;
 566      memcpy(&TT.fdout, name, 4);
 567      name += 4;
 568      TT.noeol = *(name++);
 569
 570      // write, then save/restore context
 571      if (emit(line, len, eol))
 572        perror_exit("w '%s'", command->arg1+(char *)command);
 573      *(--name) = TT.noeol;
 574      TT.noeol = noeol;
 575      TT.fdout = fd;
 576    } else if (c=='x') {
 577      long swap = TT.rememberlen;
 578
 579      str = TT.remember;
 580      TT.remember = line;
 581      line = str;
 582      TT.rememberlen = len;
 583      len = swap;
 584    } else if (c=='y') {
 585      char *from, *to = (char *)command;
 586      int i, j;
 587
 588      from = to+command->arg1;
 589      to += command->arg2;
 590
 591      for (i = 0; i < len; i++) {
 592        j = stridx(from, line[i]);
 593        if (j != -1) line[i] = to[j];
 594      }
 595    } else if (c=='=') {
 596      sprintf(toybuf, "%ld", TT.count);
 597      emit(toybuf, strlen(toybuf), 1);
 598    }
 599
 600    command = command->next;
 601  }
 602
 603  if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
 604
 605done:
 606  if (dlist_terminate(append)) while (append) {
 607    struct append *a = append->next;
 608
 609    if (append->file) {
 610      int fd = open(append->str, O_RDONLY);
 611
 612      // Force newline if noeol pending
 613      if (fd != -1) {
 614        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 615        TT.noeol = 0;
 616        xsendfile(fd, TT.fdout);
 617        close(fd);
 618      }
 619    } else if (append->str) emit(append->str, strlen(append->str), 1);
 620    else emit(line, 0, 0);
 621    free(append);
 622    append = a;
 623  }
 624  free(line);
 625}
 626
 627// Callback called on each input file
 628static void do_sed_file(int fd, char *name)
 629{
 630  int i = toys.optflags & FLAG_i;
 631  char *tmp;
 632
 633  if (i) {
 634    struct sedcmd *command;
 635
 636    if (!fd) return error_msg("-i on stdin");
 637    TT.fdout = copy_tempfile(fd, name, &tmp);
 638    TT.count = 0;
 639    for (command = (void *)TT.pattern; command; command = command->next)
 640      command->hit = 0;
 641  }
 642  do_lines(fd, sed_line);
 643  if (i) {
 644    replace_tempfile(-1, TT.fdout, &tmp);
 645    TT.fdout = 1;
 646    TT.nextline = 0;
 647    TT.nextlen = TT.noeol = 0;
 648  }
 649}
 650
 651// Copy chunk of string between two delimiters, converting printf escapes.
 652// returns processed copy of string (0 if error), *pstr advances to next
 653// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 654// if regxex, ignore delimiter in [ranges]
 655static char *unescape_delimited_string(char **pstr, char *delim)
 656{
 657  char *to, *from, mode = 0, d;
 658
 659  // Grab leading delimiter (if necessary), allocate space for new string
 660  from = *pstr;
 661  if (!delim || !*delim) {
 662    if (!(d = *(from++))) return 0;
 663    if (d == '\\') d = *(from++);
 664    if (!d || d == '\\') return 0;
 665    if (delim) *delim = d;
 666  } else d = *delim;
 667  to = delim = xmalloc(strlen(*pstr)+1);
 668
 669  while (mode || *from != d) {
 670    if (!*from) return 0;
 671
 672    // delimiter in regex character range doesn't count
 673    if (*from == '[') {
 674      if (!mode) {
 675        mode = ']';
 676        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 677      } else if (mode == ']' && strchr(".=:", from[1])) {
 678        *(to++) = *(from++);
 679        mode = *from;
 680      }
 681    } else if (*from == mode) {
 682      if (mode == ']') mode = 0;
 683      else {
 684        *(to++) = *(from++);
 685        mode = ']';
 686      }
 687    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 688    // but the perl build does it, so we need to filter it out.
 689    } else if (mode && *from == '-' && from[-1] == from[1]) {
 690      from+=2;
 691      continue;
 692    } else if (*from == '\\') {
 693      if (!from[1]) return 0;
 694
 695      // Check escaped end delimiter before printf style escapes.
 696      if (from[1] == d) from++;
 697      else if (from[1]=='\\') *(to++) = *(from++);
 698      else {
 699        char c = unescape(from[1]);
 700
 701        if (c) {
 702          *(to++) = c;
 703          from+=2;
 704          continue;
 705        } else if (!mode) *(to++) = *(from++);
 706      }
 707    }
 708    *(to++) = *(from++);
 709  }
 710  *to = 0;
 711  *pstr = from+1;
 712
 713  return delim;
 714}
 715
 716// Translate pattern strings into command structures. Each command structure
 717// is a single allocation (which requires some math and remalloc at times).
 718static void parse_pattern(char **pline, long len)
 719{
 720  struct sedcmd *command = (void *)TT.pattern;
 721  char *line, *reg, c, *errstart;
 722  int i;
 723
 724  line = errstart = pline ? *pline : "";
 725  if (len && line[len-1]=='\n') line[--len] = 0;
 726
 727  // Append this line to previous multiline command? (hit indicates type.)
 728  // During parsing "hit" stores data about line continuations, but in
 729  // sed_line() it means the match range attached to this command
 730  // is active, so processing the continuation must zero it again.
 731  if (command && command->prev->hit) {
 732    // Remove half-finished entry from list so remalloc() doesn't confuse it
 733    TT.pattern = TT.pattern->prev;
 734    command = dlist_pop(&TT.pattern);
 735    c = command->c;
 736    reg = (char *)command;
 737    reg += command->arg1 + strlen(reg + command->arg1);
 738
 739    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 740    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 741    // a unicode character.
 742    if (command->hit < 256) goto resume_s;
 743    else goto resume_a;
 744  }
 745
 746  // Loop through commands in this line.
 747
 748  command = 0;
 749  for (;;) {
 750    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 751
 752    // If there's no more data on this line, return.
 753    for (;;) {
 754      while (isspace(*line) || *line == ';') line++;
 755      if (*line == '#') while (*line && *line != '\n') line++;
 756      else break;
 757    }
 758    if (!*line) return;
 759
 760    // We start by writing data into toybuf. Later we'll allocate the
 761    // ex
 762
 763    errstart = line;
 764    memset(toybuf, 0, sizeof(struct sedcmd));
 765    command = (void *)toybuf;
 766    reg = toybuf + sizeof(struct sedcmd);
 767
 768    // Parse address range (if any)
 769    for (i = 0; i < 2; i++) {
 770      if (*line == ',') line++;
 771      else if (i) break;
 772
 773      if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 774      else if (*line == '$') {
 775        command->lmatch[i] = -1;
 776        line++;
 777      } else if (*line == '/' || *line == '\\') {
 778        char *s = line;
 779
 780        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 781        if (!*s) command->rmatch[i] = 0;
 782        else {
 783          xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
 784          command->rmatch[i] = reg-toybuf;
 785          reg += sizeof(regex_t);
 786        }
 787        free(s);
 788      } else break;
 789    }
 790
 791    while (isspace(*line)) line++;
 792    if (!*line) break;
 793
 794    while (*line == '!') {
 795      command->not = 1;
 796      line++;
 797    }
 798    while (isspace(*line)) line++;
 799
 800    c = command->c = *(line++);
 801    if (strchr("}:", c) && i) break;
 802    if (strchr("aiqr=", c) && i>1) break;
 803
 804    // Add step to pattern
 805    command = xmemdup(toybuf, reg-toybuf);
 806    reg = (reg-toybuf) + (char *)command;
 807
 808    // Parse arguments by command type
 809    if (c == '{') TT.nextlen++;
 810    else if (c == '}') {
 811      if (!TT.nextlen--) break;
 812    } else if (c == 's') {
 813      char *end, delim = 0;
 814
 815      // s/pattern/replacement/flags
 816
 817      // line continuations use arg1 (back at the start of the function),
 818      // so let's fill out arg2 first (since the regex part can't be multiple
 819      // lines) and swap them back later.
 820
 821      // get pattern (just record, we parse it later)
 822      command->arg2 = reg - (char *)command;
 823      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 824        goto error;
 825
 826      reg += sizeof(regex_t);
 827      command->arg1 = reg-(char *)command;
 828      command->hit = delim;
 829resume_s:
 830      // get replacement - don't replace escapes yet because \1 and \& need
 831      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 832      end = line;
 833      while (*end != command->hit) {
 834        if (!*end) goto error;
 835        if (*end++ == '\\') {
 836          if (!*end || *end == '\n') {
 837            end[-1] = '\n';
 838            break;
 839          }
 840          end++;
 841        }
 842      }
 843
 844      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 845      line = end;
 846      // line continuation? (note: '\n' can't be a valid delim).
 847      if (*line == command->hit) command->hit = 0;
 848      else {
 849        if (!*line) continue;
 850        reg--;
 851        line++;
 852        goto resume_s;
 853      }
 854
 855      // swap arg1/arg2 so they're back in order arguments occur.
 856      i = command->arg1;
 857      command->arg1 = command->arg2;
 858      command->arg2 = i;
 859
 860      // get flags
 861      for (line++; *line; line++) {
 862        long l;
 863
 864        if (isspace(*line) && *line != '\n') continue;
 865
 866        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 867        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 868          command->sflags |= l << 3;
 869          line--;
 870        } else break;
 871      }
 872
 873      // We deferred actually parsing the regex until we had the s///i flag
 874      // allocating the space was done by extend_string() above
 875      if (!*TT.remember) command->arg1 = 0;
 876      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 877        ((toys.optflags & FLAG_r)*REG_EXTENDED)|((command->sflags&1)*REG_ICASE));
 878      free(TT.remember);
 879      TT.remember = 0;
 880      if (*line == 'w') {
 881        line++;
 882        goto writenow;
 883      }
 884    } else if (c == 'w') {
 885      int fd, delim;
 886      char *cc;
 887
 888      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 889      // eol status, and to retain the filename for error messages, we'd need
 890      // to go up to arg5 just for this. Compromise: dynamically allocate the
 891      // filehandle and eol status.
 892
 893writenow:
 894      while (isspace(*line)) line++;
 895      if (!*line) goto error;
 896      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 897      delim = *cc;
 898      *cc = 0;
 899      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 900      *cc = delim;
 901
 902      command->w = reg - (char *)command;
 903      command = xrealloc(command, command->w+(cc-line)+6);
 904      reg = command->w + (char *)command;
 905
 906      memcpy(reg, &fd, 4);
 907      reg += 4;
 908      *(reg++) = 0;
 909      memcpy(reg, line, delim);
 910      reg += delim;
 911      *(reg++) = 0;
 912
 913      line = cc;
 914      if (delim) line += 2;
 915    } else if (c == 'y') {
 916      char *s, delim = 0;
 917      int len;
 918
 919      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 920      command->arg1 = reg-(char *)command;
 921      len = strlen(s);
 922      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 923      free(s);
 924      command->arg2 = reg-(char *)command;
 925      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 926      if (len != strlen(s)) goto error;
 927      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 928      free(s);
 929    } else if (strchr("abcirtTw:", c)) {
 930      int end;
 931
 932      // trim leading spaces
 933      while (isspace(*line) && *line != '\n') line++;
 934
 935      // Resume logic differs from 's' case because we don't add a newline
 936      // unless it's after something, so we add it on return instead.
 937resume_a:
 938      command->hit = 0;
 939
 940      // btT: end with space or semicolon, aicrw continue to newline.
 941      if (!(end = strcspn(line, strchr(":btT", c) ? "; \t\r\n\v\f" : "\n"))) {
 942        // Argument's optional for btT
 943        if (strchr("btT", c)) continue;
 944        else if (!command->arg1) break;
 945      }
 946
 947      // Extend allocation to include new string. We use offsets instead of
 948      // pointers so realloc() moving stuff doesn't break things. Ok to write
 949      // \n over NUL terminator because call to extend_string() adds it back.
 950      if (!command->arg1) command->arg1 = reg - (char*)command;
 951      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 952      else if (!pline) {
 953        command->arg1 = 0;
 954        continue;
 955      }
 956      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 957
 958      // Recopy data to remove escape sequences and handle line continuation.
 959      if (strchr("aci", c)) {
 960        reg -= end+1;
 961        for (i = end; i; i--) {
 962          if ((*reg++ = *line++)=='\\') {
 963
 964            // escape at end of line: resume if -e escaped literal newline,
 965            // else request callback and resume with next line
 966            if (!--i) {
 967              *--reg = 0;
 968              if (*line) {
 969                line++;
 970                goto resume_a;
 971              }
 972              command->hit = 256;
 973              break;
 974            }
 975            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 976            line++;
 977          }
 978        }
 979        *reg = 0;
 980      } else line += end;
 981
 982    // Commands that take no arguments
 983    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
 984  }
 985
 986error:
 987  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 988}
 989
 990void sed_main(void)
 991{
 992  struct arg_list *al;
 993  char **args = toys.optargs;
 994
 995  // Lie to autoconf when it asks stupid questions, so configure regexes
 996  // that look for "GNU sed version %f" greater than some old buggy number
 997  // don't fail us for not matching their narrow expectations.
 998  if (toys.optflags & FLAG_version) {
 999    xprintf("This is not GNU sed version 9.0\n");
1000    return;
1001  }
1002
1003  // Handling our own --version means we handle our own --help too.
1004  if (toys.optflags&FLAG_help) help_exit(0);
1005
1006  // Parse pattern into commands.
1007
1008  // If no -e or -f, first argument is the pattern.
1009  if (!TT.e && !TT.f) {
1010    if (!*toys.optargs) error_exit("no pattern");
1011    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1012  }
1013
1014  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1015  // so handle all -e, then all -f. (At least the behavior's consistent.)
1016
1017  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1018  parse_pattern(0, 0);
1019  for (al = TT.f; al; al = al->next) do_lines(xopenro(al->arg), parse_pattern);
1020  dlist_terminate(TT.pattern);
1021  if (TT.nextlen) error_exit("no }");  
1022
1023  TT.fdout = 1;
1024  TT.remember = xstrdup("");
1025
1026  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1027  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1028
1029  // Provide EOF flush at end of cumulative input for non-i mode.
1030  if (!(toys.optflags & FLAG_i)) {
1031    toys.optflags |= FLAG_i;
1032    sed_line(0, 0);
1033  }
1034
1035  // todo: need to close fd when done for TOYBOX_FREE?
1036}
1037