toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13 *
  14 * Deviations from POSIX: allow extended regular expressions with -r,
  15 * editing in place with -i, separate with -s, NUL-separated input with -z,
  16 * printf escapes in text, line continuations, semicolons after all commands,
  17 * 2-address anywhere an address is allowed, "T" command, multiline
  18 * continuations for [abc], \; to end [abc] argument before end of line.
  19
  20USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  21
  22config SED
  23  bool "sed"
  24  default y
  25  help
  26    usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  27
  28    Stream editor. Apply editing SCRIPTs to lines of input.
  29
  30    -e  Add SCRIPT to list
  31    -f  Add contents of SCRIPT_FILE to list
  32    -i  Edit each file in place (-iEXT keeps backup file with extension EXT)
  33    -n  No default output (use the p command to output matched lines)
  34    -r  Use extended regular expression syntax
  35    -E  POSIX alias for -r
  36    -s  Treat input files separately (implied by -i)
  37    -z  Use \0 rather than \n as input line separator
  38
  39    A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
  40    All -e SCRIPTs are combined as if separated by newlines, followed by all -f
  41    SCRIPT_FILEs. If no -e or -f then first argument is the SCRIPT.
  42
  43    COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
  44
  45      [ADDRESS[,ADDRESS]][!]COMMAND
  46
  47    ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
  48    last line (-s or -i makes it last line of each file). One address matches one
  49    line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
  50    match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
  51
  52    REGULAR EXPRESSIONS start and end with the same character (anything but
  53    backslash or newline). To use the delimiter in the regex escape it with a
  54    backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
  55    An empty regex repeats the previous one. ADDRESS regexes require any
  56    first delimiter except / to be \escaped to distinguish it from COMMANDs.
  57
  58    Sed reads each line of input, processes it, and writes it out or discards it
  59    before reading the next. Sed can remember one additional line in a separate
  60    buffer (the h, H, g, G, and x commands), and can read the next line of input
  61    early (the n and N commands), but otherwise operates on individual lines.
  62
  63    Each COMMAND starts with a single character. Commands with no arguments are:
  64
  65      !  Run this command when the ADDRESS _didn't_ match.
  66      {  Start new command block, continuing until a corresponding "}".
  67         Command blocks nest and can have ADDRESSes applying to the whole block.
  68      }  End command block (this COMMAND cannot have an address)
  69      d  Delete this line and move on to the next one
  70         (ignores remaining COMMANDs)
  71      D  Delete one line of input and restart command SCRIPT (same as "d"
  72         unless you've glued lines together with "N" or similar)
  73      g  Get remembered line (overwriting current line)
  74      G  Get remembered line (appending to current line)
  75      h  Remember this line (overwriting remembered line)
  76      H  Remember this line (appending to remembered line, if any)
  77      l  Print line escaping \abfrtv (but not \n), octal escape other nonprinting
  78         chars, wrap lines to terminal width with \, append $ to end of line.
  79      n  Print default output and read next line over current line (quit at EOF)
  80      N  Append \n and next line of input to this line. Quit at EOF without
  81         default output. Advances line counter for ADDRESS and "=".
  82      p  Print this line
  83      P  Print this line up to first newline (from "N")
  84      q  Quit (print default output, no more commands processed or lines read)
  85      x  Exchange this line with remembered line (overwrite in both directions)
  86      =  Print the current line number (plus newline)
  87      #  Comment, ignores rest of this line of SCRIPT (until newline)
  88
  89    Commands that take an argument: 
  90
  91      : LABEL    Target for jump commands
  92      a TEXT     Append text to output before reading next line
  93      b LABEL    Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
  94      c TEXT     Delete matching ADDRESS range and output TEXT instead
  95      i TEXT     Insert text (output immediately)
  96      r FILE     Append contents of FILE to output before reading next line.
  97      s/S/R/F    Search for regex S replace match with R using flags F. Delimiter
  98                 is anything but \n or \, escape with \ to use in S or R. Printf
  99                 escapes work. Unescaped & in R becomes full matched text, \1
 100                 through \9 = parenthetical subexpression from S. \ at end of
 101                 line appends next line of SCRIPT. The flags in F are:
 102                 [0-9]    A number N, substitute only Nth match
 103                 g        Global, substitute all matches
 104                 i/I      Ignore case when matching
 105                 p        Print resulting line when match found and replaced
 106                 w [file] Write (append) line to file when match replaced
 107      t LABEL    Test, jump if s/// command matched this line since last test 
 108      T LABEL    Test false, jump to :LABEL only if no s/// found a match
 109      w FILE     Write (append) line to file
 110      y/old/new/ Change each character in 'old' to corresponding character
 111                 in 'new' (with standard backslash escapes, delimiter can be
 112                 any repeated character except \ or \n)
 113
 114    The TEXT arguments (to a c i) may end with an unescaped "\" to append
 115    the next line (leading whitespace is not skipped), and treat ";" as a
 116    literal character (use "\;" instead).
 117*/
 118
 119#define FOR_sed
 120#include "toys.h"
 121
 122GLOBALS(
 123  char *i;
 124  struct arg_list *f, *e;
 125
 126  // processed pattern list
 127  struct double_list *pattern;
 128
 129  char *nextline, *remember;
 130  void *restart, *lastregex;
 131  long nextlen, rememberlen, count;
 132  int fdout, noeol;
 133  unsigned xx;
 134  char delim;
 135)
 136
 137// Linked list of parsed sed commands. Offset fields indicate location where
 138// regex or string starts, ala offset+(char *)struct, because we remalloc()
 139// these to expand them for multiline inputs, and pointers would have to be
 140// individually adjusted.
 141
 142struct sedcmd {
 143  struct sedcmd *next, *prev;
 144
 145  // Begin and end of each match
 146  long lmatch[2]; // line number of match
 147  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 148  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 149  unsigned not, hit;
 150  unsigned sflags; // s///flag bits: i=1, g=2, p=4, x=8
 151  char c; // action
 152};
 153
 154// Write out line with potential embedded NUL, handling eol/noeol
 155static int emit(char *line, long len, int eol)
 156{
 157  int l, old = line[len];
 158
 159  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 160  TT.noeol = !eol;
 161  if (eol) line[len++] = '\n';
 162  if (!len) return 0;
 163  l = writeall(TT.fdout, line, len);
 164  if (eol) line[len-1] = old;
 165  if (l != len) {
 166    if (TT.fdout != 1) perror_msg("short write");
 167
 168    return 1;
 169  }
 170
 171  return 0;
 172}
 173
 174// Extend allocation to include new string, with newline between if newlen<0
 175
 176static char *extend_string(char **old, char *new, int oldlen, int newlen)
 177{
 178  int newline = newlen < 0;
 179  char *s;
 180
 181  if (newline) newlen = -newlen;
 182  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 183  if (newline) s[oldlen++] = '\n';
 184  memcpy(s+oldlen, new, newlen);
 185  s[oldlen+newlen] = 0;
 186
 187  return s+oldlen+newlen+1;
 188}
 189
 190// An empty regex repeats the previous one
 191static void *get_regex(void *command, int offset)
 192{
 193  if (!offset) {
 194    if (!TT.lastregex) error_exit("no previous regex");
 195    return TT.lastregex;
 196  }
 197
 198  return TT.lastregex = offset+(char *)command;
 199}
 200
 201// Apply pattern to line from input file
 202static void sed_line(char **pline, long plen)
 203{
 204  struct append {
 205    struct append *next, *prev;
 206    int file;
 207    char *str;
 208  } *append = 0;
 209  char *line = TT.nextline;
 210  long len = TT.nextlen;
 211  struct sedcmd *command;
 212  int eol = 0, tea = 0;
 213
 214  // Ignore EOF for all files before last unless -i
 215  if (!pline && !FLAG(i) && !FLAG(s)) return;
 216
 217  // Grab next line for deferred processing (EOF detection: we get a NULL
 218  // pline at EOF to flush last line). Note that only end of _last_ input
 219  // file matches $ (unless we're doing -i).
 220  TT.nextline = 0;
 221  TT.nextlen = 0;
 222  if (pline) {
 223    TT.nextline = *pline;
 224    TT.nextlen = plen;
 225    *pline = 0;
 226  }
 227
 228  if (!line || !len) return;
 229  if (line[len-1] == '\n') line[--len] = eol++;
 230  TT.count++;
 231
 232  // The restart-1 is because we added one to make sure it wasn't NULL,
 233  // otherwise N as last command would restart script
 234  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 235  TT.restart = 0;
 236
 237  while (command) {
 238    char *str, c = command->c;
 239
 240    // Have we got a line or regex matching range for this rule?
 241    if (*command->lmatch || *command->rmatch) {
 242      int miss = 0;
 243      long lm;
 244
 245      // In a match that might end?
 246      if (command->hit) {
 247        if (!(lm = command->lmatch[1])) {
 248          if (!command->rmatch[1]) command->hit = 0;
 249          else {
 250            void *rm = get_regex(command, command->rmatch[1]);
 251
 252            // regex match end includes matching line, so defer deactivation
 253            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 254          }
 255        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 256        else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
 257
 258      // Start a new match?
 259      } else {
 260        if (!(lm = *command->lmatch)) {
 261          void *rm = get_regex(command, *command->rmatch);
 262
 263          if (line && !regexec0(rm, line, len, 0, 0, 0))
 264            command->hit = TT.count;
 265        } else if (lm == TT.count || (lm == -1 && !pline))
 266          command->hit = TT.count;
 267
 268        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 269      } 
 270
 271      // Didn't match?
 272      lm = !(command->not^!!command->hit);
 273
 274      // Deferred disable from regex end match
 275      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 276
 277      if (lm) {
 278        // Handle skipping curly bracket command group
 279        if (c == '{') {
 280          int curly = 1;
 281
 282          while (curly) {
 283            command = command->next;
 284            if (command->c == '{') curly++;
 285            if (command->c == '}') curly--;
 286          }
 287        }
 288        command = command->next;
 289        continue;
 290      }
 291    }
 292
 293    // A deleted line can still update line match state for later commands
 294    if (!line) {
 295      command = command->next;
 296      continue;
 297    }
 298
 299    // Process command
 300
 301    if (c=='a' || c=='r') {
 302      struct append *a = xzalloc(sizeof(struct append));
 303      if (command->arg1) a->str = command->arg1+(char *)command;
 304      a->file = c=='r';
 305      dlist_add_nomalloc((void *)&append, (void *)a);
 306    } else if (c=='b' || c=='t' || c=='T') {
 307      int t = tea;
 308
 309      if (c != 'b') tea = 0;
 310      if (c=='b' || t^(c=='T')) {
 311        if (!command->arg1) break;
 312        str = command->arg1+(char *)command;
 313        for (command = (void *)TT.pattern; command; command = command->next)
 314          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 315            break;
 316        if (!command) error_exit("no :%s", str);
 317      }
 318    } else if (c=='c') {
 319      str = command->arg1+(char *)command;
 320      if (!command->hit) emit(str, strlen(str), 1);
 321      free(line);
 322      line = 0;
 323      continue;
 324    } else if (c=='d') {
 325      free(line);
 326      line = 0;
 327      continue;
 328    } else if (c=='D') {
 329      // Delete up to \n or end of buffer
 330      str = line;
 331      while ((str-line)<len) if (*(str++) == '\n') break;
 332      len -= str - line;
 333      memmove(line, str, len);
 334
 335      // if "delete" blanks line, disable further processing
 336      // otherwise trim and restart script
 337      if (!len) {
 338        free(line);
 339        line = 0;
 340      } else {
 341        line[len] = 0;
 342        command = (void *)TT.pattern;
 343      }
 344      continue;
 345    } else if (c=='g') {
 346      free(line);
 347      line = xstrdup(TT.remember);
 348      len = TT.rememberlen;
 349    } else if (c=='G') {
 350      line = xrealloc(line, len+TT.rememberlen+2);
 351      line[len++] = '\n';
 352      memcpy(line+len, TT.remember, TT.rememberlen);
 353      line[len += TT.rememberlen] = 0;
 354    } else if (c=='h') {
 355      free(TT.remember);
 356      TT.remember = xstrdup(line);
 357      TT.rememberlen = len;
 358    } else if (c=='H') {
 359      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 360      TT.remember[TT.rememberlen++] = '\n';
 361      memcpy(TT.remember+TT.rememberlen, line, len);
 362      TT.remember[TT.rememberlen += len] = 0;
 363    } else if (c=='i') {
 364      str = command->arg1+(char *)command;
 365      emit(str, strlen(str), 1);
 366    } else if (c=='l') {
 367      int i, x, off;
 368
 369      if (!TT.xx) {
 370        terminal_size(&TT.xx, 0);
 371        if (!TT.xx) TT.xx = 80;
 372        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 373        if (TT.xx > 4) TT.xx -= 4;
 374      }
 375
 376      for (i = off = 0; i<len; i++) {
 377        if (off >= TT.xx) {
 378          toybuf[off++] = '\\';
 379          emit(toybuf, off, 1);
 380          off = 0;
 381        }
 382        x = stridx("\\\a\b\f\r\t\v", line[i]);
 383        if (x != -1) {
 384          toybuf[off++] = '\\';
 385          toybuf[off++] = "\\abfrtv"[x];
 386        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 387        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 388      }
 389      toybuf[off++] = '$';
 390      emit(toybuf, off, 1);
 391    } else if (c=='n') {
 392      TT.restart = command->next+1;
 393
 394      break;
 395    } else if (c=='N') {
 396      // Can't just grab next line because we could have multiple N and
 397      // we need to actually read ahead to get N;$p EOF detection right.
 398      if (pline) {
 399        TT.restart = command->next+1;
 400        extend_string(&line, TT.nextline, len, -TT.nextlen);
 401        free(TT.nextline);
 402        TT.nextline = line;
 403        TT.nextlen += len + 1;
 404        line = 0;
 405      }
 406
 407      // Pending append goes out right after N
 408      goto done; 
 409    } else if (c=='p' || c=='P') {
 410      char *l = (c=='P') ? strchr(line, '\n') : 0;
 411
 412      if (emit(line, l ? l-line : len, eol)) break;
 413    } else if (c=='q' || c=='Q') {
 414      if (pline) *pline = (void *)1;
 415      free(TT.nextline);
 416      if (!toys.exitval && command->arg1)
 417        toys.exitval = atoi(command->arg1+(char *)command);
 418      TT.nextline = 0;
 419      TT.nextlen = 0;
 420      if (c=='Q') line = 0;
 421
 422      break;
 423    } else if (c=='s') {
 424      char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
 425      regmatch_t *match = (void *)toybuf;
 426      regex_t *reg = get_regex(command, command->arg1);
 427      int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
 428        mlen, off, newlen;
 429
 430      // Loop finding match in remaining line (up to remaining len)
 431      while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
 432        mflags = REG_NOTBOL;
 433
 434        // Zero length matches don't count immediately after a previous match
 435        mlen = match[0].rm_eo-match[0].rm_so;
 436        if (!mlen && !zmatch) {
 437          if (rline-line == len) break;
 438          l2[l2used++] = *rline++;
 439          zmatch++;
 440          continue;
 441        } else zmatch = 0;
 442
 443        // If we're replacing only a specific match, skip if this isn't it
 444        off = command->sflags>>4;
 445        if (off && off != ++count) {
 446          if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
 447          l2used += match[0].rm_eo;
 448          rline += match[0].rm_eo;
 449
 450          continue;
 451        }
 452        // The fact getline() can allocate unbounded amounts of memory is
 453        // a bigger issue, but while we're here check for integer overflow
 454        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 455
 456        // newlen = strlen(new) but with \1 and & and printf escapes
 457        for (off = newlen = 0; new[off]; off++) {
 458          int cc = -1;
 459
 460          if (new[off] == '&') cc = 0;
 461          else if (new[off] == '\\') cc = new[++off] - '0';
 462          if (cc < 0 || cc > 9) {
 463            newlen++;
 464            continue;
 465          }
 466          newlen += match[cc].rm_eo-match[cc].rm_so;
 467        }
 468
 469        // Copy changed data to new string
 470
 471        // Adjust allocation size of new string, copy data we know we'll keep
 472        l2l += newlen-mlen;
 473        if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
 474        if (match[0].rm_so) {
 475          memcpy(l2+l2used, rline, match[0].rm_so);
 476          l2used += match[0].rm_so;
 477        }
 478
 479        // copy in new replacement text
 480        for (off = mlen = 0; new[off]; off++) {
 481          int cc = 0, ll;
 482
 483          if (new[off] == '\\') {
 484            cc = new[++off] - '0';
 485            if (cc<0 || cc>9) {
 486              if (!(l2[l2used+mlen++] = unescape(new[off])))
 487                l2[l2used+mlen-1] = new[off];
 488
 489              continue;
 490            } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
 491          } else if (new[off] != '&') {
 492            l2[l2used+mlen++] = new[off];
 493
 494            continue;
 495          }
 496
 497          if (match[cc].rm_so != -1) {
 498            ll = match[cc].rm_eo-match[cc].rm_so;
 499            memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
 500            mlen += ll;
 501          }
 502        }
 503        l2used += newlen;
 504        rline += match[0].rm_eo;
 505
 506        // Stop after first substitution unless we have flag g
 507        if (!(command->sflags & 2)) break;
 508      }
 509
 510      // If we made any changes, finish off l2 and swap it for line
 511      if (l2) {
 512        // grab trailing unmatched data and null terminator, swap with original
 513        mlen = len-(rline-line);
 514        memcpy(l2+l2used, rline, mlen+1);
 515        len = l2used + mlen;
 516        free(line);
 517        line = l2;
 518      }
 519
 520      if (mflags) {
 521        // flag p
 522        if (command->sflags & 4) emit(line, len, eol);
 523
 524        tea = 1;
 525        if (command->w) goto writenow;
 526      }
 527    } else if (c=='w') {
 528      int fd, noeol;
 529      char *name;
 530
 531writenow:
 532      // Swap out emit() context
 533      fd = TT.fdout;
 534      noeol = TT.noeol;
 535
 536      // We save filehandle and newline status before filename
 537      name = command->w + (char *)command;
 538      memcpy(&TT.fdout, name, 4);
 539      name += 4;
 540      TT.noeol = *(name++);
 541
 542      // write, then save/restore context
 543      if (emit(line, len, eol))
 544        perror_exit("w '%s'", command->arg1+(char *)command);
 545      *(--name) = TT.noeol;
 546      TT.noeol = noeol;
 547      TT.fdout = fd;
 548    } else if (c=='x') {
 549      long swap = TT.rememberlen;
 550
 551      str = TT.remember;
 552      TT.remember = line;
 553      line = str;
 554      TT.rememberlen = len;
 555      len = swap;
 556    } else if (c=='y') {
 557      char *from, *to = (char *)command;
 558      int i, j;
 559
 560      from = to+command->arg1;
 561      to += command->arg2;
 562
 563      for (i = 0; i < len; i++) {
 564        j = stridx(from, line[i]);
 565        if (j != -1) line[i] = to[j];
 566      }
 567    } else if (c=='=') {
 568      sprintf(toybuf, "%ld", TT.count);
 569      if (emit(toybuf, strlen(toybuf), 1)) break;
 570    }
 571
 572    command = command->next;
 573  }
 574
 575  if (line && !FLAG(n)) emit(line, len, eol);
 576
 577done:
 578  if (dlist_terminate(append)) while (append) {
 579    struct append *a = append->next;
 580
 581    if (append->file) {
 582      int fd = open(append->str, O_RDONLY);
 583
 584      // Force newline if noeol pending
 585      if (fd != -1) {
 586        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 587        TT.noeol = 0;
 588        xsendfile(fd, TT.fdout);
 589        close(fd);
 590      }
 591    } else if (append->str) emit(append->str, strlen(append->str), 1);
 592    else emit(line, 0, 0);
 593    free(append);
 594    append = a;
 595  }
 596  free(line);
 597}
 598
 599// Callback called on each input file
 600static void do_sed_file(int fd, char *name)
 601{
 602  char *tmp, *s;
 603
 604  if (FLAG(i)) {
 605    if (!fd) return error_msg("-i on stdin");
 606    TT.fdout = copy_tempfile(fd, name, &tmp);
 607  }
 608  if (FLAG(i) || FLAG(s)) {
 609    struct sedcmd *command;
 610
 611    TT.count = 0;
 612    for (command = (void *)TT.pattern; command; command = command->next)
 613      command->hit = 0;
 614  }
 615  do_lines(fd, TT.delim, sed_line);
 616  if (FLAG(i)) {
 617    if (TT.i && *TT.i) {
 618      xrename(name, s = xmprintf("%s%s", name, TT.i));
 619      free(s);
 620    }
 621    replace_tempfile(-1, TT.fdout, &tmp);
 622    TT.fdout = 1;
 623  }
 624  if (FLAG(i) || FLAG(s)) {
 625    TT.nextline = 0;
 626    TT.nextlen = TT.noeol = 0;
 627  }
 628}
 629
 630// Copy chunk of string between two delimiters, converting printf escapes.
 631// returns processed copy of string (0 if error), *pstr advances to next
 632// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 633// if regxex, ignore delimiter in [ranges]
 634static char *unescape_delimited_string(char **pstr, char *delim)
 635{
 636  char *to, *from, mode = 0, d;
 637
 638  // Grab leading delimiter (if necessary), allocate space for new string
 639  from = *pstr;
 640  if (!delim || !*delim) {
 641    if (!(d = *(from++))) return 0;
 642    if (d == '\\') d = *(from++);
 643    if (!d || d == '\\') return 0;
 644    if (delim) *delim = d;
 645  } else d = *delim;
 646  to = delim = xmalloc(strlen(*pstr)+1);
 647
 648  while (mode || *from != d) {
 649    if (!*from) return 0;
 650
 651    // delimiter in regex character range doesn't count
 652    if (*from == '[') {
 653      if (!mode) {
 654        mode = ']';
 655        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 656      } else if (mode == ']' && strchr(".=:", from[1])) {
 657        *(to++) = *(from++);
 658        mode = *from;
 659      }
 660    } else if (*from == mode) {
 661      if (mode == ']') mode = 0;
 662      else {
 663        *(to++) = *(from++);
 664        mode = ']';
 665      }
 666    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 667    // but the perl build does it, so we need to filter it out.
 668    } else if (mode && *from == '-' && from[-1] == from[1]) {
 669      from+=2;
 670      continue;
 671    } else if (*from == '\\') {
 672      if (!from[1]) return 0;
 673
 674      // Check escaped end delimiter before printf style escapes.
 675      if (from[1] == d) from++;
 676      else if (from[1]=='\\') *(to++) = *(from++);
 677      else {
 678        char c = unescape(from[1]);
 679
 680        if (c) {
 681          *(to++) = c;
 682          from+=2;
 683          continue;
 684        } else if (!mode) *(to++) = *(from++);
 685      }
 686    }
 687    *(to++) = *(from++);
 688  }
 689  *to = 0;
 690  *pstr = from+1;
 691
 692  return delim;
 693}
 694
 695// Translate pattern strings into command structures. Each command structure
 696// is a single allocation (which requires some math and remalloc at times).
 697static void parse_pattern(char **pline, long len)
 698{
 699  struct sedcmd *command = (void *)TT.pattern;
 700  char *line, *reg, c, *errstart;
 701  int i;
 702
 703  line = errstart = pline ? *pline : "";
 704  if (len && line[len-1]=='\n') line[--len] = 0;
 705
 706  // Append this line to previous multiline command? (hit indicates type.)
 707  // During parsing "hit" stores data about line continuations, but in
 708  // sed_line() it means the match range attached to this command
 709  // is active, so processing the continuation must zero it again.
 710  if (command && command->prev->hit) {
 711    // Remove half-finished entry from list so remalloc() doesn't confuse it
 712    TT.pattern = TT.pattern->prev;
 713    command = dlist_pop(&TT.pattern);
 714    c = command->c;
 715    reg = (char *)command;
 716    reg += command->arg1 + strlen(reg + command->arg1);
 717
 718    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 719    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 720    // a unicode character.
 721    if (command->hit < 256) goto resume_s;
 722    else goto resume_a;
 723  }
 724
 725  // Loop through commands in this line.
 726
 727  command = 0;
 728  for (;;) {
 729    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 730
 731    // If there's no more data on this line, return.
 732    for (;;) {
 733      while (isspace(*line) || *line == ';') line++;
 734      if (*line == '#') while (*line && *line != '\n') line++;
 735      else break;
 736    }
 737    if (!*line) return;
 738
 739    // Start by writing data into toybuf.
 740
 741    errstart = line;
 742    memset(toybuf, 0, sizeof(struct sedcmd));
 743    command = (void *)toybuf;
 744    reg = toybuf + sizeof(struct sedcmd);
 745
 746    // Parse address range (if any)
 747    for (i = 0; i < 2; i++) {
 748      if (*line == ',') line++;
 749      else if (i) break;
 750
 751      if (i && *line == '+' && isdigit(line[1])) {
 752        line++;
 753        command->lmatch[i] = -2-strtol(line, &line, 0);
 754      } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 755      else if (*line == '$') {
 756        command->lmatch[i] = -1;
 757        line++;
 758      } else if (*line == '/' || *line == '\\') {
 759        char *s = line;
 760
 761        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 762        if (!*s) command->rmatch[i] = 0;
 763        else {
 764          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
 765          command->rmatch[i] = reg-toybuf;
 766          reg += sizeof(regex_t);
 767        }
 768        free(s);
 769      } else break;
 770    }
 771
 772    while (isspace(*line)) line++;
 773    if (!*line) break;
 774
 775    if (*line == '!') {
 776      command->not = 1;
 777      line++;
 778    }
 779    while (isspace(*line)) line++;
 780    if (!*line) break;
 781
 782    c = command->c = *(line++);
 783    if (strchr("}:", c) && i) break;
 784    if (strchr("aiqQr=", c) && i>1) break;
 785
 786    // Allocate memory and copy out of toybuf now that we know how big it is
 787    command = xmemdup(toybuf, reg-toybuf);
 788    reg = (reg-toybuf) + (char *)command;
 789
 790    // Parse arguments by command type
 791    if (c == '{') TT.nextlen++;
 792    else if (c == '}') {
 793      if (!TT.nextlen--) break;
 794    } else if (c == 's') {
 795      char *end, delim = 0;
 796      int flags;
 797
 798      // s/pattern/replacement/flags
 799
 800      // line continuations use arg1 (back at the start of the function),
 801      // so let's fill out arg2 first (since the regex part can't be multiple
 802      // lines) and swap them back later.
 803
 804      // get pattern (just record, we parse it later)
 805      command->arg2 = reg - (char *)command;
 806      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 807        goto error;
 808
 809      reg += sizeof(regex_t);
 810      command->arg1 = reg-(char *)command;
 811      command->hit = delim;
 812resume_s:
 813      // get replacement - don't replace escapes yet because \1 and \& need
 814      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 815      end = line;
 816      while (*end != command->hit) {
 817        if (!*end) goto error;
 818        if (*end++ == '\\') {
 819          if (!*end || *end == '\n') {
 820            end[-1] = '\n';
 821            break;
 822          }
 823          end++;
 824        }
 825      }
 826
 827      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 828      line = end;
 829      // line continuation? (note: '\n' can't be a valid delim).
 830      if (*line == command->hit) command->hit = 0;
 831      else {
 832        if (!*line) continue;
 833        reg--;
 834        line++;
 835        goto resume_s;
 836      }
 837
 838      // swap arg1/arg2 so they're back in order arguments occur.
 839      i = command->arg1;
 840      command->arg1 = command->arg2;
 841      command->arg2 = i;
 842
 843      // get flags
 844      for (line++; *line; line++) {
 845        long l;
 846
 847        if (isspace(*line) && *line != '\n') continue;
 848
 849        if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
 850        else if (*line == 'I') command->sflags |= 1<<0;
 851        else if (!(command->sflags>>4) && 0<(l = strtol(line, &line, 10))) {
 852          command->sflags |= l << 4;
 853          line--;
 854        } else break;
 855      }
 856      flags = (FLAG(r) || (command->sflags&8)) ? REG_EXTENDED : 0;
 857      if (command->sflags&1) flags |= REG_ICASE;
 858
 859      // We deferred actually parsing the regex until we had the s///i flag
 860      // allocating the space was done by extend_string() above
 861      if (!*TT.remember) command->arg1 = 0;
 862      else xregcomp((void *)(command->arg1+(char *)command),TT.remember,flags);
 863      free(TT.remember);
 864      TT.remember = 0;
 865      if (*line == 'w') {
 866        line++;
 867        goto writenow;
 868      }
 869    } else if (c == 'w') {
 870      int fd, delim;
 871      char *cc;
 872
 873      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 874      // eol status, and to retain the filename for error messages, we'd need
 875      // to go up to arg5 just for this. Compromise: dynamically allocate the
 876      // filehandle and eol status.
 877
 878writenow:
 879      while (isspace(*line)) line++;
 880      if (!*line) goto error;
 881      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 882      delim = *cc;
 883      *cc = 0;
 884      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0644);
 885      *cc = delim;
 886
 887      command->w = reg - (char *)command;
 888      command = xrealloc(command, command->w+(cc-line)+6);
 889      reg = command->w + (char *)command;
 890
 891      memcpy(reg, &fd, 4);
 892      reg += 4;
 893      *(reg++) = 0;
 894      memcpy(reg, line, delim);
 895      reg += delim;
 896      *(reg++) = 0;
 897
 898      line = cc;
 899      if (delim) line += 2;
 900    } else if (c == 'y') {
 901      char *s, delim = 0;
 902      int len;
 903
 904      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 905      command->arg1 = reg-(char *)command;
 906      len = strlen(s);
 907      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 908      free(s);
 909      command->arg2 = reg-(char *)command;
 910      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 911      if (len != strlen(s)) goto error;
 912      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 913      free(s);
 914    } else if (strchr("abcirtTqQw:", c)) {
 915      int end;
 916
 917      // trim leading spaces
 918      while (isspace(*line) && *line != '\n') line++;
 919
 920      // Resume logic differs from 's' case because we don't add a newline
 921      // unless it's after something, so we add it on return instead.
 922resume_a:
 923      command->hit = 0;
 924
 925      // btTqQ: end with space or semicolon, aicrw continue to newline.
 926      if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
 927        // Argument's optional for btTqQ
 928        if (strchr("btTqQ", c)) continue;
 929        else if (!command->arg1) break;
 930      }
 931      // Error checking: qQ can only have digits after them
 932      if (c=='q' || c=='Q') {
 933        for (i = 0; i<end && isdigit(line[i]); i++);
 934        if (i != end) {
 935          line += i;
 936          break;
 937        }
 938      }
 939
 940      // Extend allocation to include new string. We use offsets instead of
 941      // pointers so realloc() moving stuff doesn't break things. Ok to write
 942      // \n over NUL terminator because call to extend_string() adds it back.
 943      if (!command->arg1) command->arg1 = reg - (char*)command;
 944      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 945      else if (!pline) {
 946        command->arg1 = 0;
 947        continue;
 948      }
 949      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 950
 951      // Recopy data to remove escape sequences and handle line continuation.
 952      if (strchr("aci", c)) {
 953        reg -= end+1;
 954        for (i = end; i; i--) {
 955          if ((*reg++ = *line++)=='\\') {
 956
 957            // escape at end of line: resume if -e escaped literal newline,
 958            // else request callback and resume with next line
 959            if (!--i) {
 960              *--reg = 0;
 961              if (*line) {
 962                line++;
 963                goto resume_a;
 964              }
 965              command->hit = 256;
 966              break;
 967            }
 968            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 969            line++;
 970          }
 971        }
 972        *reg = 0;
 973      } else line += end;
 974
 975    // Commands that take no arguments
 976    } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
 977  }
 978
 979error:
 980  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 981}
 982
 983void sed_main(void)
 984{
 985  struct arg_list *al;
 986  char **args = toys.optargs;
 987
 988  if (!FLAG(z)) TT.delim = '\n';
 989
 990  // Lie to autoconf when it asks stupid questions, so configure regexes
 991  // that look for "GNU sed version %f" greater than some old buggy number
 992  // don't fail us for not matching their narrow expectations.
 993  if (FLAG(version)) {
 994    xprintf("This is not GNU sed version 9.0\n");
 995    return;
 996  }
 997
 998  // Handling our own --version means we handle our own --help too.
 999  if (FLAG(help)) help_exit(0);
1000
1001  // Parse pattern into commands.
1002
1003  // If no -e or -f, first argument is the pattern.
1004  if (!TT.e && !TT.f) {
1005    if (!*toys.optargs) error_exit("no pattern");
1006    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1007  }
1008
1009  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1010  // so handle all -e, then all -f. (At least the behavior's consistent.)
1011
1012  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1013  parse_pattern(0, 0);
1014  for (al = TT.f; al; al = al->next)
1015    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1016  dlist_terminate(TT.pattern);
1017  if (TT.nextlen) error_exit("no }");  
1018
1019  TT.fdout = 1;
1020  TT.remember = xstrdup("");
1021
1022  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1023  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1024
1025  // Provide EOF flush at end of cumulative input for non-i mode.
1026  if (!FLAG(i) && !FLAG(s)) {
1027    toys.optflags |= FLAG_s;
1028    sed_line(0, 0);
1029  }
1030
1031  // todo: need to close fd when done for TOYBOX_FREE?
1032}
1033