toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13
  14USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  15
  16config SED
  17  bool "sed"
  18  default y
  19  help
  20    usage: sed [-inrzE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  21
  22    Stream editor. Apply one or more editing SCRIPTs to each line of input
  23    (from FILE or stdin) producing output (by default to stdout).
  24
  25    -e  Add SCRIPT to list
  26    -f  Add contents of SCRIPT_FILE to list
  27    -i  Edit each file in place (-iEXT keeps backup file with extension EXT)
  28    -n  No default output (use the p command to output matched lines)
  29    -r  Use extended regular expression syntax
  30    -E  POSIX alias for -r
  31    -s  Treat input files separately (implied by -i)
  32    -z  Use \0 rather than \n as the input line separator
  33
  34    A SCRIPT is a series of one or more COMMANDs separated by newlines or
  35    semicolons. All -e SCRIPTs are concatenated together as if separated
  36    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
  37    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
  38
  39    Each COMMAND may be preceded by an address which limits the command to
  40    apply only to the specified line(s). Commands without an address apply to
  41    every line. Addresses are of the form:
  42
  43      [ADDRESS[,ADDRESS]][!]COMMAND
  44
  45    The ADDRESS may be a decimal line number (starting at 1), a /regular
  46    expression/ within a pair of forward slashes, or the character "$" which
  47    matches the last line of input. (In -s or -i mode this matches the last
  48    line of each file, otherwise just the last line of the last file.) A single
  49    address matches one line, a pair of comma separated addresses match
  50    everything from the first address to the second address (inclusive). If
  51    both addresses are regular expressions, more than one range of lines in
  52    each file can match. The second address can be +N to end N lines later.
  53
  54    REGULAR EXPRESSIONS in sed are started and ended by the same character
  55    (traditionally / but anything except a backslash or a newline works).
  56    Backslashes may be used to escape the delimiter if it occurs in the
  57    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
  58    and unicode). An empty regex repeats the previous one. ADDRESS regexes
  59    (above) require the first delimiter to be escaped with a backslash when
  60    it isn't a forward slash (to distinguish it from the COMMANDs below).
  61
  62    Sed mostly operates on individual lines one at a time. It reads each line,
  63    processes it, and either writes it to the output or discards it before
  64    reading the next line. Sed can remember one additional line in a separate
  65    buffer (using the h, H, g, G, and x commands), and can read the next line
  66    of input early (using the n and N command), but other than that command
  67    scripts operate on individual lines of text.
  68
  69    Each COMMAND starts with a single character. The following commands take
  70    no arguments:
  71
  72      !  Run this command when the test _didn't_ match.
  73
  74      {  Start a new command block, continuing until a corresponding "}".
  75         Command blocks may nest. If the block has an address, commands within
  76         the block are only run for lines within the block's address range.
  77
  78      }  End command block (this command cannot have an address)
  79
  80      d  Delete this line and move on to the next one
  81         (ignores remaining COMMANDs)
  82
  83      D  Delete one line of input and restart command SCRIPT (same as "d"
  84         unless you've glued lines together with "N" or similar)
  85
  86      g  Get remembered line (overwriting current line)
  87
  88      G  Get remembered line (appending to current line)
  89
  90      h  Remember this line (overwriting remembered line)
  91
  92      H  Remember this line (appending to remembered line, if any)
  93
  94      l  Print line, escaping \abfrtv (but not newline), octal escaping other
  95         nonprintable characters, wrapping lines to terminal width with a
  96         backslash, and appending $ to actual end of line.
  97
  98      n  Print default output and read next line, replacing current line
  99         (If no next line available, quit processing script)
 100
 101      N  Append next line of input to this line, separated by a newline
 102         (This advances the line counter for address matching and "=", if no
 103         next line available quit processing script without default output)
 104
 105      p  Print this line
 106
 107      P  Print this line up to first newline (from "N")
 108
 109      q  Quit (print default output, no more commands processed or lines read)
 110
 111      x  Exchange this line with remembered line (overwrite in both directions)
 112
 113      =  Print the current line number (followed by a newline)
 114
 115    The following commands (may) take an argument. The "text" arguments (to
 116    the "a", "i", and "c" commands) may end with an unescaped "\" to append
 117    the next line (for which leading whitespace is not skipped), and also
 118    treat ";" as a literal character (use "\;" instead).
 119
 120      a [text]   Append text to output before attempting to read next line
 121
 122      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
 123
 124      c [text]   Delete line, output text at end of matching address range
 125                 (ignores remaining COMMANDs)
 126
 127      i [text]   Print text
 128
 129      r [file]   Append contents of file to output before attempting to read
 130                 next line.
 131
 132      s/S/R/F    Search for regex S, replace matched text with R using flags F.
 133                 The first character after the "s" (anything but newline or
 134                 backslash) is the delimiter, escape with \ to use normally.
 135
 136                 The replacement text may contain "&" to substitute the matched
 137                 text (escape it with backslash for a literal &), or \1 through
 138                 \9 to substitute a parenthetical subexpression in the regex.
 139                 You can also use the normal backslash escapes such as \n and
 140                 a backslash at the end of the line appends the next line.
 141
 142                 The flags are:
 143
 144                 [0-9]    A number, substitute only that occurrence of pattern
 145                 g        Global, substitute all occurrences of pattern
 146                 i        Ignore case when matching
 147                 p        Print the line if match was found and replaced
 148                 w [file] Write (append) line to file if match replaced
 149
 150      t [label]  Test, jump to :label only if an "s" command found a match in
 151                 this line since last test (replacing with same text counts)
 152
 153      T [label]  Test false, jump only if "s" hasn't found a match.
 154
 155      w [file]   Write (append) line to file
 156
 157      y/old/new/ Change each character in 'old' to corresponding character
 158                 in 'new' (with standard backslash escapes, delimiter can be
 159                 any repeated character except \ or \n)
 160
 161      : [label]  Labeled target for jump commands
 162
 163      #  Comment, ignore rest of this line of SCRIPT
 164
 165    Deviations from POSIX: allow extended regular expressions with -r,
 166    editing in place with -i, separate with -s, NUL-separated input with -z,
 167    printf escapes in text, line continuations, semicolons after all commands,
 168    2-address anywhere an address is allowed, "T" command, multiline
 169    continuations for [abc], \; to end [abc] argument before end of line.
 170*/
 171
 172#define FOR_sed
 173#include "toys.h"
 174
 175GLOBALS(
 176  char *i;
 177  struct arg_list *f, *e;
 178
 179  // processed pattern list
 180  struct double_list *pattern;
 181
 182  char *nextline, *remember;
 183  void *restart, *lastregex;
 184  long nextlen, rememberlen, count;
 185  int fdout, noeol;
 186  unsigned xx;
 187  char delim;
 188)
 189
 // One parsed sed command, kept in a linked list. Arguments live in the same
 // allocation as the struct and are recorded as byte offsets from its start
 // (use offset+(char *)command) instead of pointers, because the allocation
 // is realloc()ed to grow it for multiline input and pointers into it would
 // each need fixing up afterwards. An offset of zero means "not present".

 struct sedcmd {
   struct sedcmd *next;
   struct sedcmd *prev;

   // Start and end of the address range this command applies to.
   // lmatch holds line numbers: -1 means "$" (last line) and -2-N in the
   // second slot means "+N" (N lines after the start).
   long lmatch[2];
   // rmatch holds offsets of compiled regexes for /abc/,/def/ addresses.
   int rmatch[2];

   // Offsets of the command's two arguments.
   int arg1;
   int arg2;
   // Offset of the s///w output file record.
   int w;

   // Set by a "!" after the address: run when the address does NOT match.
   unsigned not;
   // While running: nonzero (the line count where the range started) when
   // the address range is active. While parsing: line continuation state.
   unsigned hit;
   // s/// flag bits: i=1, g=2, p=4; the occurrence number is stored
   // shifted left by 3.
   unsigned sflags;

   // Command character.
   char c;
 };
 206
 207// Write out line with potential embedded NUL, handling eol/noeol
 208static int emit(char *line, long len, int eol)
 209{
 210  int l, old = line[len];
 211
 212  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 213  TT.noeol = !eol;
 214  if (eol) line[len++] = '\n';
 215  if (!len) return 0;
 216  l = writeall(TT.fdout, line, len);
 217  if (eol) line[len-1] = old;
 218  if (l != len) {
 219    if (TT.fdout != 1) perror_msg("short write");
 220
 221    return 1;
 222  }
 223
 224  return 0;
 225}
 226
 // Append a copy of new to the allocation at *old, which currently holds
 // oldlen bytes of data. A negative newlen means "insert a newline between
 // the old data and the new, then append -newlen bytes". The allocation is
 // grown with xrealloc(), *old is updated to the (possibly moved) buffer and
 // the result is NUL terminated.
 //
 // Returns a pointer to the byte just past that NUL terminator.

 static char *extend_string(char **old, char *new, int oldlen, int newlen)
 {
   int separator = 0, end = oldlen;
   char *buf;

   if (newlen < 0) {
     separator = 1;
     newlen = -newlen;
   }

   buf = xrealloc(*old, oldlen+newlen+separator+1);
   *old = buf;
   if (separator) buf[end++] = '\n';
   memcpy(buf+end, new, newlen);
   end += newlen;
   buf[end] = 0;

   return buf+end+1;
 }
 242
 243// An empty regex repeats the previous one
 244static void *get_regex(void *command, int offset)
 245{
 246  if (!offset) {
 247    if (!TT.lastregex) error_exit("no previous regex");
 248    return TT.lastregex;
 249  }
 250
 251  return TT.lastregex = offset+(char *)command;
 252}
 253
 254// Apply pattern to line from input file
 255static void sed_line(char **pline, long plen)
 256{
 257  struct append {
 258    struct append *next, *prev;
 259    int file;
 260    char *str;
 261  } *append = 0;
 262  char *line = TT.nextline;
 263  long len = TT.nextlen;
 264  struct sedcmd *command;
 265  int eol = 0, tea = 0;
 266
 267  // Ignore EOF for all files before last unless -i
 268  if (!pline && !FLAG(i)) return;
 269
 270  // Grab next line for deferred processing (EOF detection: we get a NULL
 271  // pline at EOF to flush last line). Note that only end of _last_ input
 272  // file matches $ (unless we're doing -i).
 273  TT.nextline = 0;
 274  TT.nextlen = 0;
 275  if (pline) {
 276    TT.nextline = *pline;
 277    TT.nextlen = plen;
 278    *pline = 0;
 279  }
 280
 281  if (!line || !len) return;
 282  if (line[len-1] == '\n') line[--len] = eol++;
 283  TT.count++;
 284
 285  // The restart-1 is because we added one to make sure it wasn't NULL,
 286  // otherwise N as last command would restart script
 287  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 288  TT.restart = 0;
 289
 290  while (command) {
 291    char *str, c = command->c;
 292
 293    // Have we got a line or regex matching range for this rule?
 294    if (*command->lmatch || *command->rmatch) {
 295      int miss = 0;
 296      long lm;
 297
 298      // In a match that might end?
 299      if (command->hit) {
 300        if (!(lm = command->lmatch[1])) {
 301          if (!command->rmatch[1]) command->hit = 0;
 302          else {
 303            void *rm = get_regex(command, command->rmatch[1]);
 304
 305            // regex match end includes matching line, so defer deactivation
 306            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 307          }
 308        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 309        else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
 310
 311      // Start a new match?
 312      } else {
 313        if (!(lm = *command->lmatch)) {
 314          void *rm = get_regex(command, *command->rmatch);
 315
 316          if (line && !regexec0(rm, line, len, 0, 0, 0))
 317            command->hit = TT.count;
 318        } else if (lm == TT.count || (lm == -1 && !pline))
 319          command->hit = TT.count;
 320
 321        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 322      } 
 323
 324      // Didn't match?
 325      lm = !(command->not^!!command->hit);
 326
 327      // Deferred disable from regex end match
 328      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 329
 330      if (lm) {
 331        // Handle skipping curly bracket command group
 332        if (c == '{') {
 333          int curly = 1;
 334
 335          while (curly) {
 336            command = command->next;
 337            if (command->c == '{') curly++;
 338            if (command->c == '}') curly--;
 339          }
 340        }
 341        command = command->next;
 342        continue;
 343      }
 344    }
 345
 346    // A deleted line can still update line match state for later commands
 347    if (!line) {
 348      command = command->next;
 349      continue;
 350    }
 351
 352    // Process command
 353
 354    if (c=='a' || c=='r') {
 355      struct append *a = xzalloc(sizeof(struct append));
 356      if (command->arg1) a->str = command->arg1+(char *)command;
 357      a->file = c=='r';
 358      dlist_add_nomalloc((void *)&append, (void *)a);
 359    } else if (c=='b' || c=='t' || c=='T') {
 360      int t = tea;
 361
 362      if (c != 'b') tea = 0;
 363      if (c=='b' || t^(c=='T')) {
 364        if (!command->arg1) break;
 365        str = command->arg1+(char *)command;
 366        for (command = (void *)TT.pattern; command; command = command->next)
 367          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 368            break;
 369        if (!command) error_exit("no :%s", str);
 370      }
 371    } else if (c=='c') {
 372      str = command->arg1+(char *)command;
 373      if (!command->hit) emit(str, strlen(str), 1);
 374      free(line);
 375      line = 0;
 376      continue;
 377    } else if (c=='d') {
 378      free(line);
 379      line = 0;
 380      continue;
 381    } else if (c=='D') {
 382      // Delete up to \n or end of buffer
 383      str = line;
 384      while ((str-line)<len) if (*(str++) == '\n') break;
 385      len -= str - line;
 386      memmove(line, str, len);
 387
 388      // if "delete" blanks line, disable further processing
 389      // otherwise trim and restart script
 390      if (!len) {
 391        free(line);
 392        line = 0;
 393      } else {
 394        line[len] = 0;
 395        command = (void *)TT.pattern;
 396      }
 397      continue;
 398    } else if (c=='g') {
 399      free(line);
 400      line = xstrdup(TT.remember);
 401      len = TT.rememberlen;
 402    } else if (c=='G') {
 403      line = xrealloc(line, len+TT.rememberlen+2);
 404      line[len++] = '\n';
 405      memcpy(line+len, TT.remember, TT.rememberlen);
 406      line[len += TT.rememberlen] = 0;
 407    } else if (c=='h') {
 408      free(TT.remember);
 409      TT.remember = xstrdup(line);
 410      TT.rememberlen = len;
 411    } else if (c=='H') {
 412      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 413      TT.remember[TT.rememberlen++] = '\n';
 414      memcpy(TT.remember+TT.rememberlen, line, len);
 415      TT.remember[TT.rememberlen += len] = 0;
 416    } else if (c=='i') {
 417      str = command->arg1+(char *)command;
 418      emit(str, strlen(str), 1);
 419    } else if (c=='l') {
 420      int i, x, off;
 421
 422      if (!TT.xx) {
 423        terminal_size(&TT.xx, 0);
 424        if (!TT.xx) TT.xx = 80;
 425        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 426        if (TT.xx > 4) TT.xx -= 4;
 427      }
 428
 429      for (i = off = 0; i<len; i++) {
 430        if (off >= TT.xx) {
 431          toybuf[off++] = '\\';
 432          emit(toybuf, off, 1);
 433          off = 0;
 434        }
 435        x = stridx("\\\a\b\f\r\t\v", line[i]);
 436        if (x != -1) {
 437          toybuf[off++] = '\\';
 438          toybuf[off++] = "\\abfrtv"[x];
 439        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 440        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 441      }
 442      toybuf[off++] = '$';
 443      emit(toybuf, off, 1);
 444    } else if (c=='n') {
 445      TT.restart = command->next+1;
 446
 447      break;
 448    } else if (c=='N') {
 449      // Can't just grab next line because we could have multiple N and
 450      // we need to actually read ahead to get N;$p EOF detection right.
 451      if (pline) {
 452        TT.restart = command->next+1;
 453        extend_string(&line, TT.nextline, len, -TT.nextlen);
 454        free(TT.nextline);
 455        TT.nextline = line;
 456        TT.nextlen += len + 1;
 457        line = 0;
 458      }
 459
 460      // Pending append goes out right after N
 461      goto done; 
 462    } else if (c=='p' || c=='P') {
 463      char *l = (c=='P') ? strchr(line, '\n') : 0;
 464
 465      if (emit(line, l ? l-line : len, eol)) break;
 466    } else if (c=='q' || c=='Q') {
 467      if (pline) *pline = (void *)1;
 468      free(TT.nextline);
 469      if (!toys.exitval && command->arg1)
 470        toys.exitval = atoi(command->arg1+(char *)command);
 471      TT.nextline = 0;
 472      TT.nextlen = 0;
 473      if (c=='Q') line = 0;
 474
 475      break;
 476    } else if (c=='s') {
 477      char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
 478      regmatch_t *match = (void *)toybuf;
 479      regex_t *reg = get_regex(command, command->arg1);
 480      int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
 481        mlen, off, newlen;
 482
 483      // Loop finding match in remaining line (up to remaining len)
 484      while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
 485        mflags = REG_NOTBOL;
 486
 487        // Zero length matches don't count immediately after a previous match
 488        mlen = match[0].rm_eo-match[0].rm_so;
 489        if (!mlen && !zmatch) {
 490          if (rline-line == len) break;
 491          l2[l2used++] = *rline++;
 492          zmatch++;
 493          continue;
 494        } else zmatch = 0;
 495
 496        // If we're replacing only a specific match, skip if this isn't it
 497        off = command->sflags>>3;
 498        if (off && off != ++count) {
 499          memcpy(l2+l2used, rline, match[0].rm_eo);
 500          l2used += match[0].rm_eo;
 501          rline += match[0].rm_eo;
 502
 503          continue;
 504        }
 505        // The fact getline() can allocate unbounded amounts of memory is
 506        // a bigger issue, but while we're here check for integer overflow
 507        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 508
 509        // newlen = strlen(new) but with \1 and & and printf escapes
 510        for (off = newlen = 0; new[off]; off++) {
 511          int cc = -1;
 512
 513          if (new[off] == '&') cc = 0;
 514          else if (new[off] == '\\') cc = new[++off] - '0';
 515          if (cc < 0 || cc > 9) {
 516            newlen++;
 517            continue;
 518          }
 519          newlen += match[cc].rm_eo-match[cc].rm_so;
 520        }
 521
 522        // Copy changed data to new string
 523
 524        // Adjust allocation size of new string, copy data we know we'll keep
 525        l2l += newlen-mlen;
 526        if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
 527        if (match[0].rm_so) {
 528          memcpy(l2+l2used, rline, match[0].rm_so);
 529          l2used += match[0].rm_so;
 530        }
 531
 532        // copy in new replacement text
 533        for (off = mlen = 0; new[off]; off++) {
 534          int cc = 0, ll;
 535
 536          if (new[off] == '\\') {
 537            cc = new[++off] - '0';
 538            if (cc<0 || cc>9) {
 539              if (!(l2[l2used+mlen++] = unescape(new[off])))
 540                l2[l2used+mlen-1] = new[off];
 541
 542              continue;
 543            } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
 544          } else if (new[off] != '&') {
 545            l2[l2used+mlen++] = new[off];
 546
 547            continue;
 548          }
 549
 550          if (match[cc].rm_so != -1) {
 551            ll = match[cc].rm_eo-match[cc].rm_so;
 552            memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
 553            mlen += ll;
 554          }
 555        }
 556        l2used += newlen;
 557        rline += match[0].rm_eo;
 558
 559        // Stop after first substitution unless we have flag g
 560        if (!(command->sflags & 2)) break;
 561      }
 562
 563      // If we made any changes, finish off l2 and swap it for line
 564      if (l2) {
 565        // grab trailing unmatched data and null terminator, swap with original
 566        mlen = len-(rline-line);
 567        memcpy(l2+l2used, rline, mlen+1);
 568        len = l2used + mlen;
 569        free(line);
 570        line = l2;
 571      }
 572
 573      if (mflags) {
 574        // flag p
 575        if (command->sflags & 4) emit(line, len, eol);
 576
 577        tea = 1;
 578        if (command->w) goto writenow;
 579      }
 580    } else if (c=='w') {
 581      int fd, noeol;
 582      char *name;
 583
 584writenow:
 585      // Swap out emit() context
 586      fd = TT.fdout;
 587      noeol = TT.noeol;
 588
 589      // We save filehandle and newline status before filename
 590      name = command->w + (char *)command;
 591      memcpy(&TT.fdout, name, 4);
 592      name += 4;
 593      TT.noeol = *(name++);
 594
 595      // write, then save/restore context
 596      if (emit(line, len, eol))
 597        perror_exit("w '%s'", command->arg1+(char *)command);
 598      *(--name) = TT.noeol;
 599      TT.noeol = noeol;
 600      TT.fdout = fd;
 601    } else if (c=='x') {
 602      long swap = TT.rememberlen;
 603
 604      str = TT.remember;
 605      TT.remember = line;
 606      line = str;
 607      TT.rememberlen = len;
 608      len = swap;
 609    } else if (c=='y') {
 610      char *from, *to = (char *)command;
 611      int i, j;
 612
 613      from = to+command->arg1;
 614      to += command->arg2;
 615
 616      for (i = 0; i < len; i++) {
 617        j = stridx(from, line[i]);
 618        if (j != -1) line[i] = to[j];
 619      }
 620    } else if (c=='=') {
 621      sprintf(toybuf, "%ld", TT.count);
 622      if (emit(toybuf, strlen(toybuf), 1)) break;
 623    }
 624
 625    command = command->next;
 626  }
 627
 628  if (line && !FLAG(n)) emit(line, len, eol);
 629
 630done:
 631  if (dlist_terminate(append)) while (append) {
 632    struct append *a = append->next;
 633
 634    if (append->file) {
 635      int fd = open(append->str, O_RDONLY);
 636
 637      // Force newline if noeol pending
 638      if (fd != -1) {
 639        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 640        TT.noeol = 0;
 641        xsendfile(fd, TT.fdout);
 642        close(fd);
 643      }
 644    } else if (append->str) emit(append->str, strlen(append->str), 1);
 645    else emit(line, 0, 0);
 646    free(append);
 647    append = a;
 648  }
 649  free(line);
 650}
 651
 652// Callback called on each input file
 653static void do_sed_file(int fd, char *name)
 654{
 655  char *tmp;
 656
 657  if (FLAG(i)) {
 658    struct sedcmd *command;
 659
 660    if (!fd) return error_msg("-i on stdin");
 661    TT.fdout = copy_tempfile(fd, name, &tmp);
 662    TT.count = 0;
 663    for (command = (void *)TT.pattern; command; command = command->next)
 664      command->hit = 0;
 665  }
 666  do_lines(fd, TT.delim, sed_line);
 667  if (FLAG(i)) {
 668    if (TT.i && *TT.i) {
 669      char *s = xmprintf("%s%s", name, TT.i);
 670
 671      xrename(name, s);
 672      free(s);
 673    }
 674    replace_tempfile(-1, TT.fdout, &tmp);
 675    TT.fdout = 1;
 676    TT.nextline = 0;
 677    TT.nextlen = TT.noeol = 0;
 678  }
 679}
 680
 // Copy chunk of string between two delimiters, converting printf escapes.
 // returns processed copy of string (0 if error), *pstr advances to next
 // unused char. if delim (or *delim) is 0 uses/saves starting char as
 // delimiter (skipping one leading backslash before it). Delimiters inside
 // regex [ranges] are ignored.
 //
 // The caller frees the returned string. On error nothing is allocated on
 // return and *pstr is left unchanged.
 static char *unescape_delimited_string(char **pstr, char *delim)
 {
   char *to, *from, mode = 0, d;

   // Grab leading delimiter (if necessary), allocate space for new string
   from = *pstr;
   if (!delim || !*delim) {
     if (!(d = *(from++))) return 0;
     if (d == '\\') d = *(from++);
     if (!d || d == '\\') return 0;
     if (delim) *delim = d;
   } else d = *delim;
   // Every output byte comes from at least one input byte, so the input
   // length is enough. From here on delim is the start of the output buffer.
   to = delim = xmalloc(strlen(*pstr)+1);

   // mode is 0 outside brackets, ']' inside a bracket expression, and one of
   // ". = :" inside a [.x.] [=x=] [:x:] element within a bracket expression.
   while (mode || *from != d) {
     if (!*from) goto fail;

     // delimiter in regex character range doesn't count
     if (*from == '[') {
       if (!mode) {
         mode = ']';
         if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
       } else if (mode == ']' && strchr(".=:", from[1])) {
         *(to++) = *(from++);
         mode = *from;
       }
     } else if (*from == mode) {
       if (mode == ']') mode = 0;
       else {
         *(to++) = *(from++);
         mode = ']';
       }
     // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
     // but the perl build does it, so we need to filter it out.
     } else if (mode && *from == '-' && from[-1] == from[1]) {
       from+=2;
       continue;
     } else if (*from == '\\') {
       if (!from[1]) goto fail;

       // Check escaped end delimiter before printf style escapes.
       if (from[1] == d) from++;
       else if (from[1]=='\\') *(to++) = *(from++);
       else {
         char c = unescape(from[1]);

         if (c) {
           *(to++) = c;
           from+=2;
           continue;
         } else if (!mode) *(to++) = *(from++);
       }
     }

     // The branches above may have stepped onto the terminator of a
     // truncated bracket expression (strchr() also matches the NUL, and a
     // class closer may be the last character). Copying it would walk past
     // the end of the input.
     if (!*from) goto fail;
     *(to++) = *(from++);
   }
   *to = 0;
   *pstr = from+1;

   return delim;

 fail:
   // delim points at the output buffer here, not the caller's delimiter
   free(delim);

   return 0;
 }
 745
 746// Translate pattern strings into command structures. Each command structure
 747// is a single allocation (which requires some math and remalloc at times).
 748static void parse_pattern(char **pline, long len)
 749{
 750  struct sedcmd *command = (void *)TT.pattern;
 751  char *line, *reg, c, *errstart;
 752  int i;
 753
 754  line = errstart = pline ? *pline : "";
 755  if (len && line[len-1]=='\n') line[--len] = 0;
 756
 757  // Append this line to previous multiline command? (hit indicates type.)
 758  // During parsing "hit" stores data about line continuations, but in
 759  // sed_line() it means the match range attached to this command
 760  // is active, so processing the continuation must zero it again.
 761  if (command && command->prev->hit) {
 762    // Remove half-finished entry from list so remalloc() doesn't confuse it
 763    TT.pattern = TT.pattern->prev;
 764    command = dlist_pop(&TT.pattern);
 765    c = command->c;
 766    reg = (char *)command;
 767    reg += command->arg1 + strlen(reg + command->arg1);
 768
 769    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 770    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 771    // a unicode character.
 772    if (command->hit < 256) goto resume_s;
 773    else goto resume_a;
 774  }
 775
 776  // Loop through commands in this line.
 777
 778  command = 0;
 779  for (;;) {
 780    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 781
 782    // If there's no more data on this line, return.
 783    for (;;) {
 784      while (isspace(*line) || *line == ';') line++;
 785      if (*line == '#') while (*line && *line != '\n') line++;
 786      else break;
 787    }
 788    if (!*line) return;
 789
 790    // Start by writing data into toybuf.
 791
 792    errstart = line;
 793    memset(toybuf, 0, sizeof(struct sedcmd));
 794    command = (void *)toybuf;
 795    reg = toybuf + sizeof(struct sedcmd);
 796
 797    // Parse address range (if any)
 798    for (i = 0; i < 2; i++) {
 799      if (*line == ',') line++;
 800      else if (i) break;
 801
 802      if (i && *line == '+' && isdigit(line[1])) {
 803        line++;
 804        command->lmatch[i] = -2-strtol(line, &line, 0);
 805      } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 806      else if (*line == '$') {
 807        command->lmatch[i] = -1;
 808        line++;
 809      } else if (*line == '/' || *line == '\\') {
 810        char *s = line;
 811
 812        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 813        if (!*s) command->rmatch[i] = 0;
 814        else {
 815          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
 816          command->rmatch[i] = reg-toybuf;
 817          reg += sizeof(regex_t);
 818        }
 819        free(s);
 820      } else break;
 821    }
 822
 823    while (isspace(*line)) line++;
 824    if (!*line) break;
 825
 826    if (*line == '!') {
 827      command->not = 1;
 828      line++;
 829    }
 830    while (isspace(*line)) line++;
 831    if (!*line) break;
 832
 833    c = command->c = *(line++);
 834    if (strchr("}:", c) && i) break;
 835    if (strchr("aiqQr=", c) && i>1) break;
 836
 837    // Allocate memory and copy out of toybuf now that we know how big it is
 838    command = xmemdup(toybuf, reg-toybuf);
 839    reg = (reg-toybuf) + (char *)command;
 840
 841    // Parse arguments by command type
 842    if (c == '{') TT.nextlen++;
 843    else if (c == '}') {
 844      if (!TT.nextlen--) break;
 845    } else if (c == 's') {
 846      char *end, delim = 0;
 847
 848      // s/pattern/replacement/flags
 849
 850      // line continuations use arg1 (back at the start of the function),
 851      // so let's fill out arg2 first (since the regex part can't be multiple
 852      // lines) and swap them back later.
 853
 854      // get pattern (just record, we parse it later)
 855      command->arg2 = reg - (char *)command;
 856      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 857        goto error;
 858
 859      reg += sizeof(regex_t);
 860      command->arg1 = reg-(char *)command;
 861      command->hit = delim;
 862resume_s:
 863      // get replacement - don't replace escapes yet because \1 and \& need
 864      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 865      end = line;
 866      while (*end != command->hit) {
 867        if (!*end) goto error;
 868        if (*end++ == '\\') {
 869          if (!*end || *end == '\n') {
 870            end[-1] = '\n';
 871            break;
 872          }
 873          end++;
 874        }
 875      }
 876
 877      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 878      line = end;
 879      // line continuation? (note: '\n' can't be a valid delim).
 880      if (*line == command->hit) command->hit = 0;
 881      else {
 882        if (!*line) continue;
 883        reg--;
 884        line++;
 885        goto resume_s;
 886      }
 887
 888      // swap arg1/arg2 so they're back in order arguments occur.
 889      i = command->arg1;
 890      command->arg1 = command->arg2;
 891      command->arg2 = i;
 892
 893      // get flags
 894      for (line++; *line; line++) {
 895        long l;
 896
 897        if (isspace(*line) && *line != '\n') continue;
 898
 899        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 900        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 901          command->sflags |= l << 3;
 902          line--;
 903        } else break;
 904      }
 905
 906      // We deferred actually parsing the regex until we had the s///i flag
 907      // allocating the space was done by extend_string() above
 908      if (!*TT.remember) command->arg1 = 0;
 909      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 910        (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
 911      free(TT.remember);
 912      TT.remember = 0;
 913      if (*line == 'w') {
 914        line++;
 915        goto writenow;
 916      }
 917    } else if (c == 'w') {
 918      int fd, delim;
 919      char *cc;
 920
 921      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 922      // eol status, and to retain the filename for error messages, we'd need
 923      // to go up to arg5 just for this. Compromise: dynamically allocate the
 924      // filehandle and eol status.
 925
 926writenow:
 927      while (isspace(*line)) line++;
 928      if (!*line) goto error;
 929      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 930      delim = *cc;
 931      *cc = 0;
 932      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 933      *cc = delim;
 934
 935      command->w = reg - (char *)command;
 936      command = xrealloc(command, command->w+(cc-line)+6);
 937      reg = command->w + (char *)command;
 938
 939      memcpy(reg, &fd, 4);
 940      reg += 4;
 941      *(reg++) = 0;
 942      memcpy(reg, line, delim);
 943      reg += delim;
 944      *(reg++) = 0;
 945
 946      line = cc;
 947      if (delim) line += 2;
 948    } else if (c == 'y') {
 949      char *s, delim = 0;
 950      int len;
 951
 952      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 953      command->arg1 = reg-(char *)command;
 954      len = strlen(s);
 955      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 956      free(s);
 957      command->arg2 = reg-(char *)command;
 958      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 959      if (len != strlen(s)) goto error;
 960      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 961      free(s);
 962    } else if (strchr("abcirtTqQw:", c)) {
 963      int end;
 964
 965      // trim leading spaces
 966      while (isspace(*line) && *line != '\n') line++;
 967
 968      // Resume logic differs from 's' case because we don't add a newline
 969      // unless it's after something, so we add it on return instead.
 970resume_a:
 971      command->hit = 0;
 972
 973      // btTqQ: end with space or semicolon, aicrw continue to newline.
 974      if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
 975        // Argument's optional for btTqQ
 976        if (strchr("btTqQ", c)) continue;
 977        else if (!command->arg1) break;
 978      }
 979      // Error checking: qQ can only have digits after them
 980      if (c=='q' || c=='Q') {
 981        for (i = 0; i<end && isdigit(line[i]); i++);
 982        if (i != end) {
 983          line += i;
 984          break;
 985        }
 986      }
 987
 988      // Extend allocation to include new string. We use offsets instead of
 989      // pointers so realloc() moving stuff doesn't break things. Ok to write
 990      // \n over NUL terminator because call to extend_string() adds it back.
 991      if (!command->arg1) command->arg1 = reg - (char*)command;
 992      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 993      else if (!pline) {
 994        command->arg1 = 0;
 995        continue;
 996      }
 997      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 998
 999      // Recopy data to remove escape sequences and handle line continuation.
1000      if (strchr("aci", c)) {
1001        reg -= end+1;
1002        for (i = end; i; i--) {
1003          if ((*reg++ = *line++)=='\\') {
1004
1005            // escape at end of line: resume if -e escaped literal newline,
1006            // else request callback and resume with next line
1007            if (!--i) {
1008              *--reg = 0;
1009              if (*line) {
1010                line++;
1011                goto resume_a;
1012              }
1013              command->hit = 256;
1014              break;
1015            }
1016            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1017            line++;
1018          }
1019        }
1020        *reg = 0;
1021      } else line += end;
1022
1023    // Commands that take no arguments
1024    } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1025  }
1026
1027error:
1028  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1029}
1030
1031void sed_main(void)
1032{
1033  struct arg_list *al;
1034  char **args = toys.optargs;
1035
1036  if (!FLAG(z)) TT.delim = '\n';
1037
1038  // Lie to autoconf when it asks stupid questions, so configure regexes
1039  // that look for "GNU sed version %f" greater than some old buggy number
1040  // don't fail us for not matching their narrow expectations.
1041  if (FLAG(version)) {
1042    xprintf("This is not GNU sed version 9.0\n");
1043    return;
1044  }
1045
1046  // Handling our own --version means we handle our own --help too.
1047  if (FLAG(help)) help_exit(0);
1048
1049  // Parse pattern into commands.
1050
1051  // If no -e or -f, first argument is the pattern.
1052  if (!TT.e && !TT.f) {
1053    if (!*toys.optargs) error_exit("no pattern");
1054    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1055  }
1056
1057  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1058  // so handle all -e, then all -f. (At least the behavior's consistent.)
1059
1060  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1061  parse_pattern(0, 0);
1062  for (al = TT.f; al; al = al->next)
1063    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1064  dlist_terminate(TT.pattern);
1065  if (TT.nextlen) error_exit("no }");  
1066
1067  TT.fdout = 1;
1068  TT.remember = xstrdup("");
1069
1070  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1071  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1072
1073  // Provide EOF flush at end of cumulative input for non-i mode.
1074  if (!FLAG(i)) {
1075    toys.optflags |= FLAG_i;
1076    sed_line(0, 0);
1077  }
1078
1079  // todo: need to close fd when done for TOYBOX_FREE?
1080}
1081