toybox/toys/posix/sed.c
<<
>>
Prefs
   1/* sed.c - stream editor. Thing that does s/// and other stuff.
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
   6 *
   7 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
   8 * but N and s///
   9 * TODO: make y// handle unicode, unicode delimiters
  10 * TODO: handle error return from emit(), error_msg/exit consistently
  11 *       What's the right thing to do for -i when write fails? Skip to next?
  12 * test '//q' with no previous regex, also repeat previous regex?
  13 *
  14 * Deviations from POSIX: allow extended regular expressions with -r,
  15 * editing in place with -i, separate with -s, NUL-separated input with -z,
  16 * printf escapes in text, line continuations, semicolons after all commands,
  17 * 2-address anywhere an address is allowed, "T" command, multiline
  18 * continuations for [abc], \; to end [abc] argument before end of line.
  19
  20USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
  21
  22config SED
  23  bool "sed"
  24  default y
  25  help
  26    usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
  27
  28    Stream editor. Apply editing SCRIPTs to lines of input.
  29
  30    -e  Add SCRIPT to list
  31    -f  Add contents of SCRIPT_FILE to list
  32    -i  Edit each file in place (-iEXT keeps backup file with extension EXT)
  33    -n  No default output (use the p command to output matched lines)
  34    -r  Use extended regular expression syntax
  35    -E  POSIX alias for -r
  36    -s  Treat input files separately (implied by -i)
  37    -z  Use \0 rather than \n as input line separator
  38
  39    A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
  40    All -e SCRIPTs are combined as if separated by newlines, followed by all -f
  41    SCRIPT_FILEs. If no -e or -f then first argument is the SCRIPT.
  42
  43    COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
  44
  45      [ADDRESS[,ADDRESS]][!]COMMAND
  46
  47    ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
  48    last line (-s or -i makes it last line of each file). One address matches one
  49    line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
  50    match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
  51
  52    REGULAR EXPRESSIONS start and end with the same character (anything but
  53    backslash or newline). To use the delimiter in the regex escape it with a
  54    backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
  55    An empty regex repeats the previous one. ADDRESS regexes require any
  56    first delimiter except / to be \escaped to distinguish it from COMMANDs.
  57
  58    Sed reads each line of input, processes it, and writes it out or discards it
  59    before reading the next. Sed can remember one additional line in a separate
  60    buffer (the h, H, g, G, and x commands), and can read the next line of input
  61    early (the n and N commands), but otherwise operates on individual lines.
  62
  63    Each COMMAND starts with a single character. Commands with no arguments are:
  64
  65      !  Run this command when the ADDRESS _didn't_ match.
  66      {  Start new command block, continuing until a corresponding "}".
  67         Command blocks nest and can have ADDRESSes applying to the whole block.
  68      }  End command block (this COMMAND cannot have an address)
  69      d  Delete this line and move on to the next one
  70         (ignores remaining COMMANDs)
  71      D  Delete one line of input and restart command SCRIPT (same as "d"
  72         unless you've glued lines together with "N" or similar)
  73      g  Get remembered line (overwriting current line)
  74      G  Get remembered line (appending to current line)
  75      h  Remember this line (overwriting remembered line)
  76      H  Remember this line (appending to remembered line, if any)
  77      l  Print line escaping \abfrtv (but not \n), octal escape other nonprintng
  78         chars, wrap lines to terminal width with \, append $ to end of line.
  79      n  Print default output and read next line over current line (quit at EOF)
  80      N  Append \n and next line of input to this line. Quit at EOF without
  81         default output. Advances line counter for ADDRESS and "=".
  82      p  Print this line
  83      P  Print this line up to first newline (from "N")
  84      q  Quit (print default output, no more commands processed or lines read)
  85      x  Exchange this line with remembered line (overwrite in both directions)
  86      =  Print the current line number (plus newline)
  87      #  Comment, ignores rest of this line of SCRIPT (until newline)
  88
  89    Commands that take an argument: 
  90
  91      : LABEL    Target for jump commands
  92      a TEXT     Append text to output before reading next line
  93      b LABEL    Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
  94      c TEXT     Delete matching ADDRESS range and output TEXT instead
  95      i TEXT     Insert text (output immediately)
  96      r FILE     Append contents of FILE to output before reading next line.
  97      s/S/R/F    Search for regex S replace match with R using flags F. Delimiter
  98                 is anything but \n or \, escape with \ to use in S or R. Printf
  99                 escapes work. Unescaped & in R becomes full matched text, \1
 100                 through \9 = parenthetical subexpression from S. \ at end of
 101                 line appends next line of SCRIPT. The flags in F are:
 102                 [0-9]    A number N, substitute only Nth match
 103                 g        Global, substitute all matches
 104                 i/I      Ignore case when matching
 105                 p        Print resulting line when match found and replaced
 106                 w [file] Write (append) line to file when match replaced
 107      t LABEL    Test, jump if s/// command matched this line since last test 
 108      T LABEL    Test false, jump to :LABEL only if no s/// found a match
 109      w FILE     Write (append) line to file
 110      y/old/new/ Change each character in 'old' to corresponding character
 111                 in 'new' (with standard backslash escapes, delimiter can be
 112                 any repeated character except \ or \n)
 113
 114    The TEXT arguments (to a c i) may end with an unescaped "\" to append
 115    the next line (leading whitespace is not skipped), and treat ";" as a
 116    literal character (use "\;" instead).
 117*/
 118
 119#define FOR_sed
 120#include "toys.h"
 121
GLOBALS(
  // Option arguments. i is the -i backup suffix (appended to the file name
  // when non-empty); f and e are presumably the -f and -e argument lists
  // filled in by option parsing - TODO confirm against the option string.
  char *i;
  struct arg_list *f, *e;

  // processed pattern list (entries are struct sedcmd, walked via ->next)
  struct double_list *pattern;

  // nextline: line read from input but not yet processed (one line lookahead)
  // remember: buffer used by the h, H, g, G and x commands
  char *nextline, *remember;
  // restart: set by n/N to (next command)+1 so the following line resumes
  //   mid-script; 0 means start from the top of the pattern list
  // lastregex: most recently used regex, reused when a regex is empty
  void *restart, *lastregex;
  // nextlen/rememberlen: byte lengths of nextline/remember
  //   (nextlen also counts open '{' while the script is being parsed)
  // count: number of the input line currently being processed
  long nextlen, rememberlen, count;
  // fdout: file descriptor emit() writes to
  // noeol: nonzero when the last emit() left off its trailing newline
  int fdout, noeol;
  // cached output width for the l command (0 until first use)
  unsigned xx;
  // line delimiter handed to do_lines()
  char delim;
)
 136
 137// Linked list of parsed sed commands. Offset fields indicate location where
 138// regex or string starts, ala offset+(char *)struct, because we remalloc()
 139// these to expand them for multiline inputs, and pointers would have to be
 140// individually adjusted.
 141
 142struct sedcmd {
 143  struct sedcmd *next, *prev;
 144
 145  // Begin and end of each match
 146  long lmatch[2]; // line number of match
 147  int rmatch[2];  // offset of regex struct for prefix matches (/abc/,/def/p)
 148  int arg1, arg2, w; // offset of two arguments per command, plus s//w filename
 149  unsigned not, hit;
 150  unsigned sflags; // s///flag bits: i=1, g=2, p=4
 151  char c; // action
 152};
 153
 154// Write out line with potential embedded NUL, handling eol/noeol
 155static int emit(char *line, long len, int eol)
 156{
 157  int l, old = line[len];
 158
 159  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
 160  TT.noeol = !eol;
 161  if (eol) line[len++] = '\n';
 162  if (!len) return 0;
 163  l = writeall(TT.fdout, line, len);
 164  if (eol) line[len-1] = old;
 165  if (l != len) {
 166    if (TT.fdout != 1) perror_msg("short write");
 167
 168    return 1;
 169  }
 170
 171  return 0;
 172}
 173
 174// Extend allocation to include new string, with newline between if newlen<0
 175
 176static char *extend_string(char **old, char *new, int oldlen, int newlen)
 177{
 178  int newline = newlen < 0;
 179  char *s;
 180
 181  if (newline) newlen = -newlen;
 182  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
 183  if (newline) s[oldlen++] = '\n';
 184  memcpy(s+oldlen, new, newlen);
 185  s[oldlen+newlen] = 0;
 186
 187  return s+oldlen+newlen+1;
 188}
 189
 190// An empty regex repeats the previous one
 191static void *get_regex(void *command, int offset)
 192{
 193  if (!offset) {
 194    if (!TT.lastregex) error_exit("no previous regex");
 195    return TT.lastregex;
 196  }
 197
 198  return TT.lastregex = offset+(char *)command;
 199}
 200
 201// Apply pattern to line from input file
 202static void sed_line(char **pline, long plen)
 203{
 204  struct append {
 205    struct append *next, *prev;
 206    int file;
 207    char *str;
 208  } *append = 0;
 209  char *line = TT.nextline;
 210  long len = TT.nextlen;
 211  struct sedcmd *command;
 212  int eol = 0, tea = 0;
 213
 214  // Ignore EOF for all files before last unless -i
 215  if (!pline && !FLAG(i) && !FLAG(s)) return;
 216
 217  // Grab next line for deferred processing (EOF detection: we get a NULL
 218  // pline at EOF to flush last line). Note that only end of _last_ input
 219  // file matches $ (unless we're doing -i).
 220  TT.nextline = 0;
 221  TT.nextlen = 0;
 222  if (pline) {
 223    TT.nextline = *pline;
 224    TT.nextlen = plen;
 225    *pline = 0;
 226  }
 227
 228  if (!line || !len) return;
 229  if (line[len-1] == '\n') line[--len] = eol++;
 230  TT.count++;
 231
 232  // The restart-1 is because we added one to make sure it wasn't NULL,
 233  // otherwise N as last command would restart script
 234  command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
 235  TT.restart = 0;
 236
 237  while (command) {
 238    char *str, c = command->c;
 239
 240    // Have we got a line or regex matching range for this rule?
 241    if (*command->lmatch || *command->rmatch) {
 242      int miss = 0;
 243      long lm;
 244
 245      // In a match that might end?
 246      if (command->hit) {
 247        if (!(lm = command->lmatch[1])) {
 248          if (!command->rmatch[1]) command->hit = 0;
 249          else {
 250            void *rm = get_regex(command, command->rmatch[1]);
 251
 252            // regex match end includes matching line, so defer deactivation
 253            if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
 254          }
 255        } else if (lm > 0 && lm < TT.count) command->hit = 0;
 256        else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
 257
 258      // Start a new match?
 259      } else {
 260        if (!(lm = *command->lmatch)) {
 261          void *rm = get_regex(command, *command->rmatch);
 262
 263          if (line && !regexec0(rm, line, len, 0, 0, 0))
 264            command->hit = TT.count;
 265        } else if (lm == TT.count || (lm == -1 && !pline))
 266          command->hit = TT.count;
 267
 268        if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
 269      } 
 270
 271      // Didn't match?
 272      lm = !(command->not^!!command->hit);
 273
 274      // Deferred disable from regex end match
 275      if (miss || command->lmatch[1] == TT.count) command->hit = 0;
 276
 277      if (lm) {
 278        // Handle skipping curly bracket command group
 279        if (c == '{') {
 280          int curly = 1;
 281
 282          while (curly) {
 283            command = command->next;
 284            if (command->c == '{') curly++;
 285            if (command->c == '}') curly--;
 286          }
 287        }
 288        command = command->next;
 289        continue;
 290      }
 291    }
 292
 293    // A deleted line can still update line match state for later commands
 294    if (!line) {
 295      command = command->next;
 296      continue;
 297    }
 298
 299    // Process command
 300
 301    if (c=='a' || c=='r') {
 302      struct append *a = xzalloc(sizeof(struct append));
 303      if (command->arg1) a->str = command->arg1+(char *)command;
 304      a->file = c=='r';
 305      dlist_add_nomalloc((void *)&append, (void *)a);
 306    } else if (c=='b' || c=='t' || c=='T') {
 307      int t = tea;
 308
 309      if (c != 'b') tea = 0;
 310      if (c=='b' || t^(c=='T')) {
 311        if (!command->arg1) break;
 312        str = command->arg1+(char *)command;
 313        for (command = (void *)TT.pattern; command; command = command->next)
 314          if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
 315            break;
 316        if (!command) error_exit("no :%s", str);
 317      }
 318    } else if (c=='c') {
 319      str = command->arg1+(char *)command;
 320      if (!command->hit) emit(str, strlen(str), 1);
 321      free(line);
 322      line = 0;
 323      continue;
 324    } else if (c=='d') {
 325      free(line);
 326      line = 0;
 327      continue;
 328    } else if (c=='D') {
 329      // Delete up to \n or end of buffer
 330      str = line;
 331      while ((str-line)<len) if (*(str++) == '\n') break;
 332      len -= str - line;
 333      memmove(line, str, len);
 334
 335      // if "delete" blanks line, disable further processing
 336      // otherwise trim and restart script
 337      if (!len) {
 338        free(line);
 339        line = 0;
 340      } else {
 341        line[len] = 0;
 342        command = (void *)TT.pattern;
 343      }
 344      continue;
 345    } else if (c=='g') {
 346      free(line);
 347      line = xstrdup(TT.remember);
 348      len = TT.rememberlen;
 349    } else if (c=='G') {
 350      line = xrealloc(line, len+TT.rememberlen+2);
 351      line[len++] = '\n';
 352      memcpy(line+len, TT.remember, TT.rememberlen);
 353      line[len += TT.rememberlen] = 0;
 354    } else if (c=='h') {
 355      free(TT.remember);
 356      TT.remember = xstrdup(line);
 357      TT.rememberlen = len;
 358    } else if (c=='H') {
 359      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
 360      TT.remember[TT.rememberlen++] = '\n';
 361      memcpy(TT.remember+TT.rememberlen, line, len);
 362      TT.remember[TT.rememberlen += len] = 0;
 363    } else if (c=='i') {
 364      str = command->arg1+(char *)command;
 365      emit(str, strlen(str), 1);
 366    } else if (c=='l') {
 367      int i, x, off;
 368
 369      if (!TT.xx) {
 370        terminal_size(&TT.xx, 0);
 371        if (!TT.xx) TT.xx = 80;
 372        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
 373        if (TT.xx > 4) TT.xx -= 4;
 374      }
 375
 376      for (i = off = 0; i<len; i++) {
 377        if (off >= TT.xx) {
 378          toybuf[off++] = '\\';
 379          emit(toybuf, off, 1);
 380          off = 0;
 381        }
 382        x = stridx("\\\a\b\f\r\t\v", line[i]);
 383        if (x != -1) {
 384          toybuf[off++] = '\\';
 385          toybuf[off++] = "\\abfrtv"[x];
 386        } else if (line[i] >= ' ') toybuf[off++] = line[i];
 387        else off += sprintf(toybuf+off, "\\%03o", line[i]);
 388      }
 389      toybuf[off++] = '$';
 390      emit(toybuf, off, 1);
 391    } else if (c=='n') {
 392      TT.restart = command->next+1;
 393
 394      break;
 395    } else if (c=='N') {
 396      // Can't just grab next line because we could have multiple N and
 397      // we need to actually read ahead to get N;$p EOF detection right.
 398      if (pline) {
 399        TT.restart = command->next+1;
 400        extend_string(&line, TT.nextline, len, -TT.nextlen);
 401        free(TT.nextline);
 402        TT.nextline = line;
 403        TT.nextlen += len + 1;
 404        line = 0;
 405      }
 406
 407      // Pending append goes out right after N
 408      goto done; 
 409    } else if (c=='p' || c=='P') {
 410      char *l = (c=='P') ? strchr(line, '\n') : 0;
 411
 412      if (emit(line, l ? l-line : len, eol)) break;
 413    } else if (c=='q' || c=='Q') {
 414      if (pline) *pline = (void *)1;
 415      free(TT.nextline);
 416      if (!toys.exitval && command->arg1)
 417        toys.exitval = atoi(command->arg1+(char *)command);
 418      TT.nextline = 0;
 419      TT.nextlen = 0;
 420      if (c=='Q') line = 0;
 421
 422      break;
 423    } else if (c=='s') {
 424      char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
 425      regmatch_t *match = (void *)toybuf;
 426      regex_t *reg = get_regex(command, command->arg1);
 427      int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
 428        mlen, off, newlen;
 429
 430      // Loop finding match in remaining line (up to remaining len)
 431      while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
 432        mflags = REG_NOTBOL;
 433
 434        // Zero length matches don't count immediately after a previous match
 435        mlen = match[0].rm_eo-match[0].rm_so;
 436        if (!mlen && !zmatch) {
 437          if (rline-line == len) break;
 438          l2[l2used++] = *rline++;
 439          zmatch++;
 440          continue;
 441        } else zmatch = 0;
 442
 443        // If we're replacing only a specific match, skip if this isn't it
 444        off = command->sflags>>3;
 445        if (off && off != ++count) {
 446          memcpy(l2+l2used, rline, match[0].rm_eo);
 447          l2used += match[0].rm_eo;
 448          rline += match[0].rm_eo;
 449
 450          continue;
 451        }
 452        // The fact getline() can allocate unbounded amounts of memory is
 453        // a bigger issue, but while we're here check for integer overflow
 454        if (match[0].rm_eo > INT_MAX) perror_exit(0);
 455
 456        // newlen = strlen(new) but with \1 and & and printf escapes
 457        for (off = newlen = 0; new[off]; off++) {
 458          int cc = -1;
 459
 460          if (new[off] == '&') cc = 0;
 461          else if (new[off] == '\\') cc = new[++off] - '0';
 462          if (cc < 0 || cc > 9) {
 463            newlen++;
 464            continue;
 465          }
 466          newlen += match[cc].rm_eo-match[cc].rm_so;
 467        }
 468
 469        // Copy changed data to new string
 470
 471        // Adjust allocation size of new string, copy data we know we'll keep
 472        l2l += newlen-mlen;
 473        if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
 474        if (match[0].rm_so) {
 475          memcpy(l2+l2used, rline, match[0].rm_so);
 476          l2used += match[0].rm_so;
 477        }
 478
 479        // copy in new replacement text
 480        for (off = mlen = 0; new[off]; off++) {
 481          int cc = 0, ll;
 482
 483          if (new[off] == '\\') {
 484            cc = new[++off] - '0';
 485            if (cc<0 || cc>9) {
 486              if (!(l2[l2used+mlen++] = unescape(new[off])))
 487                l2[l2used+mlen-1] = new[off];
 488
 489              continue;
 490            } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
 491          } else if (new[off] != '&') {
 492            l2[l2used+mlen++] = new[off];
 493
 494            continue;
 495          }
 496
 497          if (match[cc].rm_so != -1) {
 498            ll = match[cc].rm_eo-match[cc].rm_so;
 499            memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
 500            mlen += ll;
 501          }
 502        }
 503        l2used += newlen;
 504        rline += match[0].rm_eo;
 505
 506        // Stop after first substitution unless we have flag g
 507        if (!(command->sflags & 2)) break;
 508      }
 509
 510      // If we made any changes, finish off l2 and swap it for line
 511      if (l2) {
 512        // grab trailing unmatched data and null terminator, swap with original
 513        mlen = len-(rline-line);
 514        memcpy(l2+l2used, rline, mlen+1);
 515        len = l2used + mlen;
 516        free(line);
 517        line = l2;
 518      }
 519
 520      if (mflags) {
 521        // flag p
 522        if (command->sflags & 4) emit(line, len, eol);
 523
 524        tea = 1;
 525        if (command->w) goto writenow;
 526      }
 527    } else if (c=='w') {
 528      int fd, noeol;
 529      char *name;
 530
 531writenow:
 532      // Swap out emit() context
 533      fd = TT.fdout;
 534      noeol = TT.noeol;
 535
 536      // We save filehandle and newline status before filename
 537      name = command->w + (char *)command;
 538      memcpy(&TT.fdout, name, 4);
 539      name += 4;
 540      TT.noeol = *(name++);
 541
 542      // write, then save/restore context
 543      if (emit(line, len, eol))
 544        perror_exit("w '%s'", command->arg1+(char *)command);
 545      *(--name) = TT.noeol;
 546      TT.noeol = noeol;
 547      TT.fdout = fd;
 548    } else if (c=='x') {
 549      long swap = TT.rememberlen;
 550
 551      str = TT.remember;
 552      TT.remember = line;
 553      line = str;
 554      TT.rememberlen = len;
 555      len = swap;
 556    } else if (c=='y') {
 557      char *from, *to = (char *)command;
 558      int i, j;
 559
 560      from = to+command->arg1;
 561      to += command->arg2;
 562
 563      for (i = 0; i < len; i++) {
 564        j = stridx(from, line[i]);
 565        if (j != -1) line[i] = to[j];
 566      }
 567    } else if (c=='=') {
 568      sprintf(toybuf, "%ld", TT.count);
 569      if (emit(toybuf, strlen(toybuf), 1)) break;
 570    }
 571
 572    command = command->next;
 573  }
 574
 575  if (line && !FLAG(n)) emit(line, len, eol);
 576
 577done:
 578  if (dlist_terminate(append)) while (append) {
 579    struct append *a = append->next;
 580
 581    if (append->file) {
 582      int fd = open(append->str, O_RDONLY);
 583
 584      // Force newline if noeol pending
 585      if (fd != -1) {
 586        if (TT.noeol) xwrite(TT.fdout, "\n", 1);
 587        TT.noeol = 0;
 588        xsendfile(fd, TT.fdout);
 589        close(fd);
 590      }
 591    } else if (append->str) emit(append->str, strlen(append->str), 1);
 592    else emit(line, 0, 0);
 593    free(append);
 594    append = a;
 595  }
 596  free(line);
 597}
 598
 599// Callback called on each input file
 600static void do_sed_file(int fd, char *name)
 601{
 602  char *tmp, *s;
 603
 604  if (FLAG(i)) {
 605    if (!fd) return error_msg("-i on stdin");
 606    TT.fdout = copy_tempfile(fd, name, &tmp);
 607  }
 608  if (FLAG(i) || FLAG(s)) {
 609    struct sedcmd *command;
 610
 611    TT.count = 0;
 612    for (command = (void *)TT.pattern; command; command = command->next)
 613      command->hit = 0;
 614  }
 615  do_lines(fd, TT.delim, sed_line);
 616  if (FLAG(i)) {
 617    if (TT.i && *TT.i) {
 618      xrename(name, s = xmprintf("%s%s", name, TT.i));
 619      free(s);
 620    }
 621    replace_tempfile(-1, TT.fdout, &tmp);
 622    TT.fdout = 1;
 623  }
 624  if (FLAG(i) || FLAG(s)) {
 625    TT.nextline = 0;
 626    TT.nextlen = TT.noeol = 0;
 627  }
 628}
 629
 630// Copy chunk of string between two delimiters, converting printf escapes.
 631// returns processed copy of string (0 if error), *pstr advances to next
 632// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
 633// if regxex, ignore delimiter in [ranges]
 634static char *unescape_delimited_string(char **pstr, char *delim)
 635{
 636  char *to, *from, mode = 0, d;
 637
 638  // Grab leading delimiter (if necessary), allocate space for new string
 639  from = *pstr;
 640  if (!delim || !*delim) {
 641    if (!(d = *(from++))) return 0;
 642    if (d == '\\') d = *(from++);
 643    if (!d || d == '\\') return 0;
 644    if (delim) *delim = d;
 645  } else d = *delim;
 646  to = delim = xmalloc(strlen(*pstr)+1);
 647
 648  while (mode || *from != d) {
 649    if (!*from) return 0;
 650
 651    // delimiter in regex character range doesn't count
 652    if (*from == '[') {
 653      if (!mode) {
 654        mode = ']';
 655        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
 656      } else if (mode == ']' && strchr(".=:", from[1])) {
 657        *(to++) = *(from++);
 658        mode = *from;
 659      }
 660    } else if (*from == mode) {
 661      if (mode == ']') mode = 0;
 662      else {
 663        *(to++) = *(from++);
 664        mode = ']';
 665      }
 666    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
 667    // but the perl build does it, so we need to filter it out.
 668    } else if (mode && *from == '-' && from[-1] == from[1]) {
 669      from+=2;
 670      continue;
 671    } else if (*from == '\\') {
 672      if (!from[1]) return 0;
 673
 674      // Check escaped end delimiter before printf style escapes.
 675      if (from[1] == d) from++;
 676      else if (from[1]=='\\') *(to++) = *(from++);
 677      else {
 678        char c = unescape(from[1]);
 679
 680        if (c) {
 681          *(to++) = c;
 682          from+=2;
 683          continue;
 684        } else if (!mode) *(to++) = *(from++);
 685      }
 686    }
 687    *(to++) = *(from++);
 688  }
 689  *to = 0;
 690  *pstr = from+1;
 691
 692  return delim;
 693}
 694
 695// Translate pattern strings into command structures. Each command structure
 696// is a single allocation (which requires some math and remalloc at times).
 697static void parse_pattern(char **pline, long len)
 698{
 699  struct sedcmd *command = (void *)TT.pattern;
 700  char *line, *reg, c, *errstart;
 701  int i;
 702
 703  line = errstart = pline ? *pline : "";
 704  if (len && line[len-1]=='\n') line[--len] = 0;
 705
 706  // Append this line to previous multiline command? (hit indicates type.)
 707  // During parsing "hit" stores data about line continuations, but in
 708  // sed_line() it means the match range attached to this command
 709  // is active, so processing the continuation must zero it again.
 710  if (command && command->prev->hit) {
 711    // Remove half-finished entry from list so remalloc() doesn't confuse it
 712    TT.pattern = TT.pattern->prev;
 713    command = dlist_pop(&TT.pattern);
 714    c = command->c;
 715    reg = (char *)command;
 716    reg += command->arg1 + strlen(reg + command->arg1);
 717
 718    // Resume parsing for 'a' or 's' command. (Only two that can do this.)
 719    // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
 720    // a unicode character.
 721    if (command->hit < 256) goto resume_s;
 722    else goto resume_a;
 723  }
 724
 725  // Loop through commands in this line.
 726
 727  command = 0;
 728  for (;;) {
 729    if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
 730
 731    // If there's no more data on this line, return.
 732    for (;;) {
 733      while (isspace(*line) || *line == ';') line++;
 734      if (*line == '#') while (*line && *line != '\n') line++;
 735      else break;
 736    }
 737    if (!*line) return;
 738
 739    // Start by writing data into toybuf.
 740
 741    errstart = line;
 742    memset(toybuf, 0, sizeof(struct sedcmd));
 743    command = (void *)toybuf;
 744    reg = toybuf + sizeof(struct sedcmd);
 745
 746    // Parse address range (if any)
 747    for (i = 0; i < 2; i++) {
 748      if (*line == ',') line++;
 749      else if (i) break;
 750
 751      if (i && *line == '+' && isdigit(line[1])) {
 752        line++;
 753        command->lmatch[i] = -2-strtol(line, &line, 0);
 754      } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
 755      else if (*line == '$') {
 756        command->lmatch[i] = -1;
 757        line++;
 758      } else if (*line == '/' || *line == '\\') {
 759        char *s = line;
 760
 761        if (!(s = unescape_delimited_string(&line, 0))) goto error;
 762        if (!*s) command->rmatch[i] = 0;
 763        else {
 764          xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
 765          command->rmatch[i] = reg-toybuf;
 766          reg += sizeof(regex_t);
 767        }
 768        free(s);
 769      } else break;
 770    }
 771
 772    while (isspace(*line)) line++;
 773    if (!*line) break;
 774
 775    if (*line == '!') {
 776      command->not = 1;
 777      line++;
 778    }
 779    while (isspace(*line)) line++;
 780    if (!*line) break;
 781
 782    c = command->c = *(line++);
 783    if (strchr("}:", c) && i) break;
 784    if (strchr("aiqQr=", c) && i>1) break;
 785
 786    // Allocate memory and copy out of toybuf now that we know how big it is
 787    command = xmemdup(toybuf, reg-toybuf);
 788    reg = (reg-toybuf) + (char *)command;
 789
 790    // Parse arguments by command type
 791    if (c == '{') TT.nextlen++;
 792    else if (c == '}') {
 793      if (!TT.nextlen--) break;
 794    } else if (c == 's') {
 795      char *end, delim = 0;
 796
 797      // s/pattern/replacement/flags
 798
 799      // line continuations use arg1 (back at the start of the function),
 800      // so let's fill out arg2 first (since the regex part can't be multiple
 801      // lines) and swap them back later.
 802
 803      // get pattern (just record, we parse it later)
 804      command->arg2 = reg - (char *)command;
 805      if (!(TT.remember = unescape_delimited_string(&line, &delim)))
 806        goto error;
 807
 808      reg += sizeof(regex_t);
 809      command->arg1 = reg-(char *)command;
 810      command->hit = delim;
 811resume_s:
 812      // get replacement - don't replace escapes yet because \1 and \& need
 813      // processing later, after we replace \\ with \ we can't tell \\1 from \1
 814      end = line;
 815      while (*end != command->hit) {
 816        if (!*end) goto error;
 817        if (*end++ == '\\') {
 818          if (!*end || *end == '\n') {
 819            end[-1] = '\n';
 820            break;
 821          }
 822          end++;
 823        }
 824      }
 825
 826      reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
 827      line = end;
 828      // line continuation? (note: '\n' can't be a valid delim).
 829      if (*line == command->hit) command->hit = 0;
 830      else {
 831        if (!*line) continue;
 832        reg--;
 833        line++;
 834        goto resume_s;
 835      }
 836
 837      // swap arg1/arg2 so they're back in order arguments occur.
 838      i = command->arg1;
 839      command->arg1 = command->arg2;
 840      command->arg2 = i;
 841
 842      // get flags
 843      for (line++; *line; line++) {
 844        long l;
 845
 846        if (isspace(*line) && *line != '\n') continue;
 847
 848        if (0 <= (l = stridx("igp", *line))) command->sflags |= 1<<l;
 849        else if (*line == 'I') command->sflags |= 1<<0;
 850        else if (!(command->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
 851          command->sflags |= l << 3;
 852          line--;
 853        } else break;
 854      }
 855
 856      // We deferred actually parsing the regex until we had the s///i flag
 857      // allocating the space was done by extend_string() above
 858      if (!*TT.remember) command->arg1 = 0;
 859      else xregcomp((void *)(command->arg1 + (char *)command), TT.remember,
 860        (REG_EXTENDED*!!FLAG(r))|((command->sflags&1)*REG_ICASE));
 861      free(TT.remember);
 862      TT.remember = 0;
 863      if (*line == 'w') {
 864        line++;
 865        goto writenow;
 866      }
 867    } else if (c == 'w') {
 868      int fd, delim;
 869      char *cc;
 870
 871      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
 872      // eol status, and to retain the filename for error messages, we'd need
 873      // to go up to arg5 just for this. Compromise: dynamically allocate the
 874      // filehandle and eol status.
 875
 876writenow:
 877      while (isspace(*line)) line++;
 878      if (!*line) goto error;
 879      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
 880      delim = *cc;
 881      *cc = 0;
 882      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
 883      *cc = delim;
 884
 885      command->w = reg - (char *)command;
 886      command = xrealloc(command, command->w+(cc-line)+6);
 887      reg = command->w + (char *)command;
 888
 889      memcpy(reg, &fd, 4);
 890      reg += 4;
 891      *(reg++) = 0;
 892      memcpy(reg, line, delim);
 893      reg += delim;
 894      *(reg++) = 0;
 895
 896      line = cc;
 897      if (delim) line += 2;
 898    } else if (c == 'y') {
 899      char *s, delim = 0;
 900      int len;
 901
 902      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 903      command->arg1 = reg-(char *)command;
 904      len = strlen(s);
 905      reg = extend_string((void *)&command, s, reg-(char *)command, len);
 906      free(s);
 907      command->arg2 = reg-(char *)command;
 908      if (!(s = unescape_delimited_string(&line, &delim))) goto error;
 909      if (len != strlen(s)) goto error;
 910      reg = extend_string((void *)&command, s, reg-(char*)command, len);
 911      free(s);
 912    } else if (strchr("abcirtTqQw:", c)) {
 913      int end;
 914
 915      // trim leading spaces
 916      while (isspace(*line) && *line != '\n') line++;
 917
 918      // Resume logic differs from 's' case because we don't add a newline
 919      // unless it's after something, so we add it on return instead.
 920resume_a:
 921      command->hit = 0;
 922
 923      // btTqQ: end with space or semicolon, aicrw continue to newline.
 924      if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
 925        // Argument's optional for btTqQ
 926        if (strchr("btTqQ", c)) continue;
 927        else if (!command->arg1) break;
 928      }
 929      // Error checking: qQ can only have digits after them
 930      if (c=='q' || c=='Q') {
 931        for (i = 0; i<end && isdigit(line[i]); i++);
 932        if (i != end) {
 933          line += i;
 934          break;
 935        }
 936      }
 937
 938      // Extend allocation to include new string. We use offsets instead of
 939      // pointers so realloc() moving stuff doesn't break things. Ok to write
 940      // \n over NUL terminator because call to extend_string() adds it back.
 941      if (!command->arg1) command->arg1 = reg - (char*)command;
 942      else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
 943      else if (!pline) {
 944        command->arg1 = 0;
 945        continue;
 946      }
 947      reg = extend_string((void *)&command, line, reg - (char *)command, end);
 948
 949      // Recopy data to remove escape sequences and handle line continuation.
 950      if (strchr("aci", c)) {
 951        reg -= end+1;
 952        for (i = end; i; i--) {
 953          if ((*reg++ = *line++)=='\\') {
 954
 955            // escape at end of line: resume if -e escaped literal newline,
 956            // else request callback and resume with next line
 957            if (!--i) {
 958              *--reg = 0;
 959              if (*line) {
 960                line++;
 961                goto resume_a;
 962              }
 963              command->hit = 256;
 964              break;
 965            }
 966            if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
 967            line++;
 968          }
 969        }
 970        *reg = 0;
 971      } else line += end;
 972
 973    // Commands that take no arguments
 974    } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
 975  }
 976
 977error:
 978  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
 979}
 980
 981void sed_main(void)
 982{
 983  struct arg_list *al;
 984  char **args = toys.optargs;
 985
 986  if (!FLAG(z)) TT.delim = '\n';
 987
 988  // Lie to autoconf when it asks stupid questions, so configure regexes
 989  // that look for "GNU sed version %f" greater than some old buggy number
 990  // don't fail us for not matching their narrow expectations.
 991  if (FLAG(version)) {
 992    xprintf("This is not GNU sed version 9.0\n");
 993    return;
 994  }
 995
 996  // Handling our own --version means we handle our own --help too.
 997  if (FLAG(help)) help_exit(0);
 998
 999  // Parse pattern into commands.
1000
1001  // If no -e or -f, first argument is the pattern.
1002  if (!TT.e && !TT.f) {
1003    if (!*toys.optargs) error_exit("no pattern");
1004    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1005  }
1006
1007  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
1008  // so handle all -e, then all -f. (At least the behavior's consistent.)
1009
1010  for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
1011  parse_pattern(0, 0);
1012  for (al = TT.f; al; al = al->next)
1013    do_lines(xopenro(al->arg), TT.delim, parse_pattern);
1014  dlist_terminate(TT.pattern);
1015  if (TT.nextlen) error_exit("no }");  
1016
1017  TT.fdout = 1;
1018  TT.remember = xstrdup("");
1019
1020  // Inflict pattern upon input files. Long version because !O_CLOEXEC
1021  loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1022
1023  // Provide EOF flush at end of cumulative input for non-i mode.
1024  if (!FLAG(i) && !FLAG(s)) {
1025    toys.optflags |= FLAG_s;
1026    sed_line(0, 0);
1027  }
1028
1029  // todo: need to close fd when done for TOYBOX_FREE?
1030}
1031