busybox/editors/sed.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * sed.c - very minimalist version of sed
   4 *
   5 * Copyright (C) 1999,2000,2001 by Lineo, inc. and Mark Whitley
   6 * Copyright (C) 1999,2000,2001 by Mark Whitley <markw@codepoet.org>
   7 * Copyright (C) 2002  Matt Kraai
   8 * Copyright (C) 2003 by Glenn McGrath
   9 * Copyright (C) 2003,2004 by Rob Landley <rob@landley.net>
  10 *
  11 * MAINTAINER: Rob Landley <rob@landley.net>
  12 *
  13 * Licensed under GPL version 2, see file LICENSE in this tarball for details.
  14 */
  15
  16/* Code overview.
  17
  18  Files are laid out to avoid unnecessary function declarations.  So for
  19  example, every function add_cmd calls occurs before add_cmd in this file.
  20
  21  add_cmd() is called on each line of sed command text (from a file or from
  22  the command line).  It calls get_address() and parse_cmd_args().  The
  23  resulting sed_cmd_t structures are appended to a linked list
  24  (G.sed_cmd_head/G.sed_cmd_tail).
  25
  26  add_input_file() adds a FILE* to the list of input files.  We need to
  27  know all input sources ahead of time to find the last line for the $ match.
  28
  29  process_files() does actual sedding, reading data lines from each input FILE *
  30  (which could be stdin) and applying the sed command list (sed_cmd_head) to
  31  each of the resulting lines.
  32
  33  sed_main() is where external code calls into this, with a command line.
  34*/
  35
  36
  37/*
  38        Supported features and commands in this version of sed:
  39
  40         - comments ('#')
  41         - address matching: num|/matchstr/[,num|/matchstr/|$]command
   42         - commands: (p)rint, (d)elete, (s)ubstitute (with g & I flags)
  43         - edit commands: (a)ppend, (i)nsert, (c)hange
   44         - file commands: (r)ead, (w)rite
  45         - backreferences in substitution expressions (\0, \1, \2...\9)
  46         - grouped commands: {cmd1;cmd2}
  47         - transliteration (y/source-chars/dest-chars/)
  48         - pattern space hold space storing / swapping (g, h, x)
  49         - labels / branching (: label, b, t, T)
  50
  51         (Note: Specifying an address (range) to match is *optional*; commands
  52         default to the whole pattern space if no specific address match was
  53         requested.)
  54
  55        Todo:
  56         - Create a wrapper around regex to make libc's regex conform with sed
  57
  58        Reference http://www.opengroup.org/onlinepubs/007904975/utilities/sed.html
  59*/
  60
  61#include "libbb.h"
  62#include "xregex.h"
  63
  64/* Each sed command turns into one of these structures. */
   65typedef struct sed_cmd_s {
   66        /* Ordered by alignment requirements: currently 36 bytes on x86 */
   67        struct sed_cmd_s *next; /* Next command (linked list, NULL terminated) */
   68
   69        /* address storage */
   70        regex_t *beg_match;     /* sed -e '/match/cmd' */
   71        regex_t *end_match;     /* sed -e '/match/,/end_match/cmd' */
   72        regex_t *sub_match;     /* For 's/sub_match/string/'; NULL when the match text was
                                      * empty, meaning "reuse the last regex used at runtime" */
   73        int beg_line;           /* 'sed 1p'   0 == apply commands to all lines */
   74        int end_line;           /* 'sed 1,3p' 0 == one line only. -1 = last line ($) */
   75
   76        FILE *sw_file;          /* File the (w) command writes to; NULL for none (nodes are
                                      * zero-allocated and cleanup only closes a non-NULL file). */
   77        char *string;           /* Data string: replacement for (s), text for (a i c), filename
                                      * for (r w), label for (: b t T), char pairs for (y). */
   78
   79        unsigned which_match;   /* (s) Which match to replace (0 for all) */
   80
   81        /* Bitfields (gcc won't group them if we don't) */
   82        unsigned invert:1;      /* the '!' after the address */
   83        unsigned in_match:1;    /* Next line also included in match? */
   84        unsigned sub_p:1;       /* (s) print option */
   85
   86        char sw_last_char;      /* Last line written by (sw) had no '\n'
                                      * (initialised to '\n' when the 'w' file is opened) */
   87
   88        /* GENERAL FIELDS */
   89        char cmd;               /* The command char: abcdDgGhHilnNpPqrstTwxy:={} */
   90} sed_cmd_t;
  91
     /* Characters that separate commands on a script line.  add_cmd() skips
      * runs of them between commands and parse_cmd_args() uses them to find
      * the end of a branch label. */
   92static const char semicolon_whitespace[] ALIGN1 = "; \n\r\t\v";
  93
     /* All mutable state of the applet, accessed through the G macro. */
   94struct globals {
   95        /* options */
             /* be_quiet is bumped by a "#n" script comment; regex_type is the
              * base flag set handed to xregcomp() for every compiled regex. */
   96        int be_quiet, regex_type;
             /* Stream that output lines and appended text are written to. */
   97        FILE *nonstdout;
             /* outname: file removed by cleanup_outname(); hold_space: heap
              * buffer released in sed_free_and_close_stuff(). */
   98        char *outname, *hold_space;
   99
  100        /* List of input files */
             /* current_input_file indexes input_file_list; entries below it
              * have already been closed by get_next_line(). */
  101        int input_file_count, current_input_file;
  102        FILE **input_file_list;
  103
             /* Match offsets filled in by regexec() during substitution:
              * [0] is the whole match, [1]..[9] the \1..\9 groups. */
  104        regmatch_t regmatch[10];
             /* Last regex used at runtime; consulted when an 's' command has
              * an empty match string. */
  105        regex_t *previous_regex_ptr;
  106
  107        /* linked list of sed commands */
             /* sed_cmd_head is a dummy node: the first real command is
              * sed_cmd_head.next.  sed_cmd_tail points at the last node. */
  108        sed_cmd_t sed_cmd_head, *sed_cmd_tail;
  109
  110        /* Linked list of append lines */
  111        llist_t *append_head;
  112
             /* Script text accumulated so far while a line ending in a
              * backslash waits for its continuation; NULL otherwise. */
  113        char *add_cmd_line;
  114
             /* Growable output buffer used while building a substituted line. */
  115        struct pipeline {
  116                char *buf;      /* Space to hold string */
  117                int idx;        /* Space used */
  118                int len;        /* Space allocated */
  119        } pipeline;
  120};
     /* G overlays struct globals on the shared bb_common_bufsiz1 buffer. */
  121#define G (*(struct globals*)&bb_common_bufsiz1)
     /* Declared but not defined in the visible source.
      * NOTE(review): presumably left undefined on purpose so that an oversized
      * struct globals turns into a link-time error - confirm. */
  122void BUG_sed_globals_too_big(void);
     /* Checks that struct globals fits into COMMON_BUFSIZE and makes the
      * command list empty by pointing the tail at the dummy head node. */
  123#define INIT_G() do { \
  124        if (sizeof(struct globals) > COMMON_BUFSIZE) \
  125                BUG_sed_globals_too_big(); \
  126        G.sed_cmd_tail = &G.sed_cmd_head; \
  127} while (0)
 128
 129
 130#if ENABLE_FEATURE_CLEAN_UP
 131static void sed_free_and_close_stuff(void)
 132{
 133        sed_cmd_t *sed_cmd = G.sed_cmd_head.next;
 134
 135        llist_free(G.append_head, free);
 136
 137        while (sed_cmd) {
 138                sed_cmd_t *sed_cmd_next = sed_cmd->next;
 139
 140                if (sed_cmd->sw_file)
 141                        xprint_and_close_file(sed_cmd->sw_file);
 142
 143                if (sed_cmd->beg_match) {
 144                        regfree(sed_cmd->beg_match);
 145                        free(sed_cmd->beg_match);
 146                }
 147                if (sed_cmd->end_match) {
 148                        regfree(sed_cmd->end_match);
 149                        free(sed_cmd->end_match);
 150                }
 151                if (sed_cmd->sub_match) {
 152                        regfree(sed_cmd->sub_match);
 153                        free(sed_cmd->sub_match);
 154                }
 155                free(sed_cmd->string);
 156                free(sed_cmd);
 157                sed_cmd = sed_cmd_next;
 158        }
 159
 160        free(G.hold_space);
 161
 162        while (G.current_input_file < G.input_file_count)
 163                fclose(G.input_file_list[G.current_input_file++]);
 164}
 165#else
 166void sed_free_and_close_stuff(void);
 167#endif
 168
 169/* If something bad happens during -i operation, delete temp file */
 170
 171static void cleanup_outname(void)
 172{
 173        if (G.outname) unlink(G.outname);
 174}
 175
 176/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */
 177
 178static void parse_escapes(char *dest, const char *string, int len, char from, char to)
 179{
 180        int i = 0;
 181
 182        while (i < len) {
 183                if (string[i] == '\\') {
 184                        if (!to || string[i+1] == from) {
 185                                *dest++ = to ? to : string[i+1];
 186                                i += 2;
 187                                continue;
 188                        }
 189                        *dest++ = string[i++];
 190                }
 191                /* TODO: is it safe wrt a string with trailing '\\' ? */
 192                *dest++ = string[i++];
 193        }
 194        *dest = '\0';
 195}
 196
 197static char *copy_parsing_escapes(const char *string, int len)
 198{
 199        char *dest = xmalloc(len + 1);
 200
 201        parse_escapes(dest, string, len, 'n', '\n');
 202        /* GNU sed also recognizes \t */
 203        parse_escapes(dest, dest, strlen(dest), 't', '\t');
 204        return dest;
 205}
 206
 207
 208/*
 209 * index_of_next_unescaped_regexp_delim - walks left to right through a string
 210 * beginning at a specified index and returns the index of the next regular
 211 * expression delimiter (typically a forward slash ('/')) not preceded by
 212 * a backslash ('\').  A negative delimiter disables square bracket checking.
 213 */
 214static int index_of_next_unescaped_regexp_delim(int delimiter, const char *str)
 215{
 216        int bracket = -1;
 217        int escaped = 0;
 218        int idx = 0;
 219        char ch;
 220
 221        if (delimiter < 0) {
 222                bracket--;
 223                delimiter = -delimiter;
 224        }
 225
 226        for (; (ch = str[idx]); idx++) {
 227                if (bracket >= 0) {
 228                        if (ch == ']' && !(bracket == idx - 1 || (bracket == idx - 2
 229                                        && str[idx - 1] == '^')))
 230                                bracket = -1;
 231                } else if (escaped)
 232                        escaped = 0;
 233                else if (ch == '\\')
 234                        escaped = 1;
 235                else if (bracket == -1 && ch == '[')
 236                        bracket = idx;
 237                else if (ch == delimiter)
 238                        return idx;
 239        }
 240
 241        /* if we make it to here, we've hit the end of the string */
 242        bb_error_msg_and_die("unmatched '%c'", delimiter);
 243}
 244
 245/*
 246 *  Returns the index of the third delimiter
 247 */
 248static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
 249{
 250        const char *cmdstr_ptr = cmdstr;
 251        char delimiter;
 252        int idx = 0;
 253
 254        /* verify that the 's' or 'y' is followed by something.  That something
 255         * (typically a 'slash') is now our regexp delimiter... */
 256        if (*cmdstr == '\0')
 257                bb_error_msg_and_die("bad format in substitution expression");
 258        delimiter = *cmdstr_ptr++;
 259
 260        /* save the match string */
 261        idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
 262        *match = copy_parsing_escapes(cmdstr_ptr, idx);
 263
 264        /* save the replacement string */
 265        cmdstr_ptr += idx + 1;
 266        idx = index_of_next_unescaped_regexp_delim(-delimiter, cmdstr_ptr);
 267        *replace = copy_parsing_escapes(cmdstr_ptr, idx);
 268
 269        return ((cmdstr_ptr - cmdstr) + idx);
 270}
 271
 272/*
 273 * returns the index in the string just past where the address ends.
 274 */
 275static int get_address(const char *my_str, int *linenum, regex_t ** regex)
 276{
 277        const char *pos = my_str;
 278
 279        if (isdigit(*my_str)) {
 280                *linenum = strtol(my_str, (char**)&pos, 10);
 281                /* endstr shouldnt ever equal NULL */
 282        } else if (*my_str == '$') {
 283                *linenum = -1;
 284                pos++;
 285        } else if (*my_str == '/' || *my_str == '\\') {
 286                int next;
 287                char delimiter;
 288                char *temp;
 289
 290                delimiter = '/';
 291                if (*my_str == '\\') delimiter = *++pos;
 292                next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
 293                temp = copy_parsing_escapes(pos, next);
 294                *regex = xmalloc(sizeof(regex_t));
 295                xregcomp(*regex, temp, G.regex_type|REG_NEWLINE);
 296                free(temp);
 297                /* Move position to next character after last delimiter */
 298                pos += (next+1);
 299        }
 300        return pos - my_str;
 301}
 302
 303/* Grab a filename.  Whitespace at start is skipped, then goes to EOL. */
 304static int parse_file_cmd(/*sed_cmd_t *sed_cmd,*/ const char *filecmdstr, char **retval)
 305{
 306        int start = 0, idx, hack = 0;
 307
 308        /* Skip whitespace, then grab filename to end of line */
 309        while (isspace(filecmdstr[start]))
 310                start++;
 311        idx = start;
 312        while (filecmdstr[idx] && filecmdstr[idx] != '\n')
 313                idx++;
 314
 315        /* If lines glued together, put backslash back. */
 316        if (filecmdstr[idx] == '\n')
 317                hack = 1;
 318        if (idx == start)
 319                bb_error_msg_and_die("empty filename");
 320        *retval = xstrndup(filecmdstr+start, idx-start+hack+1);
 321        if (hack)
 322                (*retval)[idx] = '\\';
 323
 324        return idx;
 325}
 326
 327static int parse_subst_cmd(sed_cmd_t *sed_cmd, const char *substr)
 328{
 329        int cflags = G.regex_type;
 330        char *match;
 331        int idx;
 332
 333        /*
 334         * A substitution command should look something like this:
 335         *    s/match/replace/ #gIpw
 336         *    ||     |        |||
 337         *    mandatory       optional
 338         */
 339        idx = parse_regex_delim(substr, &match, &sed_cmd->string);
 340
 341        /* determine the number of back references in the match string */
 342        /* Note: we compute this here rather than in the do_subst_command()
 343         * function to save processor time, at the expense of a little more memory
 344         * (4 bits) per sed_cmd */
 345
 346        /* process the flags */
 347
 348        sed_cmd->which_match = 1;
 349        while (substr[++idx]) {
 350                /* Parse match number */
 351                if (isdigit(substr[idx])) {
 352                        if (match[0] != '^') {
 353                                /* Match 0 treated as all, multiple matches we take the last one. */
 354                                const char *pos = substr + idx;
 355/* FIXME: error check? */
 356                                sed_cmd->which_match = (unsigned)strtol(substr+idx, (char**) &pos, 10);
 357                                idx = pos - substr;
 358                        }
 359                        continue;
 360                }
 361                /* Skip spaces */
 362                if (isspace(substr[idx])) continue;
 363
 364                switch (substr[idx]) {
 365                /* Replace all occurrences */
 366                case 'g':
 367                        if (match[0] != '^')
 368                                sed_cmd->which_match = 0;
 369                        break;
 370                /* Print pattern space */
 371                case 'p':
 372                        sed_cmd->sub_p = 1;
 373                        break;
 374                /* Write to file */
 375                case 'w':
 376                {
 377                        char *temp;
 378                        idx += parse_file_cmd(/*sed_cmd,*/ substr+idx, &temp);
 379                        break;
 380                }
 381                /* Ignore case (gnu exension) */
 382                case 'I':
 383                        cflags |= REG_ICASE;
 384                        break;
 385                /* Comment */
 386                case '#':
 387                        while (substr[++idx]) /*skip all*/;
 388                        /* Fall through */
 389                /* End of command */
 390                case ';':
 391                case '}':
 392                        goto out;
 393                default:
 394                        bb_error_msg_and_die("bad option in substitution expression");
 395                }
 396        }
 397out:
 398        /* compile the match string into a regex */
 399        if (*match != '\0') {
 400                /* If match is empty, we use last regex used at runtime */
 401                sed_cmd->sub_match = xmalloc(sizeof(regex_t));
 402                xregcomp(sed_cmd->sub_match, match, cflags);
 403        }
 404        free(match);
 405
 406        return idx;
 407}
 408
 409/*
 410 *  Process the commands arguments
 411 */
 412static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
 413{
 414        /* handle (s)ubstitution command */
 415        if (sed_cmd->cmd == 's')
 416                cmdstr += parse_subst_cmd(sed_cmd, cmdstr);
 417        /* handle edit cmds: (a)ppend, (i)nsert, and (c)hange */
 418        else if (strchr("aic", sed_cmd->cmd)) {
 419                if ((sed_cmd->end_line || sed_cmd->end_match) && sed_cmd->cmd != 'c')
 420                        bb_error_msg_and_die
 421                                ("only a beginning address can be specified for edit commands");
 422                for (;;) {
 423                        if (*cmdstr == '\n' || *cmdstr == '\\') {
 424                                cmdstr++;
 425                                break;
 426                        } else if (isspace(*cmdstr))
 427                                cmdstr++;
 428                        else
 429                                break;
 430                }
 431                sed_cmd->string = xstrdup(cmdstr);
 432                /* "\anychar" -> "anychar" */
 433                parse_escapes(sed_cmd->string, sed_cmd->string, strlen(cmdstr), '\0', '\0');
 434                cmdstr += strlen(cmdstr);
 435        /* handle file cmds: (r)ead */
 436        } else if (strchr("rw", sed_cmd->cmd)) {
 437                if (sed_cmd->end_line || sed_cmd->end_match)
 438                        bb_error_msg_and_die("command only uses one address");
 439                cmdstr += parse_file_cmd(/*sed_cmd,*/ cmdstr, &sed_cmd->string);
 440                if (sed_cmd->cmd == 'w') {
 441                        sed_cmd->sw_file = xfopen_for_write(sed_cmd->string);
 442                        sed_cmd->sw_last_char = '\n';
 443                }
 444        /* handle branch commands */
 445        } else if (strchr(":btT", sed_cmd->cmd)) {
 446                int length;
 447
 448                cmdstr = skip_whitespace(cmdstr);
 449                length = strcspn(cmdstr, semicolon_whitespace);
 450                if (length) {
 451                        sed_cmd->string = xstrndup(cmdstr, length);
 452                        cmdstr += length;
 453                }
 454        }
 455        /* translation command */
 456        else if (sed_cmd->cmd == 'y') {
 457                char *match, *replace;
 458                int i = cmdstr[0];
 459
 460                cmdstr += parse_regex_delim(cmdstr, &match, &replace)+1;
 461                /* \n already parsed, but \delimiter needs unescaping. */
 462                parse_escapes(match, match, strlen(match), i, i);
 463                parse_escapes(replace, replace, strlen(replace), i, i);
 464
 465                sed_cmd->string = xzalloc((strlen(match) + 1) * 2);
 466                for (i = 0; match[i] && replace[i]; i++) {
 467                        sed_cmd->string[i*2] = match[i];
 468                        sed_cmd->string[i*2+1] = replace[i];
 469                }
 470                free(match);
 471                free(replace);
 472        }
 473        /* if it wasnt a single-letter command that takes no arguments
 474         * then it must be an invalid command.
 475         */
 476        else if (strchr("dDgGhHlnNpPqx={}", sed_cmd->cmd) == 0) {
 477                bb_error_msg_and_die("unsupported command %c", sed_cmd->cmd);
 478        }
 479
 480        /* give back whatever's left over */
 481        return cmdstr;
 482}
 483
 484
 485/* Parse address+command sets, skipping comment lines. */
 486
 487static void add_cmd(const char *cmdstr)
 488{
 489        sed_cmd_t *sed_cmd;
 490        int temp;
 491
 492        /* Append this line to any unfinished line from last time. */
 493        if (G.add_cmd_line) {
 494                char *tp = xasprintf("%s\n%s", G.add_cmd_line, cmdstr);
 495                free(G.add_cmd_line);
 496                cmdstr = G.add_cmd_line = tp;
 497        }
 498
 499        /* If this line ends with backslash, request next line. */
 500        temp = strlen(cmdstr);
 501        if (temp && cmdstr[--temp] == '\\') {
 502                if (!G.add_cmd_line)
 503                        G.add_cmd_line = xstrdup(cmdstr);
 504                G.add_cmd_line[temp] = '\0';
 505                return;
 506        }
 507
 508        /* Loop parsing all commands in this line. */
 509        while (*cmdstr) {
 510                /* Skip leading whitespace and semicolons */
 511                cmdstr += strspn(cmdstr, semicolon_whitespace);
 512
 513                /* If no more commands, exit. */
 514                if (!*cmdstr) break;
 515
 516                /* if this is a comment, jump past it and keep going */
 517                if (*cmdstr == '#') {
 518                        /* "#n" is the same as using -n on the command line */
 519                        if (cmdstr[1] == 'n')
 520                                G.be_quiet++;
 521                        cmdstr = strpbrk(cmdstr, "\n\r");
 522                        if (!cmdstr) break;
 523                        continue;
 524                }
 525
 526                /* parse the command
 527                 * format is: [addr][,addr][!]cmd
 528                 *            |----||-----||-|
 529                 *            part1 part2  part3
 530                 */
 531
 532                sed_cmd = xzalloc(sizeof(sed_cmd_t));
 533
 534                /* first part (if present) is an address: either a '$', a number or a /regex/ */
 535                cmdstr += get_address(cmdstr, &sed_cmd->beg_line, &sed_cmd->beg_match);
 536
 537                /* second part (if present) will begin with a comma */
 538                if (*cmdstr == ',') {
 539                        int idx;
 540
 541                        cmdstr++;
 542                        idx = get_address(cmdstr, &sed_cmd->end_line, &sed_cmd->end_match);
 543                        if (!idx)
 544                                bb_error_msg_and_die("no address after comma");
 545                        cmdstr += idx;
 546                }
 547
 548                /* skip whitespace before the command */
 549                cmdstr = skip_whitespace(cmdstr);
 550
 551                /* Check for inversion flag */
 552                if (*cmdstr == '!') {
 553                        sed_cmd->invert = 1;
 554                        cmdstr++;
 555
 556                        /* skip whitespace before the command */
 557                        cmdstr = skip_whitespace(cmdstr);
 558                }
 559
 560                /* last part (mandatory) will be a command */
 561                if (!*cmdstr)
 562                        bb_error_msg_and_die("missing command");
 563                sed_cmd->cmd = *(cmdstr++);
 564                cmdstr = parse_cmd_args(sed_cmd, cmdstr);
 565
 566                /* Add the command to the command array */
 567                G.sed_cmd_tail->next = sed_cmd;
 568                G.sed_cmd_tail = G.sed_cmd_tail->next;
 569        }
 570
 571        /* If we glued multiple lines together, free the memory. */
 572        free(G.add_cmd_line);
 573        G.add_cmd_line = NULL;
 574}
 575
 576/* Append to a string, reallocating memory as necessary. */
 577
 578#define PIPE_GROW 64
 579
 580static void pipe_putc(char c)
 581{
 582        if (G.pipeline.idx == G.pipeline.len) {
 583                G.pipeline.buf = xrealloc(G.pipeline.buf,
 584                                G.pipeline.len + PIPE_GROW);
 585                G.pipeline.len += PIPE_GROW;
 586        }
 587        G.pipeline.buf[G.pipeline.idx++] = c;
 588}
 589
 590static void do_subst_w_backrefs(char *line, char *replace)
 591{
 592        int i,j;
 593
 594        /* go through the replacement string */
 595        for (i = 0; replace[i]; i++) {
 596                /* if we find a backreference (\1, \2, etc.) print the backref'ed * text */
 597                if (replace[i] == '\\') {
 598                        unsigned backref = replace[++i] - '0';
 599                        if (backref <= 9) {
 600                                /* print out the text held in G.regmatch[backref] */
 601                                if (G.regmatch[backref].rm_so != -1) {
 602                                        j = G.regmatch[backref].rm_so;
 603                                        while (j < G.regmatch[backref].rm_eo)
 604                                                pipe_putc(line[j++]);
 605                                }
 606                                continue;
 607                        }
 608                        /* I _think_ it is impossible to get '\' to be
 609                         * the last char in replace string. Thus we dont check
 610                         * for replace[i] == NUL. (counterexample anyone?) */
 611                        /* if we find a backslash escaped character, print the character */
 612                        pipe_putc(replace[i]);
 613                        continue;
 614                }
 615                /* if we find an unescaped '&' print out the whole matched text. */
 616                if (replace[i] == '&') {
 617                        j = G.regmatch[0].rm_so;
 618                        while (j < G.regmatch[0].rm_eo)
 619                                pipe_putc(line[j++]);
 620                        continue;
 621                }
 622                /* Otherwise just output the character. */
 623                pipe_putc(replace[i]);
 624        }
 625}
 626
 627static int do_subst_command(sed_cmd_t *sed_cmd, char **line)
 628{
 629        char *oldline = *line;
 630        int altered = 0;
 631        unsigned match_count = 0;
 632        regex_t *current_regex;
 633
 634        /* Handle empty regex. */
 635        if (sed_cmd->sub_match == NULL) {
 636                current_regex = G.previous_regex_ptr;
 637                if (!current_regex)
 638                        bb_error_msg_and_die("no previous regexp");
 639        } else
 640                G.previous_regex_ptr = current_regex = sed_cmd->sub_match;
 641
 642        /* Find the first match */
 643        if (REG_NOMATCH == regexec(current_regex, oldline, 10, G.regmatch, 0))
 644                return 0;
 645
 646        /* Initialize temporary output buffer. */
 647        G.pipeline.buf = xmalloc(PIPE_GROW);
 648        G.pipeline.len = PIPE_GROW;
 649        G.pipeline.idx = 0;
 650
 651        /* Now loop through, substituting for matches */
 652        do {
 653                int i;
 654
 655                /* Work around bug in glibc regexec, demonstrated by:
 656                   echo " a.b" | busybox sed 's [^ .]* x g'
 657                   The match_count check is so not to break
 658                   echo "hi" | busybox sed 's/^/!/g' */
 659                if (!G.regmatch[0].rm_so && !G.regmatch[0].rm_eo && match_count) {
 660                        pipe_putc(*oldline++);
 661                        continue;
 662                }
 663
 664                match_count++;
 665
 666                /* If we aren't interested in this match, output old line to
 667                   end of match and continue */
 668                if (sed_cmd->which_match
 669                 && (sed_cmd->which_match != match_count)
 670                ) {
 671                        for (i = 0; i < G.regmatch[0].rm_eo; i++)
 672                                pipe_putc(*oldline++);
 673                        continue;
 674                }
 675
 676                /* print everything before the match */
 677                for (i = 0; i < G.regmatch[0].rm_so; i++)
 678                        pipe_putc(oldline[i]);
 679
 680                /* then print the substitution string */
 681                do_subst_w_backrefs(oldline, sed_cmd->string);
 682
 683                /* advance past the match */
 684                oldline += G.regmatch[0].rm_eo;
 685                /* flag that something has changed */
 686                altered++;
 687
 688                /* if we're not doing this globally, get out now */
 689                if (sed_cmd->which_match)
 690                        break;
 691        } while (*oldline && (regexec(current_regex, oldline, 10, G.regmatch, 0) != REG_NOMATCH));
 692
 693        /* Copy rest of string into output pipeline */
 694
 695        while (*oldline)
 696                pipe_putc(*oldline++);
 697        pipe_putc(0);
 698
 699        free(*line);
 700        *line = G.pipeline.buf;
 701        return altered;
 702}
 703
 704/* Set command pointer to point to this label.  (Does not handle null label.) */
 705static sed_cmd_t *branch_to(char *label)
 706{
 707        sed_cmd_t *sed_cmd;
 708
 709        for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
 710                if (sed_cmd->cmd == ':' && sed_cmd->string && !strcmp(sed_cmd->string, label)) {
 711                        return sed_cmd;
 712                }
 713        }
 714        bb_error_msg_and_die("can't find label for jump to '%s'", label);
 715}
 716
 717static void append(char *s)
 718{
 719        llist_add_to_end(&G.append_head, xstrdup(s));
 720}
 721
 722static void flush_append(void)
 723{
 724        char *data;
 725
 726        /* Output appended lines. */
 727        while ((data = (char *)llist_pop(&G.append_head))) {
 728                fprintf(G.nonstdout, "%s\n", data);
 729                free(data);
 730        }
 731}
 732
 733static void add_input_file(FILE *file)
 734{
 735        G.input_file_list = xrealloc_vector(G.input_file_list, 2, G.input_file_count);
 736        G.input_file_list[G.input_file_count++] = file;
 737}
 738
 739/* Get next line of input from G.input_file_list, flushing append buffer and
 740 * noting if we ran out of files without a newline on the last line we read.
 741 */
 742enum {
 743        NO_EOL_CHAR = 1,
 744        LAST_IS_NUL = 2,
 745};
 746static char *get_next_line(char *gets_char)
 747{
 748        char *temp = NULL;
 749        int len;
 750        char gc;
 751
 752        flush_append();
 753
 754        /* will be returned if last line in the file
 755         * doesn't end with either '\n' or '\0' */
 756        gc = NO_EOL_CHAR;
 757        while (G.current_input_file < G.input_file_count) {
 758                FILE *fp = G.input_file_list[G.current_input_file];
 759                /* Read line up to a newline or NUL byte, inclusive,
 760                 * return malloc'ed char[]. length of the chunk read
 761                 * is stored in len. NULL if EOF/error */
 762                temp = bb_get_chunk_from_file(fp, &len);
 763                if (temp) {
 764                        /* len > 0 here, it's ok to do temp[len-1] */
 765                        char c = temp[len-1];
 766                        if (c == '\n' || c == '\0') {
 767                                temp[len-1] = '\0';
 768                                gc = c;
 769                                if (c == '\0') {
 770                                        int ch = fgetc(fp);
 771                                        if (ch != EOF)
 772                                                ungetc(ch, fp);
 773                                        else
 774                                                gc = LAST_IS_NUL;
 775                                }
 776                        }
 777                        /* else we put NO_EOL_CHAR into *gets_char */
 778                        break;
 779
 780                /* NB: I had the idea of peeking next file(s) and returning
 781                 * NO_EOL_CHAR only if it is the *last* non-empty
 782                 * input file. But there is a case where this won't work:
 783                 * file1: "a woo\nb woo"
 784                 * file2: "c no\nd no"
 785                 * sed -ne 's/woo/bang/p' input1 input2 => "a bang\nb bang"
 786                 * (note: *no* newline after "b bang"!) */
 787                }
 788                /* Close this file and advance to next one */
 789                fclose(fp);
 790                G.current_input_file++;
 791        }
 792        *gets_char = gc;
 793        return temp;
 794}
 795
 796/* Output line of text. */
 797/* Note:
 798 * The tricks with NO_EOL_CHAR and last_puts_char are there to emulate gnu sed.
 799 * Without them, we had this:
 800 * echo -n thingy >z1
 801 * echo -n again >z2
 802 * >znull
 803 * sed "s/i/z/" z1 z2 znull | hexdump -vC
 804 * output:
 805 * gnu sed 4.1.5:
 806 * 00000000  74 68 7a 6e 67 79 0a 61  67 61 7a 6e              |thzngy.agazn|
 807 * bbox:
 808 * 00000000  74 68 7a 6e 67 79 61 67  61 7a 6e                 |thzngyagazn|
 809 */
 810static void puts_maybe_newline(char *s, FILE *file, char *last_puts_char, char last_gets_char)
 811{
 812        char lpc = *last_puts_char;
 813
 814        /* Need to insert a '\n' between two files because first file's
 815         * last line wasn't terminated? */
 816        if (lpc != '\n' && lpc != '\0') {
 817                fputc('\n', file);
 818                lpc = '\n';
 819        }
 820        fputs(s, file);
 821
 822        /* 'x' - just something which is not '\n', '\0' or NO_EOL_CHAR */
 823        if (s[0])
 824                lpc = 'x';
 825
 826        /* had trailing '\0' and it was last char of file? */
 827        if (last_gets_char == LAST_IS_NUL) {
 828                fputc('\0', file);
 829                lpc = 'x'; /* */
 830        } else
 831        /* had trailing '\n' or '\0'? */
 832        if (last_gets_char != NO_EOL_CHAR) {
 833                fputc(last_gets_char, file);
 834                lpc = last_gets_char;
 835        }
 836
 837        if (ferror(file)) {
 838                xfunc_error_retval = 4;  /* It's what gnu sed exits with... */
 839                bb_error_msg_and_die(bb_msg_write_error);
 840        }
 841        *last_puts_char = lpc;
 842}
 843
 844#define sed_puts(s, n) (puts_maybe_newline(s, G.nonstdout, &last_puts_char, n))
 845
 846static int beg_match(sed_cmd_t *sed_cmd, const char *pattern_space)
 847{
 848        int retval = sed_cmd->beg_match && !regexec(sed_cmd->beg_match, pattern_space, 0, NULL, 0);
 849        if (retval)
 850                G.previous_regex_ptr = sed_cmd->beg_match;
 851        return retval;
 852}
 853
 854/* Process all the lines in all the files */
 855
 856static void process_files(void)
 857{
 858        char *pattern_space, *next_line;
 859        int linenum = 0;
 860        char last_puts_char = '\n';
 861        char last_gets_char, next_gets_char;
 862        sed_cmd_t *sed_cmd;
 863        int substituted;
 864
 865        /* Prime the pump */
 866        next_line = get_next_line(&next_gets_char);
 867
 868        /* go through every line in each file */
 869 again:
 870        substituted = 0;
 871
 872        /* Advance to next line.  Stop if out of lines. */
 873        pattern_space = next_line;
 874        if (!pattern_space) return;
 875        last_gets_char = next_gets_char;
 876
 877        /* Read one line in advance so we can act on the last line,
 878         * the '$' address */
 879        next_line = get_next_line(&next_gets_char);
 880        linenum++;
 881 restart:
 882        /* for every line, go through all the commands */
 883        for (sed_cmd = G.sed_cmd_head.next; sed_cmd; sed_cmd = sed_cmd->next) {
 884                int old_matched, matched;
 885
 886                old_matched = sed_cmd->in_match;
 887
 888                /* Determine if this command matches this line: */
 889
 890                /* Are we continuing a previous multi-line match? */
 891                sed_cmd->in_match = sed_cmd->in_match
 892                        /* Or is no range necessary? */
 893                        || (!sed_cmd->beg_line && !sed_cmd->end_line
 894                                && !sed_cmd->beg_match && !sed_cmd->end_match)
 895                        /* Or did we match the start of a numerical range? */
 896                        || (sed_cmd->beg_line > 0 && (sed_cmd->beg_line == linenum))
 897                        /* Or does this line match our begin address regex? */
 898                        || (beg_match(sed_cmd, pattern_space))
 899                        /* Or did we match last line of input? */
 900                        || (sed_cmd->beg_line == -1 && next_line == NULL);
 901
 902                /* Snapshot the value */
 903
 904                matched = sed_cmd->in_match;
 905
 906                /* Is this line the end of the current match? */
 907
 908                if (matched) {
 909                        sed_cmd->in_match = !(
 910                                /* has the ending line come, or is this a single address command? */
 911                                (sed_cmd->end_line ?
 912                                        sed_cmd->end_line == -1 ?
 913                                                !next_line
 914                                                : (sed_cmd->end_line <= linenum)
 915                                        : !sed_cmd->end_match
 916                                )
 917                                /* or does this line matches our last address regex */
 918                                || (sed_cmd->end_match && old_matched
 919                                     && (regexec(sed_cmd->end_match,
 920                                                 pattern_space, 0, NULL, 0) == 0))
 921                        );
 922                }
 923
 924                /* Skip blocks of commands we didn't match. */
 925                if (sed_cmd->cmd == '{') {
 926                        if (sed_cmd->invert ? matched : !matched) {
 927                                while (sed_cmd->cmd != '}') {
 928                                        sed_cmd = sed_cmd->next;
 929                                        if (!sed_cmd)
 930                                                bb_error_msg_and_die("unterminated {");
 931                                }
 932                        }
 933                        continue;
 934                }
 935
 936                /* Okay, so did this line match? */
 937                if (sed_cmd->invert ? !matched : matched) {
 938                        /* Update last used regex in case a blank substitute BRE is found */
 939                        if (sed_cmd->beg_match) {
 940                                G.previous_regex_ptr = sed_cmd->beg_match;
 941                        }
 942
 943                        /* actual sedding */
 944                        switch (sed_cmd->cmd) {
 945
 946                        /* Print line number */
 947                        case '=':
 948                                fprintf(G.nonstdout, "%d\n", linenum);
 949                                break;
 950
 951                        /* Write the current pattern space up to the first newline */
 952                        case 'P':
 953                        {
 954                                char *tmp = strchr(pattern_space, '\n');
 955
 956                                if (tmp) {
 957                                        *tmp = '\0';
 958                                        /* TODO: explain why '\n' below */
 959                                        sed_puts(pattern_space, '\n');
 960                                        *tmp = '\n';
 961                                        break;
 962                                }
 963                                /* Fall Through */
 964                        }
 965
 966                        /* Write the current pattern space to output */
 967                        case 'p':
 968                                /* NB: we print this _before_ the last line
 969                                 * (of current file) is printed. Even if
 970                                 * that line is nonterminated, we print
 971                                 * '\n' here (gnu sed does the same) */
 972                                sed_puts(pattern_space, '\n');
 973                                break;
 974                        /* Delete up through first newline */
 975                        case 'D':
 976                        {
 977                                char *tmp = strchr(pattern_space, '\n');
 978
 979                                if (tmp) {
 980                                        tmp = xstrdup(tmp+1);
 981                                        free(pattern_space);
 982                                        pattern_space = tmp;
 983                                        goto restart;
 984                                }
 985                        }
 986                        /* discard this line. */
 987                        case 'd':
 988                                goto discard_line;
 989
 990                        /* Substitute with regex */
 991                        case 's':
 992                                if (!do_subst_command(sed_cmd, &pattern_space))
 993                                        break;
 994                                substituted |= 1;
 995
 996                                /* handle p option */
 997                                if (sed_cmd->sub_p)
 998                                        sed_puts(pattern_space, last_gets_char);
 999                                /* handle w option */
1000                                if (sed_cmd->sw_file)
1001                                        puts_maybe_newline(
1002                                                pattern_space, sed_cmd->sw_file,
1003                                                &sed_cmd->sw_last_char, last_gets_char);
1004                                break;
1005
1006                        /* Append line to linked list to be printed later */
1007                        case 'a':
1008                                append(sed_cmd->string);
1009                                break;
1010
1011                        /* Insert text before this line */
1012                        case 'i':
1013                                sed_puts(sed_cmd->string, '\n');
1014                                break;
1015
1016                        /* Cut and paste text (replace) */
1017                        case 'c':
1018                                /* Only triggers on last line of a matching range. */
1019                                if (!sed_cmd->in_match)
1020                                        sed_puts(sed_cmd->string, NO_EOL_CHAR);
1021                                goto discard_line;
1022
1023                        /* Read file, append contents to output */
1024                        case 'r':
1025                        {
1026                                FILE *rfile;
1027
1028                                rfile = fopen_for_read(sed_cmd->string);
1029                                if (rfile) {
1030                                        char *line;
1031
1032                                        while ((line = xmalloc_fgetline(rfile))
1033                                                        != NULL)
1034                                                append(line);
1035                                        xprint_and_close_file(rfile);
1036                                }
1037
1038                                break;
1039                        }
1040
1041                        /* Write pattern space to file. */
1042                        case 'w':
1043                                puts_maybe_newline(
1044                                        pattern_space, sed_cmd->sw_file,
1045                                        &sed_cmd->sw_last_char, last_gets_char);
1046                                break;
1047
1048                        /* Read next line from input */
1049                        case 'n':
1050                                if (!G.be_quiet)
1051                                        sed_puts(pattern_space, last_gets_char);
1052                                if (next_line) {
1053                                        free(pattern_space);
1054                                        pattern_space = next_line;
1055                                        last_gets_char = next_gets_char;
1056                                        next_line = get_next_line(&next_gets_char);
1057                                        substituted = 0;
1058                                        linenum++;
1059                                        break;
1060                                }
1061                                /* fall through */
1062
1063                        /* Quit.  End of script, end of input. */
1064                        case 'q':
1065                                /* Exit the outer while loop */
1066                                free(next_line);
1067                                next_line = NULL;
1068                                goto discard_commands;
1069
1070                        /* Append the next line to the current line */
1071                        case 'N':
1072                        {
1073                                int len;
1074                                /* If no next line, jump to end of script and exit. */
1075                                if (next_line == NULL) {
1076                                        /* Jump to end of script and exit */
1077                                        free(next_line);
1078                                        next_line = NULL;
1079                                        goto discard_line;
1080                                /* append next_line, read new next_line. */
1081                                }
1082                                len = strlen(pattern_space);
1083                                pattern_space = realloc(pattern_space, len + strlen(next_line) + 2);
1084                                pattern_space[len] = '\n';
1085                                strcpy(pattern_space + len+1, next_line);
1086                                last_gets_char = next_gets_char;
1087                                next_line = get_next_line(&next_gets_char);
1088                                linenum++;
1089                                break;
1090                        }
1091
1092                        /* Test/branch if substitution occurred */
1093                        case 't':
1094                                if (!substituted) break;
1095                                substituted = 0;
1096                                /* Fall through */
1097                        /* Test/branch if substitution didn't occur */
1098                        case 'T':
1099                                if (substituted) break;
1100                                /* Fall through */
1101                        /* Branch to label */
1102                        case 'b':
1103                                if (!sed_cmd->string) goto discard_commands;
1104                                else sed_cmd = branch_to(sed_cmd->string);
1105                                break;
1106                        /* Transliterate characters */
1107                        case 'y':
1108                        {
1109                                int i, j;
1110
1111                                for (i = 0; pattern_space[i]; i++) {
1112                                        for (j = 0; sed_cmd->string[j]; j += 2) {
1113                                                if (pattern_space[i] == sed_cmd->string[j]) {
1114                                                        pattern_space[i] = sed_cmd->string[j + 1];
1115                                                        break;
1116                                                }
1117                                        }
1118                                }
1119
1120                                break;
1121                        }
1122                        case 'g':       /* Replace pattern space with hold space */
1123                                free(pattern_space);
1124                                pattern_space = xstrdup(G.hold_space ? G.hold_space : "");
1125                                break;
1126                        case 'G':       /* Append newline and hold space to pattern space */
1127                        {
1128                                int pattern_space_size = 2;
1129                                int hold_space_size = 0;
1130
1131                                if (pattern_space)
1132                                        pattern_space_size += strlen(pattern_space);
1133                                if (G.hold_space)
1134                                        hold_space_size = strlen(G.hold_space);
1135                                pattern_space = xrealloc(pattern_space,
1136                                                pattern_space_size + hold_space_size);
1137                                if (pattern_space_size == 2)
1138                                        pattern_space[0] = 0;
1139                                strcat(pattern_space, "\n");
1140                                if (G.hold_space)
1141                                        strcat(pattern_space, G.hold_space);
1142                                last_gets_char = '\n';
1143
1144                                break;
1145                        }
1146                        case 'h':       /* Replace hold space with pattern space */
1147                                free(G.hold_space);
1148                                G.hold_space = xstrdup(pattern_space);
1149                                break;
1150                        case 'H':       /* Append newline and pattern space to hold space */
1151                        {
1152                                int hold_space_size = 2;
1153                                int pattern_space_size = 0;
1154
1155                                if (G.hold_space)
1156                                        hold_space_size += strlen(G.hold_space);
1157                                if (pattern_space)
1158                                        pattern_space_size = strlen(pattern_space);
1159                                G.hold_space = xrealloc(G.hold_space,
1160                                                hold_space_size + pattern_space_size);
1161
1162                                if (hold_space_size == 2)
1163                                        *G.hold_space = 0;
1164                                strcat(G.hold_space, "\n");
1165                                if (pattern_space)
1166                                        strcat(G.hold_space, pattern_space);
1167
1168                                break;
1169                        }
1170                        case 'x': /* Exchange hold and pattern space */
1171                        {
1172                                char *tmp = pattern_space;
1173                                pattern_space = G.hold_space ? : xzalloc(1);
1174                                last_gets_char = '\n';
1175                                G.hold_space = tmp;
1176                                break;
1177                        }
1178                        }
1179                }
1180        }
1181
1182        /*
1183         * exit point from sedding...
1184         */
1185 discard_commands:
1186        /* we will print the line unless we were told to be quiet ('-n')
1187           or if the line was suppressed (ala 'd'elete) */
1188        if (!G.be_quiet)
1189                sed_puts(pattern_space, last_gets_char);
1190
1191        /* Delete and such jump here. */
1192 discard_line:
1193        flush_append();
1194        free(pattern_space);
1195
1196        goto again;
1197}
1198
/* It is possible to have a command line argument with embedded
 * newlines.  This counts as multiple command lines.
 * However, newline can be escaped: 's/e/z\<newline>z/'
 * We check for this.
 *
 * Works on a private copy of cmdstr.  Each piece ending at an unescaped
 * newline is handed to add_cmd().  A newline preceded by an odd number
 * of backslashes is escaped: the text from the newline on is moved one
 * position left over the last backslash (via overlapping_strcpy) and the
 * newline stays part of the current piece.
 */
static void add_cmd_block(char *cmdstr)
{
	char *copy = xstrdup(cmdstr);
	char *piece = copy;

	for (;;) {
		char *nl = strchr(piece, '\n');

		/* Step over escaped newlines */
		while (nl) {
			int backslashes = 0;
			char *p = nl;

			/* Count backslashes directly before the newline */
			while (p != piece && *--p == '\\')
				backslashes++;

			/* Even count: this newline really ends the piece */
			if ((backslashes & 1) == 0)
				break;

			/* Odd count: drop one backslash, keep the newline,
			 * continue searching after it */
			overlapping_strcpy(nl - 1, nl);
			nl = strchr(nl, '\n');
		}

		if (nl)
			*nl = '\0';
		add_cmd(piece);

		if (!nl)
			break;
		piece = nl + 1;
	}

	free(copy);
}
1233
1234int sed_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
1235int sed_main(int argc UNUSED_PARAM, char **argv)
1236{
1237        enum {
1238                OPT_in_place = 1 << 0,
1239        };
1240        unsigned opt;
1241        llist_t *opt_e, *opt_f;
1242        int status = EXIT_SUCCESS;
1243
1244        INIT_G();
1245
1246        /* destroy command strings on exit */
1247        if (ENABLE_FEATURE_CLEAN_UP) atexit(sed_free_and_close_stuff);
1248
1249        /* Lie to autoconf when it starts asking stupid questions. */
1250        if (argv[1] && !strcmp(argv[1], "--version")) {
1251                puts("This is not GNU sed version 4.0");
1252                return 0;
1253        }
1254
1255        /* do normal option parsing */
1256        opt_e = opt_f = NULL;
1257        opt_complementary = "e::f::" /* can occur multiple times */
1258                            "nn"; /* count -n */
1259        opt = getopt32(argv, "irne:f:", &opt_e, &opt_f,
1260                            &G.be_quiet); /* counter for -n */
1261        //argc -= optind;
1262        argv += optind;
1263        if (opt & OPT_in_place) { // -i
1264                atexit(cleanup_outname);
1265        }
1266        if (opt & 0x2) G.regex_type |= REG_EXTENDED; // -r
1267        //if (opt & 0x4) G.be_quiet++; // -n
1268        while (opt_e) { // -e
1269                add_cmd_block(llist_pop(&opt_e));
1270        }
1271        while (opt_f) { // -f
1272                char *line;
1273                FILE *cmdfile;
1274                cmdfile = xfopen_for_read(llist_pop(&opt_f));
1275                while ((line = xmalloc_fgetline(cmdfile)) != NULL) {
1276                        add_cmd(line);
1277                        free(line);
1278                }
1279                fclose(cmdfile);
1280        }
1281        /* if we didn't get a pattern from -e or -f, use argv[0] */
1282        if (!(opt & 0x18)) {
1283                if (!*argv)
1284                        bb_show_usage();
1285                add_cmd_block(*argv++);
1286        }
1287        /* Flush any unfinished commands. */
1288        add_cmd("");
1289
1290        /* By default, we write to stdout */
1291        G.nonstdout = stdout;
1292
1293        /* argv[0..(argc-1)] should be names of file to process. If no
1294         * files were specified or '-' was specified, take input from stdin.
1295         * Otherwise, we process all the files specified. */
1296        if (argv[0] == NULL) {
1297                if (opt & OPT_in_place)
1298                        bb_error_msg_and_die(bb_msg_requires_arg, "-i");
1299                add_input_file(stdin);
1300                process_files();
1301        } else {
1302                int i;
1303                FILE *file;
1304
1305                for (i = 0; argv[i]; i++) {
1306                        struct stat statbuf;
1307                        int nonstdoutfd;
1308
1309                        if (LONE_DASH(argv[i]) && !(opt & OPT_in_place)) {
1310                                add_input_file(stdin);
1311                                process_files();
1312                                continue;
1313                        }
1314                        file = fopen_or_warn(argv[i], "r");
1315                        if (!file) {
1316                                status = EXIT_FAILURE;
1317                                continue;
1318                        }
1319                        if (!(opt & OPT_in_place)) {
1320                                add_input_file(file);
1321                                continue;
1322                        }
1323
1324                        G.outname = xasprintf("%sXXXXXX", argv[i]);
1325                        nonstdoutfd = mkstemp(G.outname);
1326                        if (-1 == nonstdoutfd)
1327                                bb_perror_msg_and_die("cannot create temp file %s", G.outname);
1328                        G.nonstdout = fdopen(nonstdoutfd, "w");
1329
1330                        /* Set permissions of output file */
1331
1332                        fstat(fileno(file), &statbuf);
1333                        fchmod(nonstdoutfd, statbuf.st_mode);
1334                        add_input_file(file);
1335                        process_files();
1336                        fclose(G.nonstdout);
1337
1338                        G.nonstdout = stdout;
1339                        /* unlink(argv[i]); */
1340                        xrename(G.outname, argv[i]);
1341                        free(G.outname);
1342                        G.outname = NULL;
1343                }
1344                if (G.input_file_count > G.current_input_file)
1345                        process_files();
1346        }
1347
1348        return status;
1349}
1350