/* busybox/coreutils/tr.c */
/* vi: set sw=4 ts=4: */
/*
 * Mini tr implementation for busybox
 *
 ** Copyright (c) 1987,1997, Prentice Hall   All rights reserved.
 *
 * The name of Prentice Hall may not be used to endorse or promote
 * products derived from this software without specific prior
 * written permission.
 *
 * Copyright (c) Michiel Huisjes
 *
 * This version of tr is adapted from Minix tr and was modified
 * by Erik Andersen <andersen@codepoet.org> to be used in busybox.
 *
 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
 */
/* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html
 * TODO: graph, print
 */
  21#include "libbb.h"
  22
  23enum {
  24        ASCII = 256,
  25        /* string buffer needs to be at least as big as the whole "alphabet".
  26         * BUFSIZ == ASCII is ok, but we will realloc in expand
  27         * even for smallest patterns, let's avoid that by using *2:
  28         */
  29        TR_BUFSIZ = (BUFSIZ > ASCII*2) ? BUFSIZ : ASCII*2,
  30};
  31
  32static void map(char *pvector,
  33                char *string1, unsigned string1_len,
  34                char *string2, unsigned string2_len)
  35{
  36        char last = '0';
  37        unsigned i, j;
  38
  39        for (j = 0, i = 0; i < string1_len; i++) {
  40                if (string2_len <= j)
  41                        pvector[(unsigned char)(string1[i])] = last;
  42                else
  43                        pvector[(unsigned char)(string1[i])] = last = string2[j++];
  44        }
  45}
  46
  47/* supported constructs:
  48 *   Ranges,  e.g.,  0-9   ==>  0123456789
  49 *   Escapes, e.g.,  \a    ==>  Control-G
  50 *   Character classes, e.g. [:upper:] ==> A...Z
  51 *   Equiv classess, e.g. [=A=] ==> A   (hmmmmmmm?)
  52 * not supported:
  53 *   \ooo-\ooo - octal ranges
  54 *   [x*N] - repeat char x N times
  55 *   [x*] - repeat char x until it fills STRING2:
  56 * # echo qwe123 | /usr/bin/tr 123456789 '[d]'
  57 * qwe[d]
  58 * # echo qwe123 | /usr/bin/tr 123456789 '[d*]'
  59 * qweddd
  60 */
  61static unsigned expand(const char *arg, char **buffer_p)
  62{
  63        char *buffer = *buffer_p;
  64        unsigned pos = 0;
  65        unsigned size = TR_BUFSIZ;
  66        unsigned i; /* can't be unsigned char: must be able to hold 256 */
  67        unsigned char ac;
  68
  69        while (*arg) {
  70                if (pos + ASCII > size) {
  71                        size += ASCII;
  72                        *buffer_p = buffer = xrealloc(buffer, size);
  73                }
  74                if (*arg == '\\') {
  75                        arg++;
  76                        buffer[pos++] = bb_process_escape_sequence(&arg);
  77                        continue;
  78                }
  79                if (arg[1] == '-') { /* "0-9..." */
  80                        ac = arg[2];
  81                        if (ac == '\0') { /* "0-": copy verbatim */
  82                                buffer[pos++] = *arg++; /* copy '0' */
  83                                continue; /* next iter will copy '-' and stop */
  84                        }
  85                        i = (unsigned char) *arg;
  86                        while (i <= ac) /* ok: i is unsigned _int_ */
  87                                buffer[pos++] = i++;
  88                        arg += 3; /* skip 0-9 */
  89                        continue;
  90                }
  91                if ((ENABLE_FEATURE_TR_CLASSES || ENABLE_FEATURE_TR_EQUIV)
  92                 && *arg == '['
  93                ) {
  94                        arg++;
  95                        i = (unsigned char) *arg++;
  96                        /* "[xyz...". i=x, arg points to y */
  97                        if (ENABLE_FEATURE_TR_CLASSES && i == ':') { /* [:class:] */
  98#define CLO ":]\0"
  99                                static const char classes[] ALIGN1 =
 100                                        "alpha"CLO "alnum"CLO "digit"CLO
 101                                        "lower"CLO "upper"CLO "space"CLO
 102                                        "blank"CLO "punct"CLO "cntrl"CLO
 103                                        "xdigit"CLO;
 104                                enum {
 105                                        CLASS_invalid = 0, /* we increment the retval */
 106                                        CLASS_alpha = 1,
 107                                        CLASS_alnum = 2,
 108                                        CLASS_digit = 3,
 109                                        CLASS_lower = 4,
 110                                        CLASS_upper = 5,
 111                                        CLASS_space = 6,
 112                                        CLASS_blank = 7,
 113                                        CLASS_punct = 8,
 114                                        CLASS_cntrl = 9,
 115                                        CLASS_xdigit = 10,
 116                                        //CLASS_graph = 11,
 117                                        //CLASS_print = 12,
 118                                };
 119                                smalluint j;
 120                                char *tmp;
 121
 122                                /* xdigit needs 8, not 7 */
 123                                i = 7 + (arg[0] == 'x');
 124                                tmp = xstrndup(arg, i);
 125                                j = index_in_strings(classes, tmp) + 1;
 126                                free(tmp);
 127
 128                                if (j == CLASS_invalid)
 129                                        goto skip_bracket;
 130
 131                                arg += i;
 132                                if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) {
 133                                        for (i = '0'; i <= '9'; i++)
 134                                                buffer[pos++] = i;
 135                                }
 136                                if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) {
 137                                        for (i = 'A'; i <= 'Z'; i++)
 138                                                buffer[pos++] = i;
 139                                }
 140                                if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) {
 141                                        for (i = 'a'; i <= 'z'; i++)
 142                                                buffer[pos++] = i;
 143                                }
 144                                if (j == CLASS_space || j == CLASS_blank) {
 145                                        buffer[pos++] = '\t';
 146                                        if (j == CLASS_space) {
 147                                                buffer[pos++] = '\n';
 148                                                buffer[pos++] = '\v';
 149                                                buffer[pos++] = '\f';
 150                                                buffer[pos++] = '\r';
 151                                        }
 152                                        buffer[pos++] = ' ';
 153                                }
 154                                if (j == CLASS_punct || j == CLASS_cntrl) {
 155                                        for (i = '\0'; i < ASCII; i++) {
 156                                                if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i))
 157                                                 || (j == CLASS_cntrl && iscntrl(i))
 158                                                ) {
 159                                                        buffer[pos++] = i;
 160                                                }
 161                                        }
 162                                }
 163                                if (j == CLASS_xdigit) {
 164                                        for (i = 'A'; i <= 'F'; i++) {
 165                                                buffer[pos + 6] = i | 0x20;
 166                                                buffer[pos++] = i;
 167                                        }
 168                                        pos += 6;
 169                                }
 170                                continue;
 171                        }
 172                        /* "[xyz...", i=x, arg points to y */
 173                        if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */
 174                                buffer[pos++] = *arg; /* copy CHAR */
 175                                if (!arg[0] || arg[1] != '=' || arg[2] != ']')
 176                                        bb_show_usage();
 177                                arg += 3;       /* skip CHAR=] */
 178                                continue;
 179                        }
 180                        /* The rest of "[xyz..." cases is treated as normal
 181                         * string, "[" has no special meaning here:
 182                         * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z",
 183                         * also try tr "[a-z]" "_A-Z+" and you'll see that
 184                         * [] is not special here.
 185                         */
 186 skip_bracket:
 187                        arg -= 2; /* points to "[" in "[xyz..." */
 188                }
 189                buffer[pos++] = *arg++;
 190        }
 191        return pos;
 192}
 193
 194/* NB: buffer is guaranteed to be at least TR_BUFSIZE
 195 * (which is >= ASCII) big.
 196 */
 197static int complement(char *buffer, int buffer_len)
 198{
 199        int len;
 200        char conv[ASCII];
 201        unsigned char ch;
 202
 203        len = 0;
 204        ch = '\0';
 205        while (1) {
 206                if (memchr(buffer, ch, buffer_len) == NULL)
 207                        conv[len++] = ch;
 208                if (++ch == '\0')
 209                        break;
 210        }
 211        memcpy(buffer, conv, len);
 212        return len;
 213}
 214
 215int tr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
 216int tr_main(int argc UNUSED_PARAM, char **argv)
 217{
 218        int i;
 219        smalluint opts;
 220        ssize_t read_chars;
 221        size_t in_index, out_index;
 222        unsigned last = UCHAR_MAX + 1; /* not equal to any char */
 223        unsigned char coded, c;
 224        char *str1 = xmalloc(TR_BUFSIZ);
 225        char *str2 = xmalloc(TR_BUFSIZ);
 226        int str2_length;
 227        int str1_length;
 228        char *vector = xzalloc(ASCII * 3);
 229        char *invec  = vector + ASCII;
 230        char *outvec = vector + ASCII * 2;
 231
 232#define TR_OPT_complement       (3 << 0)
 233#define TR_OPT_delete           (1 << 2)
 234#define TR_OPT_squeeze_reps     (1 << 3)
 235
 236        for (i = 0; i < ASCII; i++) {
 237                vector[i] = i;
 238                /*invec[i] = outvec[i] = FALSE; - done by xzalloc */
 239        }
 240
 241        /* -C/-c difference is that -C complements "characters",
 242         * and -c complements "values" (binary bytes I guess).
 243         * In POSIX locale, these are the same.
 244         */
 245
 246        opt_complementary = "-1";
 247        opts = getopt32(argv, "+Ccds"); /* '+': stop at first non-option */
 248        argv += optind;
 249
 250        str1_length = expand(*argv++, &str1);
 251        str2_length = 0;
 252        if (opts & TR_OPT_complement)
 253                str1_length = complement(str1, str1_length);
 254        if (*argv) {
 255                if (argv[0][0] == '\0')
 256                        bb_error_msg_and_die("STRING2 cannot be empty");
 257                str2_length = expand(*argv, &str2);
 258                map(vector, str1, str1_length,
 259                                str2, str2_length);
 260        }
 261        for (i = 0; i < str1_length; i++)
 262                invec[(unsigned char)(str1[i])] = TRUE;
 263        for (i = 0; i < str2_length; i++)
 264                outvec[(unsigned char)(str2[i])] = TRUE;
 265
 266        goto start_from;
 267
 268        /* In this loop, str1 space is reused as input buffer,
 269         * str2 - as output one. */
 270        for (;;) {
 271                /* If we're out of input, flush output and read more input. */
 272                if ((ssize_t)in_index == read_chars) {
 273                        if (out_index) {
 274                                xwrite(STDOUT_FILENO, str2, out_index);
 275 start_from:
 276                                out_index = 0;
 277                        }
 278                        read_chars = safe_read(STDIN_FILENO, str1, TR_BUFSIZ);
 279                        if (read_chars <= 0) {
 280                                if (read_chars < 0)
 281                                        bb_perror_msg_and_die(bb_msg_read_error);
 282                                break;
 283                        }
 284                        in_index = 0;
 285                }
 286                c = str1[in_index++];
 287                if ((opts & TR_OPT_delete) && invec[c])
 288                        continue;
 289                coded = vector[c];
 290                if ((opts & TR_OPT_squeeze_reps) && last == coded
 291                 && (invec[c] || outvec[coded])
 292                ) {
 293                        continue;
 294                }
 295                str2[out_index++] = last = coded;
 296        }
 297
 298        return EXIT_SUCCESS;
 299}
 300