busybox/coreutils/wc.c
<<
>>
Prefs
/* vi: set sw=4 ts=4: */
/*
 * wc implementation for busybox
 *
 * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
   9
/* BB_AUDIT SUSv3 compliant. */
/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
  12
/* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
 *
 * Rewritten to fix a number of problems and do some size optimizations.
 * Problems in the previous busybox implementation (besides bloat) included:
 *  1) broken 'wc -c' optimization (read note below)
 *  2) broken handling of '-' args
 *  3) no checking of ferror on EOF returns
 *  4) isprint() wasn't considered when word counting.
 *
 * NOTES:
 *
 * The previous busybox wc attempted an optimization using stat for the
 * case of counting chars only.  I omitted that because it was broken.
 * It didn't take into account the possibility of input coming from a
 * pipe, or input from a file with file pointer not at the beginning.
 *
 * To implement such a speed optimization correctly, not only do you
 * need the size, but also the file position.  Note also that the
 * file position may be past the end of file.  Consider the example
 * (adapted from example in GNU wc.c)
 *
 *      echo hello > /tmp/testfile &&
 *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
 *
 * for which 'wc -c' should output '0'.
 */
  39#include "libbb.h"
  40#include "unicode.h"
  41
  42#if !ENABLE_LOCALE_SUPPORT
  43# undef isprint
  44# undef isspace
  45# define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20))
  46# define isspace(c) ((c) == ' ')
  47#endif
  48
  49#if ENABLE_FEATURE_WC_LARGE
  50# define COUNT_T unsigned long long
  51# define COUNT_FMT "llu"
  52#else
  53# define COUNT_T unsigned
  54# define COUNT_FMT "u"
  55#endif
  56
/* We support -m even when UNICODE_SUPPORT is off,
 * we just don't advertise it in help text,
 * since it is the same as -c in this case.
 */
  61
//usage:#define wc_trivial_usage
//usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
//usage:
//usage:#define wc_full_usage "\n\n"
//usage:       "Count lines, words, and bytes for each FILE (or stdin)\n"
//usage:     "\n        -c      Count bytes"
//usage:        IF_UNICODE_SUPPORT(
//usage:     "\n        -m      Count characters"
//usage:        )
//usage:     "\n        -l      Count newlines"
//usage:     "\n        -w      Count words"
//usage:     "\n        -L      Print longest line length"
//usage:
//usage:#define wc_example_usage
//usage:       "$ wc /etc/passwd\n"
//usage:       "     31      46    1365 /etc/passwd\n"
  78
  79/* Order is important if we want to be compatible with
  80 * column order in "wc -cmlwL" output:
  81 */
  82enum {
  83        WC_LINES    = 0, /* -l */
  84        WC_WORDS    = 1, /* -w */
  85        WC_UNICHARS = 2, /* -m */
  86        WC_BYTES    = 3, /* -c */
  87        WC_LENGTH   = 4, /* -L */
  88        NUM_WCS     = 5,
  89};
  90
  91int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
  92int wc_main(int argc UNUSED_PARAM, char **argv)
  93{
  94        const char *arg;
  95        const char *start_fmt = " %9"COUNT_FMT + 1;
  96        const char *fname_fmt = " %s\n";
  97        COUNT_T *pcounts;
  98        COUNT_T counts[NUM_WCS];
  99        COUNT_T totals[NUM_WCS];
 100        int num_files;
 101        smallint status = EXIT_SUCCESS;
 102        unsigned print_type;
 103
 104        init_unicode();
 105
 106        print_type = getopt32(argv, "lwmcL");
 107
 108        if (print_type == 0) {
 109                print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES);
 110        }
 111
 112        argv += optind;
 113        if (!argv[0]) {
 114                *--argv = (char *) bb_msg_standard_input;
 115                fname_fmt = "\n";
 116        }
 117        if (!argv[1]) { /* zero or one filename? */
 118                if (!((print_type-1) & print_type)) /* exactly one option? */
 119                        start_fmt = "%"COUNT_FMT;
 120        }
 121
 122        memset(totals, 0, sizeof(totals));
 123
 124        pcounts = counts;
 125
 126        num_files = 0;
 127        while ((arg = *argv++) != NULL) {
 128                FILE *fp;
 129                const char *s;
 130                unsigned u;
 131                unsigned linepos;
 132                smallint in_word;
 133
 134                ++num_files;
 135                fp = fopen_or_warn_stdin(arg);
 136                if (!fp) {
 137                        status = EXIT_FAILURE;
 138                        continue;
 139                }
 140
 141                memset(counts, 0, sizeof(counts));
 142                linepos = 0;
 143                in_word = 0;
 144
 145                while (1) {
 146                        int c;
 147                        /* Our -w doesn't match GNU wc exactly... oh well */
 148
 149                        c = getc(fp);
 150                        if (c == EOF) {
 151                                if (ferror(fp)) {
 152                                        bb_simple_perror_msg(arg);
 153                                        status = EXIT_FAILURE;
 154                                }
 155                                goto DO_EOF;  /* Treat an EOF as '\r'. */
 156                        }
 157
 158                        /* Cater for -c and -m */
 159                        ++counts[WC_BYTES];
 160                        if (unicode_status != UNICODE_ON /* every byte is a new char */
 161                         || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
 162                        ) {
 163                                ++counts[WC_UNICHARS];
 164                        }
 165
 166                        if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
 167                                ++linepos;
 168                                if (!isspace(c)) {
 169                                        in_word = 1;
 170                                        continue;
 171                                }
 172                        } else if ((unsigned)(c - 9) <= 4) {
 173                                /* \t  9
 174                                 * \n 10
 175                                 * \v 11
 176                                 * \f 12
 177                                 * \r 13
 178                                 */
 179                                if (c == '\t') {
 180                                        linepos = (linepos | 7) + 1;
 181                                } else {  /* '\n', '\r', '\f', or '\v' */
 182 DO_EOF:
 183                                        if (linepos > counts[WC_LENGTH]) {
 184                                                counts[WC_LENGTH] = linepos;
 185                                        }
 186                                        if (c == '\n') {
 187                                                ++counts[WC_LINES];
 188                                        }
 189                                        if (c != '\v') {
 190                                                linepos = 0;
 191                                        }
 192                                }
 193                        } else {
 194                                continue;
 195                        }
 196
 197                        counts[WC_WORDS] += in_word;
 198                        in_word = 0;
 199                        if (c == EOF) {
 200                                break;
 201                        }
 202                }
 203
 204                fclose_if_not_stdin(fp);
 205
 206                if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
 207                        totals[WC_LENGTH] = counts[WC_LENGTH];
 208                }
 209                totals[WC_LENGTH] -= counts[WC_LENGTH];
 210
 211 OUTPUT:
 212                /* coreutils wc tries hard to print pretty columns
 213                 * (saves results for all files, finds max col len etc...)
 214                 * we won't try that hard, it will bloat us too much */
 215                s = start_fmt;
 216                u = 0;
 217                do {
 218                        if (print_type & (1 << u)) {
 219                                printf(s, pcounts[u]);
 220                                s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
 221                        }
 222                        totals[u] += pcounts[u];
 223                } while (++u < NUM_WCS);
 224                printf(fname_fmt, arg);
 225        }
 226
 227        /* If more than one file was processed, we want the totals.  To save some
 228         * space, we set the pcounts ptr to the totals array.  This has the side
 229         * effect of trashing the totals array after outputting it, but that's
 230         * irrelavent since we no longer need it. */
 231        if (num_files > 1) {
 232                num_files = 0;  /* Make sure we don't get here again. */
 233                arg = "total";
 234                pcounts = totals;
 235                --argv;
 236                goto OUTPUT;
 237        }
 238
 239        fflush_stdout_and_exit(status);
 240}
 241