busybox/coreutils/wc.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * wc implementation for busybox
   4 *
   5 * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
   6 *
   7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8 */
   9/* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
  10 *
  11 * Rewritten to fix a number of problems and do some size optimizations.
  12 * Problems in the previous busybox implementation (besides bloat) included:
  13 *  1) broken 'wc -c' optimization (read note below)
  14 *  2) broken handling of '-' args
  15 *  3) no checking of ferror on EOF returns
  16 *  4) isprint() wasn't considered when word counting.
  17 *
  18 * NOTES:
  19 *
  20 * The previous busybox wc attempted an optimization using stat for the
  21 * case of counting chars only.  I omitted that because it was broken.
  22 * It didn't take into account the possibility of input coming from a
  23 * pipe, or input from a file with file pointer not at the beginning.
  24 *
  25 * To implement such a speed optimization correctly, not only do you
  26 * need the size, but also the file position.  Note also that the
  27 * file position may be past the end of file.  Consider the example
  28 * (adapted from example in gnu wc.c)
  29 *
  30 *      echo hello > /tmp/testfile &&
  31 *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
  32 *
  33 * for which 'wc -c' should output '0'.
  34 */
  35//config:config WC
  36//config:       bool "wc (4.5 kb)"
  37//config:       default y
  38//config:       help
  39//config:       wc is used to print the number of bytes, words, and lines,
  40//config:       in specified files.
  41//config:
  42//config:config FEATURE_WC_LARGE
  43//config:       bool "Support very large counts"
  44//config:       default y
  45//config:       depends on WC
  46//config:       help
  47//config:       Use "unsigned long long" for counter variables.
  48
  49//applet:IF_WC(APPLET(wc, BB_DIR_USR_BIN, BB_SUID_DROP))
  50
  51//kbuild:lib-$(CONFIG_WC) += wc.o
  52
  53/* BB_AUDIT SUSv3 compliant. */
  54/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
  55
  56#include "libbb.h"
  57#include "unicode.h"
  58
   59#if !ENABLE_LOCALE_SUPPORT /* no locale support: replace the ctype classifiers with fixed ASCII-only tests */
   60# undef isprint
   61# undef isspace
   62# define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20)) /* true for 0x20..0x7e; after the unsigned cast, values below 0x20 wrap around and fail the test */
   63# define isspace(c) ((c) == ' ') /* only the space character itself counts as whitespace */
   64#endif
  65
   66#if ENABLE_FEATURE_WC_LARGE /* build-time choice of counter width */
   67# define COUNT_T unsigned long long /* type of every counter in counts[] and totals[] */
   68# define COUNT_FMT "llu" /* printf conversion for COUNT_T; it is pasted into the output format strings, so it must always match the type */
   69#else
   70# define COUNT_T unsigned
   71# define COUNT_FMT "u"
   72#endif
  73
  74/* We support -m even when UNICODE_SUPPORT is off,
  75 * we just don't advertise it in help text,
  76 * since it is the same as -c in this case.
  77 */
  78
  79//usage:#define wc_trivial_usage
  80//usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
  81//usage:
  82//usage:#define wc_full_usage "\n\n"
  83//usage:       "Count lines, words, and bytes for FILEs (or stdin)\n"
  84//usage:     "\n        -c      Count bytes"
  85//usage:        IF_UNICODE_SUPPORT(
  86//usage:     "\n        -m      Count characters"
  87//usage:        )
  88//usage:     "\n        -l      Count newlines"
  89//usage:     "\n        -w      Count words"
  90//usage:     "\n        -L      Print longest line length"
  91//usage:
  92//usage:#define wc_example_usage
  93//usage:       "$ wc /etc/passwd\n"
  94//usage:       "     31      46    1365 /etc/passwd\n"
  95
  96/* Order is important if we want to be compatible with
  97 * column order in "wc -cmlwL" output:
  98 */
   99enum { /* counter indices into counts[]/totals[]; wc_main also uses each value as a bit position (1 << WC_xxx) in its option mask */
  100        WC_LINES    = 0, /* -l: newline count */
  101        WC_WORDS    = 1, /* -w: word count */
  102        WC_UNICHARS = 2, /* -m: character count */
  103        WC_BYTES    = 3, /* -c: byte count */
  104        WC_LENGTH   = 4, /* -L: longest line length */
  105        NUM_WCS     = 5, /* number of counters; dimension of counts[] and totals[] */
  106};
 107
  108int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
     /*
      * wc applet entry point.
      *
      * Parses the -l/-w/-m/-c/-L flags, then reads every remaining argument
      * (or a single standard-input entry when there is none) one byte at a
      * time, counting newlines, words, characters, bytes and the longest
      * line.  One row is printed for each input that could be opened, and a
      * "total" row follows when more than one argument was given.
      *
      * argc is unused; argv is the argument vector passed to getopt32().
      * The function finishes by handing EXIT_SUCCESS, or EXIT_FAILURE if any
      * input failed to open or hit a read error, to fflush_stdout_and_exit().
      * NOTE(review): that helper presumably never returns (no return
      * statement follows it) - confirm in libbb.
      */
  109int wc_main(int argc UNUSED_PARAM, char **argv)
  110{
  111        const char *arg; /* current input name; also printed as the row label */
  112        const char *start_fmt = " %9"COUNT_FMT + 1; /* "+ 1" skips the leading space, so the first column has no separator in front */
  113        const char *fname_fmt = " %s\n";
  114        COUNT_T *pcounts; /* array printed by the OUTPUT code: counts[] per input, totals[] for the final row */
  115        COUNT_T counts[NUM_WCS];
  116        COUNT_T totals[NUM_WCS];
  117        int num_files;
  118        smallint status = EXIT_SUCCESS;
  119        unsigned print_type; /* bit (1 << WC_xxx) is set for every column to print */
  120
  121        init_unicode();
  122
  123        print_type = getopt32(argv, "lwmcL"); /* NOTE(review): presumably bit N corresponds to the Nth letter of the option string, matching the WC_xxx order - confirm in libbb */
  124
  125        if (print_type == 0) { /* no flags given: default to lines, words and bytes */
  126                print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES);
  127        }
  128
  129        argv += optind;
  130        if (!argv[0]) { /* no file operands: fabricate a single standard-input entry */
  131                *--argv = (char *) bb_msg_standard_input; /* NOTE(review): reuses the argv slot just before the terminator; presumably its old content is no longer needed */
  132                fname_fmt = "\n"; /* print the counts only, without a name */
  133        }
  134        if (!argv[1]) { /* zero or one filename? */
  135                if (!((print_type-1) & print_type)) /* exactly one option? (x-1)&x is 0 when a single bit is set; print_type is non-zero here */
  136                        start_fmt = "%"COUNT_FMT; /* lone number: no padding at all */
  137        }
  138
  139        memset(totals, 0, sizeof(totals));
  140
  141        pcounts = counts;
  142
  143        num_files = 0;
  144        while ((arg = *argv++) != NULL) {
  145                FILE *fp;
  146                const char *s;
  147                unsigned u;
  148                unsigned linepos; /* column reached on the current line (feeds WC_LENGTH) */
  149                smallint in_word; /* 1 while inside a run of printable non-space characters */
  150
  151                ++num_files; /* counts every argument, including ones that fail to open */
  152                fp = fopen_or_warn_stdin(arg);
  153                if (!fp) {
  154                        status = EXIT_FAILURE;
  155                        continue; /* no row is printed for this argument */
  156                }
  157
  158                memset(counts, 0, sizeof(counts));
  159                linepos = 0;
  160                in_word = 0;
  161
  162                while (1) {
  163                        int c;
  164                        /* Our -w doesn't match GNU wc exactly... oh well */
  165
  166                        c = getc(fp);
  167                        if (c == EOF) {
  168                                if (ferror(fp)) {
  169                                        bb_simple_perror_msg(arg);
  170                                        status = EXIT_FAILURE;
  171                                }
  172                                goto DO_EOF;  /* Treat an EOF as '\r': record the line length and any pending word; the c == EOF test at the bottom then leaves the loop. */
  173                        }
  174
  175                        /* Cater for -c and -m */
  176                        ++counts[WC_BYTES];
  177                        if (unicode_status != UNICODE_ON /* every byte is a new char */
  178                         || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
  179                        ) {
  180                                ++counts[WC_UNICHARS];
  181                        }
  182
  183                        if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
  184                                ++linepos;
  185                                if (!isspace(c)) {
  186                                        in_word = 1;
  187                                        continue; /* still inside a word: skip the end-of-word bookkeeping below */
  188                                }
  189                        } else if ((unsigned)(c - 9) <= 4) { /* a printable whitespace character falls out of the branch above and ends the word */
  190                                /* \t  9
  191                                 * \n 10
  192                                 * \v 11
  193                                 * \f 12
  194                                 * \r 13
  195                                 */
  196                                if (c == '\t') {
  197                                        linepos = (linepos | 7) + 1; /* advance to the next multiple of 8 */
  198                                } else {  /* '\n', '\r', '\f', or '\v' */
  199 DO_EOF:
  200                                        if (linepos > counts[WC_LENGTH]) {
  201                                                counts[WC_LENGTH] = linepos;
  202                                        }
  203                                        if (c == '\n') { /* only a real newline counts as a line; EOF arriving via the goto does not */
  204                                                ++counts[WC_LINES];
  205                                        }
  206                                        if (c != '\v') { /* \v keeps the current column; \n, \f, \r and EOF reset it */
  207                                                linepos = 0;
  208                                        }
  209                                }
  210                        } else {
  211                                continue; /* any other byte was counted above but neither moves the column nor ends a word */
  212                        }
  213
  214                        counts[WC_WORDS] += in_word; /* adds 1 only when a word was in progress */
  215                        in_word = 0;
  216                        if (c == EOF) {
  217                                break;
  218                        }
  219                }
  220
  221                fclose_if_not_stdin(fp);
  222
  223                if (totals[WC_LENGTH] < counts[WC_LENGTH]) { /* WC_LENGTH is a maximum across inputs, not a sum... */
  224                        totals[WC_LENGTH] = counts[WC_LENGTH];
  225                }
  226                totals[WC_LENGTH] -= counts[WC_LENGTH]; /* ...so pre-subtract what the generic "totals[u] += pcounts[u]" below adds back */
  227
  228 OUTPUT: /* also entered by the goto after the loop, with pcounts == totals and arg == "total" */
  229                /* coreutils wc tries hard to print pretty columns
  230                 * (saves results for all files, finds max col len etc...)
  231                 * we won't try that hard, it will bloat us too much */
  232                s = start_fmt;
  233                u = 0;
  234                do {
  235                        if (print_type & (1 << u)) { /* columns appear in WC_xxx index order, whatever the flag order was */
  236                                printf(s, pcounts[u]);
  237                                s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
  238                        }
  239                        totals[u] += pcounts[u]; /* accumulate; on the "total" pass this doubles totals[], which no longer matters */
  240                } while (++u < NUM_WCS);
  241                printf(fname_fmt, arg);
  242        }
  243
  244        /* If more than one file was processed, we want the totals.  To save some
  245         * space, we set the pcounts ptr to the totals array.  This has the side
  246         * effect of trashing the totals array after outputting it, but that's
  247         * irrelevant since we no longer need it. */
  248        if (num_files > 1) {
  249                num_files = 0;  /* Make sure we don't get here again. */
  250                arg = "total";
  251                pcounts = totals;
  252                --argv; /* the loop's *argv++ stepped past the NULL terminator; step back so the while condition reads NULL again after the OUTPUT pass */
  253                goto OUTPUT;
  254        }
  255
  256        fflush_stdout_and_exit(status);
  257}
 258