busybox/coreutils/shuf.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * shuf: Write a random permutation of the input lines to standard output.
   4 *
   5 * Copyright (C) 2014 by Bartosz Golaszewski <bartekgola@gmail.com>
   6 *
   7 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
   8 */
   9//config:config SHUF
  10//config:       bool "shuf (5.4 kb)"
  11//config:       default y
  12//config:       help
  13//config:       Generate random permutations
  14
  15//applet:IF_SHUF(APPLET_NOEXEC(shuf, shuf, BB_DIR_USR_BIN, BB_SUID_DROP, shuf))
  16
  17//kbuild:lib-$(CONFIG_SHUF) += shuf.o
  18
  19//usage:#define shuf_trivial_usage
  20//usage:       "[-n NUM] [-o FILE] [-z] [FILE | -e [ARG...] | -i L-H]"
  21//usage:#define shuf_full_usage "\n\n"
  22//usage:       "Randomly permute lines\n"
  23//usage:     "\n        -n NUM  Output at most NUM lines"
  24//usage:     "\n        -o FILE Write to FILE, not standard output"
  25//usage:     "\n        -z      NUL terminated output"
  26//usage:     "\n        -e      Treat ARGs as lines"
  27//usage:     "\n        -i L-H  Treat numbers L-H as lines"
  28
  29#include "libbb.h"
  30
  31/* This is a NOEXEC applet. Be very careful! */
  32
  33#define OPT_e           (1 << 0)
  34#define OPT_i           (1 << 1)
  35#define OPT_n           (1 << 2)
  36#define OPT_o           (1 << 3)
  37#define OPT_z           (1 << 4)
  38#define OPT_STR         "ei:n:o:z"
  39
  40/*
  41 * Use the Fisher-Yates shuffle algorithm on an array of lines.
  42 * If the required number of output lines is less than the total
  43 * we can stop shuffling early.
  44 */
  45static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
  46{
  47        srand(monotonic_us());
  48
  49        while (outlines != 0) {
  50                char *tmp;
  51                unsigned r = rand();
  52                /* RAND_MAX can be as small as 32767 */
  53                if (numlines > RAND_MAX)
  54                        r ^= rand() << 15;
  55                r %= numlines;
  56//TODO: the above method is seriously non-uniform when numlines is very large.
  57//For example, with numlines of   0xf0000000,
  58//values of (r % numlines) in [0, 0x0fffffff] range
  59//are more likely: e.g. r=1 and r=0xf0000001 both map to 1,
  60//whereas only one value, r=0xefffffff, maps to 0xefffffff.
  61                numlines--;
  62                tmp = lines[numlines];
  63                lines[numlines] = lines[r];
  64                lines[r] = tmp;
  65                outlines--;
  66        }
  67}
  68
  69int shuf_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
  70int shuf_main(int argc, char **argv)
  71{
  72        unsigned opts;
  73        char *opt_i_str, *opt_n_str, *opt_o_str;
  74        char **lines;
  75        unsigned long long lo = lo;
  76        unsigned numlines, outlines;
  77        unsigned i;
  78        char eol;
  79
  80        opts = getopt32(argv, "^"
  81                        OPT_STR
  82                        "\0" "e--i:i--e"/* mutually exclusive */,
  83                        &opt_i_str, &opt_n_str, &opt_o_str
  84        );
  85
  86        argc -= optind;
  87        argv += optind;
  88
  89        /* Prepare lines for shuffling - either: */
  90        if (opts & OPT_e) {
  91                /* make lines from command-line arguments */
  92                numlines = argc;
  93                lines = argv;
  94        } else
  95        if (opts & OPT_i) {
  96                /* create a range of numbers */
  97                unsigned long long hi;
  98                char *dash;
  99
 100                if (argv[0])
 101                        bb_show_usage();
 102
 103                dash = strchr(opt_i_str, '-');
 104                if (!dash) {
 105                        bb_error_msg_and_die("bad range '%s'", opt_i_str);
 106                }
 107                *dash = '\0';
 108                lo = xatoull(opt_i_str);
 109                hi = xatoull(dash + 1);
 110                *dash = '-';
 111                if (hi < lo)
 112                        bb_error_msg_and_die("bad range '%s'", opt_i_str);
 113                hi -= lo;
 114                if (sizeof(size_t) > sizeof(numlines)) {
 115                        if (hi >= UINT_MAX)
 116                                bb_error_msg_and_die("bad range '%s'", opt_i_str);
 117                } else {
 118                        if (hi >= UINT_MAX / sizeof(lines[0]))
 119                                bb_error_msg_and_die("bad range '%s'", opt_i_str);
 120                }
 121
 122                numlines = hi + 1;
 123                lines = xmalloc((size_t)numlines * sizeof(lines[0]));
 124                for (i = 0; i < numlines; i++) {
 125                        lines[i] = (char*)(uintptr_t)i;
 126                }
 127        } else {
 128                /* default - read lines from stdin or the input file */
 129                FILE *fp;
 130                const char *fname = "-";
 131
 132                if (argv[0]) {
 133                        if (argv[1])
 134                                bb_show_usage();
 135                        fname = argv[0];
 136                }
 137
 138                fp = xfopen_stdin(fname);
 139                lines = NULL;
 140                numlines = 0;
 141                for (;;) {
 142                        char *line = xmalloc_fgetline(fp);
 143                        if (!line)
 144                                break;
 145                        lines = xrealloc_vector(lines, 6, numlines);
 146                        lines[numlines++] = line;
 147                }
 148                fclose_if_not_stdin(fp);
 149        }
 150
 151        outlines = numlines;
 152        if (opts & OPT_n) {
 153                outlines = xatou(opt_n_str);
 154                if (outlines > numlines)
 155                        outlines = numlines;
 156        }
 157
 158        shuffle_lines(lines, numlines, outlines);
 159
 160        if (opts & OPT_o)
 161                xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO);
 162
 163        eol = '\n';
 164        if (opts & OPT_z)
 165                eol = '\0';
 166
 167        for (i = numlines - outlines; i < numlines; i++) {
 168                if (opts & OPT_i)
 169                        printf("%llu%c", lo + (uintptr_t)lines[i], eol);
 170                else
 171                        printf("%s%c", lines[i], eol);
 172        }
 173
 174        fflush_stdout_and_exit(EXIT_SUCCESS);
 175}
 176