busybox/editors/diff.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * Mini diff implementation for busybox, adapted from OpenBSD diff.
   4 *
   5 * Copyright (C) 2010 by Matheus Izvekov <mizvekov@gmail.com>
   6 * Copyright (C) 2006 by Robert Sullivan <cogito.ergo.cogito@hotmail.com>
   7 * Copyright (c) 2003 Todd C. Miller <Todd.Miller@courtesan.com>
   8 *
   9 * Sponsored in part by the Defense Advanced Research Projects
  10 * Agency (DARPA) and Air Force Research Laboratory, Air Force
  11 * Materiel Command, USAF, under agreement number F39502-99-1-0512.
  12 *
  13 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
  14 */
  15/*
  16 * The following code uses an algorithm due to Harold Stone,
  17 * which finds a pair of longest identical subsequences in
  18 * the two files.
  19 *
  20 * The major goal is to generate the match vector J.
  21 * J[i] is the index of the line in file1 corresponding
  22 * to line i in file0. J[i] = 0 if there is no
  23 * such line in file1.
  24 *
  25 * Lines are hashed so as to work in core. All potential
  26 * matches are located by sorting the lines of each file
  27 * on the hash (called "value"). In particular, this
  28 * collects the equivalence classes in file1 together.
  29 * Subroutine equiv replaces the value of each line in
  30 * file0 by the index of the first element of its
  31 * matching equivalence in (the reordered) file1.
  32 * To save space equiv squeezes file1 into a single
  33 * array member in which the equivalence classes
  34 * are simply concatenated, except that their first
  35 * members are flagged by changing sign.
  36 *
  37 * Next the indices that point into member are unsorted into
  38 * array class according to the original order of file0.
  39 *
  40 * The cleverness lies in routine stone. This marches
  41 * through the lines of file0, developing a vector klist
  42 * of "k-candidates". At step i a k-candidate is a matched
  43 * pair of lines x,y (x in file0, y in file1) such that
  44 * there is a common subsequence of length k
  45 * between the first i lines of file0 and the first y
  46 * lines of file1, but there is no such subsequence for
  47 * any smaller y. x is the earliest possible mate to y
  48 * that occurs in such a subsequence.
  49 *
  50 * Whenever any of the members of the equivalence class of
  51 * lines in file1 matable to a line in file0 has serial number
  52 * less than the y of some k-candidate, that k-candidate
  53 * with the smallest such y is replaced. The new
  54 * k-candidate is chained (via pred) to the current
  55 * k-1 candidate so that the actual subsequence can
  56 * be recovered. When a member has serial number greater
  57 * that the y of all k-candidates, the klist is extended.
  58 * At the end, the longest subsequence is pulled out
  59 * and placed in the array J by unravel
  60 *
  61 * With J in hand, the matches there recorded are
  62 * checked against reality to assure that no spurious
  63 * matches have crept in due to hashing. If they have,
  64 * they are broken, and "jackpot" is recorded--a harmless
  65 * matter except that a true match for a spuriously
  66 * mated line may now be unnecessarily reported as a change.
  67 *
  68 * Much of the complexity of the program comes simply
  69 * from trying to minimize core utilization and
  70 * maximize the range of doable problems by dynamically
  71 * allocating what is needed and reusing what is not.
  72 * The core requirements for problems larger than somewhat
  73 * are (in words) 2*length(file0) + length(file1) +
  74 * 3*(number of k-candidates installed), typically about
  75 * 6n words for files of length n.
  76 */
  77//config:config DIFF
  78//config:       bool "diff (13 kb)"
  79//config:       default y
  80//config:       help
  81//config:       diff compares two files or directories and outputs the
  82//config:       differences between them in a form that can be given to
  83//config:       the patch command.
  84//config:
  85//config:config FEATURE_DIFF_LONG_OPTIONS
  86//config:       bool "Enable long options"
  87//config:       default y
  88//config:       depends on DIFF && LONG_OPTS
  89//config:
  90//config:config FEATURE_DIFF_DIR
  91//config:       bool "Enable directory support"
  92//config:       default y
  93//config:       depends on DIFF
  94//config:       help
  95//config:       This option enables support for directory and subdirectory
  96//config:       comparison.
  97
  98//applet:IF_DIFF(APPLET(diff, BB_DIR_USR_BIN, BB_SUID_DROP))
  99
 100//kbuild:lib-$(CONFIG_DIFF) += diff.o
 101
 102//usage:#define diff_trivial_usage
 103//usage:       "[-abBdiNqrTstw] [-L LABEL] [-S FILE] [-U LINES] FILE1 FILE2"
 104//usage:#define diff_full_usage "\n\n"
 105//usage:       "Compare files line by line and output the differences between them.\n"
 106//usage:       "This implementation supports unified diffs only.\n"
 107//usage:     "\n        -a      Treat all files as text"
 108//usage:     "\n        -b      Ignore changes in the amount of whitespace"
 109//usage:     "\n        -B      Ignore changes whose lines are all blank"
 110//usage:     "\n        -d      Try hard to find a smaller set of changes"
 111//usage:     "\n        -i      Ignore case differences"
 112//usage:     "\n        -L      Use LABEL instead of the filename in the unified header"
 113//usage:     "\n        -N      Treat absent files as empty"
 114//usage:     "\n        -q      Output only whether files differ"
 115//usage:     "\n        -r      Recurse"
 116//usage:     "\n        -S      Start with FILE when comparing directories"
 117//usage:     "\n        -T      Make tabs line up by prefixing a tab when necessary"
 118//usage:     "\n        -s      Report when two files are the same"
 119//usage:     "\n        -t      Expand tabs to spaces in output"
 120//usage:     "\n        -U      Output LINES lines of context"
 121//usage:     "\n        -w      Ignore all whitespace"
 122
 123#include "libbb.h"
 124#include "common_bufsiz.h"
 125
 126#if 0
 127# define dbg_error_msg(...) bb_error_msg(__VA_ARGS__)
 128#else
 129# define dbg_error_msg(...) ((void)0)
 130#endif
 131
 132enum {                  /* print_status() and diffreg() return values */
 133        STATUS_SAME,    /* files are the same */
 134        STATUS_DIFFER,  /* files differ */
 135        STATUS_BINARY,  /* binary files differ */
 136};
 137
 138enum {                  /* Commandline flags */
 139        FLAG_a,
 140        FLAG_b,
 141        FLAG_d,
 142        FLAG_i,
 143        FLAG_L,         /* never used, handled by getopt32 */
 144        FLAG_N,
 145        FLAG_q,
 146        FLAG_r,
 147        FLAG_s,
 148        FLAG_S,         /* never used, handled by getopt32 */
 149        FLAG_t,
 150        FLAG_T,
 151        FLAG_U,         /* never used, handled by getopt32 */
 152        FLAG_w,
 153        FLAG_u,         /* ignored, this is the default */
 154        FLAG_p,         /* not implemented */
 155        FLAG_B,
 156        FLAG_E,         /* not implemented */
 157};
 158#define FLAG(x) (1 << FLAG_##x)
 159
 160/* We cache file position to avoid excessive seeking */
 161typedef struct FILE_and_pos_t {
 162        FILE *ft_fp;
 163        off_t ft_pos;
 164} FILE_and_pos_t;
 165
 166struct globals {
 167        smallint exit_status;
 168        int opt_U_context;
 169        const char *other_dir;
 170        char *label[2];
 171        struct stat stb[2];
 172};
 173#define G (*ptr_to_globals)
 174#define exit_status        (G.exit_status       )
 175#define opt_U_context      (G.opt_U_context     )
 176#define label              (G.label             )
 177#define stb                (G.stb               )
 178#define INIT_G() do { \
 179        SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
 180        opt_U_context = 3; \
 181} while (0)
 182
 183typedef int token_t;
 184
 185enum {
 186        /* Public */
 187        TOK_EMPTY = 1 << 9,  /* Line fully processed, you can proceed to the next */
 188        TOK_EOF   = 1 << 10, /* File ended */
 189        /* Private (Only to be used by read_token() */
 190        TOK_EOL   = 1 << 11, /* we saw EOL (sticky) */
 191        TOK_SPACE = 1 << 12, /* used -b code, means we are skipping spaces */
 192        SHIFT_EOF = (sizeof(token_t)*8 - 8) - 1,
 193        CHAR_MASK = 0x1ff,   /* 8th bit is used to distinguish EOF from 0xff */
 194};
 195
 196/* Restores full EOF from one 8th bit: */
 197//#define TOK2CHAR(t) (((t) << SHIFT_EOF) >> SHIFT_EOF)
 198/* We don't really need the above, we only need to have EOF != any_real_char: */
 199#define TOK2CHAR(t) ((t) & CHAR_MASK)
 200
 201static void seek_ft(FILE_and_pos_t *ft, off_t pos)
 202{
 203        if (ft->ft_pos != pos) {
 204                ft->ft_pos = pos;
 205                fseeko(ft->ft_fp, pos, SEEK_SET);
 206        }
 207}
 208
 209/* Reads tokens from given fp, handling -b and -w flags
 210 * The user must reset tok every line start
 211 */
 212static int read_token(FILE_and_pos_t *ft, token_t tok)
 213{
 214        tok |= TOK_EMPTY;
 215        while (!(tok & TOK_EOL)) {
 216                bool is_space;
 217                int t;
 218
 219                t = fgetc(ft->ft_fp);
 220                if (t != EOF)
 221                        ft->ft_pos++;
 222                is_space = (t == EOF || isspace(t));
 223
 224                /* If t == EOF (-1), set both TOK_EOF and TOK_EOL */
 225                tok |= (t & (TOK_EOF + TOK_EOL));
 226                /* Only EOL? */
 227                if (t == '\n')
 228                        tok |= TOK_EOL;
 229
 230                if (option_mask32 & FLAG(i)) /* Handcoded tolower() */
 231                        t = (t >= 'A' && t <= 'Z') ? t - ('A' - 'a') : t;
 232
 233                if ((option_mask32 & FLAG(w)) && is_space)
 234                        continue;
 235
 236                /* Trim char value to low 9 bits */
 237                t &= CHAR_MASK;
 238
 239                if (option_mask32 & FLAG(b)) {
 240                        /* Was prev char whitespace? */
 241                        if (tok & TOK_SPACE) { /* yes */
 242                                if (is_space) /* this one too, ignore it */
 243                                        continue;
 244                                tok &= ~TOK_SPACE;
 245                        } else if (is_space) {
 246                                /* 1st whitespace char.
 247                                 * Set TOK_SPACE and replace char by ' ' */
 248                                t = TOK_SPACE + ' ';
 249                        }
 250                }
 251                /* Clear EMPTY */
 252                tok &= ~(TOK_EMPTY + CHAR_MASK);
 253                /* Assign char value (low 9 bits) and maybe set TOK_SPACE */
 254                tok |= t;
 255                break;
 256        }
 257#if 0
 258        bb_error_msg("fp:%p tok:%x '%c'%s%s%s%s", fp, tok, tok & 0xff
 259                , tok & TOK_EOF ? " EOF" : ""
 260                , tok & TOK_EOL ? " EOL" : ""
 261                , tok & TOK_EMPTY ? " EMPTY" : ""
 262                , tok & TOK_SPACE ? " SPACE" : ""
 263        );
 264#endif
 265        return tok;
 266}
 267
 268struct cand {
 269        int x;
 270        int y;
 271        int pred;
 272};
 273
 274static int search(const int *c, int k, int y, const struct cand *list)
 275{
 276        int i, j;
 277
 278        if (list[c[k]].y < y)  /* quick look for typical case */
 279                return k + 1;
 280
 281        for (i = 0, j = k + 1;;) {
 282                const int l = (i + j) >> 1;
 283                if (l > i) {
 284                        const int t = list[c[l]].y;
 285                        if (t > y)
 286                                j = l;
 287                        else if (t < y)
 288                                i = l;
 289                        else
 290                                return l;
 291                } else
 292                        return l + 1;
 293        }
 294}
 295
 296static void stone(const int *a, int n, const int *b, int *J, int pref)
 297{
 298        const unsigned isq = isqrt(n);
 299        const unsigned bound =
 300                (option_mask32 & FLAG(d)) ? UINT_MAX : MAX(256, isq);
 301        int clen = 1;
 302        int clistlen = 100;
 303        int k = 0;
 304        struct cand *clist = xzalloc(clistlen * sizeof(clist[0]));
 305        struct cand cand;
 306        struct cand *q;
 307        int *klist = xzalloc((n + 2) * sizeof(klist[0]));
 308        /*clist[0] = (struct cand){0}; - xzalloc did it */
 309        /*klist[0] = 0; */
 310
 311        for (cand.x = 1; cand.x <= n; cand.x++) {
 312                int j = a[cand.x], oldl = 0;
 313                unsigned numtries = 0;
 314                if (j == 0)
 315                        continue;
 316                cand.y = -b[j];
 317                cand.pred = klist[0];
 318                do {
 319                        int l, tc;
 320                        if (cand.y <= clist[cand.pred].y)
 321                                continue;
 322                        l = search(klist, k, cand.y, clist);
 323                        if (l != oldl + 1)
 324                                cand.pred = klist[l - 1];
 325                        if (l <= k && clist[klist[l]].y <= cand.y)
 326                                continue;
 327                        if (clen == clistlen) {
 328                                clistlen = clistlen * 11 / 10;
 329                                clist = xrealloc(clist, clistlen * sizeof(clist[0]));
 330                        }
 331                        clist[clen] = cand;
 332                        tc = klist[l];
 333                        klist[l] = clen++;
 334                        if (l <= k) {
 335                                cand.pred = tc;
 336                                oldl = l;
 337                                numtries++;
 338                        } else {
 339                                k++;
 340                                break;
 341                        }
 342                } while ((cand.y = b[++j]) > 0 && numtries < bound);
 343        }
 344        /* Unravel */
 345        for (q = clist + klist[k]; q->y; q = clist + q->pred)
 346                J[q->x + pref] = q->y + pref;
 347        free(klist);
 348        free(clist);
 349}
 350
 351struct line {
 352        /* 'serial' is not used in the beginning, so we reuse it
 353         * to store line offsets, thus reducing memory pressure
 354         */
 355        union {
 356                unsigned serial;
 357                off_t offset;
 358        };
 359        unsigned value;
 360};
 361
 362static void equiv(struct line *a, int n, struct line *b, int m, int *c)
 363{
 364        int i = 1, j = 1;
 365
 366        while (i <= n && j <= m) {
 367                if (a[i].value < b[j].value)
 368                        a[i++].value = 0;
 369                else if (a[i].value == b[j].value)
 370                        a[i++].value = j;
 371                else
 372                        j++;
 373        }
 374        while (i <= n)
 375                a[i++].value = 0;
 376        b[m + 1].value = 0;
 377        j = 0;
 378        while (++j <= m) {
 379                c[j] = -b[j].serial;
 380                while (b[j + 1].value == b[j].value) {
 381                        j++;
 382                        c[j] = b[j].serial;
 383                }
 384        }
 385        c[j] = -1;
 386}
 387
 388static void unsort(const struct line *f, int l, int *b)
 389{
 390        int i;
 391        int *a = xmalloc((l + 1) * sizeof(a[0]));
 392        for (i = 1; i <= l; i++)
 393                a[f[i].serial] = f[i].value;
 394        for (i = 1; i <= l; i++)
 395                b[i] = a[i];
 396        free(a);
 397}
 398
 399static int line_compar(const void *a, const void *b)
 400{
 401#define l0 ((const struct line*)a)
 402#define l1 ((const struct line*)b)
 403        int r = l0->value - l1->value;
 404        if (r)
 405                return r;
 406        return l0->serial - l1->serial;
 407#undef l0
 408#undef l1
 409}
 410
 411static void fetch(FILE_and_pos_t *ft, const off_t *ix, int a, int b, int ch)
 412{
 413        int i, j, col;
 414        for (i = a; i <= b; i++) {
 415                seek_ft(ft, ix[i - 1]);
 416                putchar(ch);
 417                if (option_mask32 & FLAG(T))
 418                        putchar('\t');
 419                for (j = 0, col = 0; j < ix[i] - ix[i - 1]; j++) {
 420                        int c = fgetc(ft->ft_fp);
 421                        if (c == EOF) {
 422                                puts("\n\\ No newline at end of file");
 423                                return;
 424                        }
 425                        ft->ft_pos++;
 426                        if (c == '\t' && (option_mask32 & FLAG(t)))
 427                                do putchar(' '); while (++col & 7);
 428                        else {
 429                                putchar(c);
 430                                col++;
 431                        }
 432                }
 433        }
 434}
 435
 436/* Creates the match vector J, where J[i] is the index
 437 * of the line in the new file corresponding to the line i
 438 * in the old file. Lines start at 1 instead of 0, that value
 439 * being used instead to denote no corresponding line.
 440 * This vector is dynamically allocated and must be freed by the caller.
 441 *
 442 * * fp is an input parameter, where fp[0] and fp[1] are the open
 443 *   old file and new file respectively.
 444 * * nlen is an output variable, where nlen[0] and nlen[1]
 445 *   gets the number of lines in the old and new file respectively.
 446 * * ix is an output variable, where ix[0] and ix[1] gets
 447 *   assigned dynamically allocated vectors of the offsets of the lines
 448 *   of the old and new file respectively. These must be freed by the caller.
 449 */
 450static NOINLINE int *create_J(FILE_and_pos_t ft[2], int nlen[2], off_t *ix[2])
 451{
 452        int *J, slen[2], *class, *member;
 453        struct line *nfile[2], *sfile[2];
 454        int pref = 0, suff = 0, i, j, delta;
 455
 456        /* Lines of both files are hashed, and in the process
 457         * their offsets are stored in the array ix[fileno]
 458         * where fileno == 0 points to the old file, and
 459         * fileno == 1 points to the new one.
 460         */
 461        for (i = 0; i < 2; i++) {
 462                unsigned hash;
 463                token_t tok;
 464                size_t sz = 100;
 465                nfile[i] = xmalloc((sz + 3) * sizeof(nfile[i][0]));
 466                /* ft gets here without the correct position, cant use seek_ft */
 467                ft[i].ft_pos = 0;
 468                fseeko(ft[i].ft_fp, 0, SEEK_SET);
 469
 470                nlen[i] = 0;
 471                /* We could zalloc nfile, but then zalloc starts showing in gprof at ~1% */
 472                nfile[i][0].offset = 0;
 473                goto start; /* saves code */
 474                while (1) {
 475                        tok = read_token(&ft[i], tok);
 476                        if (!(tok & TOK_EMPTY)) {
 477                                /* Hash algorithm taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578. */
 478                                /*hash = hash * 128 - hash + TOK2CHAR(tok);
 479                                 * gcc insists on optimizing above to "hash * 127 + ...", thus... */
 480                                unsigned o = hash - TOK2CHAR(tok);
 481                                hash = hash * 128 - o; /* we want SPEED here */
 482                                continue;
 483                        }
 484                        if (nlen[i]++ == sz) {
 485                                sz = sz * 3 / 2;
 486                                nfile[i] = xrealloc(nfile[i], (sz + 3) * sizeof(nfile[i][0]));
 487                        }
 488                        /* line_compar needs hashes fit into positive int */
 489                        nfile[i][nlen[i]].value = hash & INT_MAX;
 490                        /* like ftello(ft[i].ft_fp) but faster (avoids lseek syscall) */
 491                        nfile[i][nlen[i]].offset = ft[i].ft_pos;
 492                        if (tok & TOK_EOF) {
 493                                /* EOF counts as a token, so we have to adjust it here */
 494                                nfile[i][nlen[i]].offset++;
 495                                break;
 496                        }
 497start:
 498                        hash = tok = 0;
 499                }
 500                /* Exclude lone EOF line from the end of the file, to make fetch()'s job easier */
 501                if (nfile[i][nlen[i]].offset - nfile[i][nlen[i] - 1].offset == 1)
 502                        nlen[i]--;
 503                /* Now we copy the line offsets into ix */
 504                ix[i] = xmalloc((nlen[i] + 2) * sizeof(ix[i][0]));
 505                for (j = 0; j < nlen[i] + 1; j++)
 506                        ix[i][j] = nfile[i][j].offset;
 507        }
 508
 509        /* length of prefix and suffix is calculated */
 510        for (; pref < nlen[0] && pref < nlen[1] &&
 511               nfile[0][pref + 1].value == nfile[1][pref + 1].value;
 512               pref++);
 513        for (; suff < nlen[0] - pref && suff < nlen[1] - pref &&
 514               nfile[0][nlen[0] - suff].value == nfile[1][nlen[1] - suff].value;
 515               suff++);
 516        /* Arrays are pruned by the suffix and prefix length,
 517         * the result being sorted and stored in sfile[fileno],
 518         * and their sizes are stored in slen[fileno]
 519         */
 520        for (j = 0; j < 2; j++) {
 521                sfile[j] = nfile[j] + pref;
 522                slen[j] = nlen[j] - pref - suff;
 523                for (i = 0; i <= slen[j]; i++)
 524                        sfile[j][i].serial = i;
 525                qsort(sfile[j] + 1, slen[j], sizeof(*sfile[j]), line_compar);
 526        }
 527        /* nfile arrays are reused to reduce memory pressure
 528         * The #if zeroed out section performs the same task as the
 529         * one in the #else section.
 530         * Peak memory usage is higher, but one array copy is avoided
 531         * by not using unsort()
 532         */
 533#if 0
 534        member = xmalloc((slen[1] + 2) * sizeof(member[0]));
 535        equiv(sfile[0], slen[0], sfile[1], slen[1], member);
 536        free(nfile[1]);
 537
 538        class = xmalloc((slen[0] + 1) * sizeof(class[0]));
 539        for (i = 1; i <= slen[0]; i++) /* Unsorting */
 540                class[sfile[0][i].serial] = sfile[0][i].value;
 541        free(nfile[0]);
 542#else
 543        member = (int *)nfile[1];
 544        equiv(sfile[0], slen[0], sfile[1], slen[1], member);
 545        member = xrealloc(member, (slen[1] + 2) * sizeof(member[0]));
 546
 547        class = (int *)nfile[0];
 548        unsort(sfile[0], slen[0], (int *)nfile[0]);
 549        class = xrealloc(class, (slen[0] + 2) * sizeof(class[0]));
 550#endif
 551        J = xmalloc((nlen[0] + 2) * sizeof(J[0]));
 552        /* The elements of J which fall inside the prefix and suffix regions
 553         * are marked as unchanged, while the ones which fall outside
 554         * are initialized with 0 (no matches), so that function stone can
 555         * then assign them their right values
 556         */
 557        for (i = 0, delta = nlen[1] - nlen[0]; i <= nlen[0]; i++)
 558                J[i] = i <= pref            ?  i :
 559                       i > (nlen[0] - suff) ? (i + delta) : 0;
 560        /* Here the magic is performed */
 561        stone(class, slen[0], member, J, pref);
 562        J[nlen[0] + 1] = nlen[1] + 1;
 563
 564        free(class);
 565        free(member);
 566
 567        /* Both files are rescanned, in an effort to find any lines
 568         * which, due to limitations intrinsic to any hashing algorithm,
 569         * are different but ended up confounded as the same
 570         */
 571        for (i = 1; i <= nlen[0]; i++) {
 572                if (!J[i])
 573                        continue;
 574
 575                seek_ft(&ft[0], ix[0][i - 1]);
 576                seek_ft(&ft[1], ix[1][J[i] - 1]);
 577
 578                for (j = J[i]; i <= nlen[0] && J[i] == j; i++, j++) {
 579                        token_t tok0 = 0, tok1 = 0;
 580                        do {
 581                                tok0 = read_token(&ft[0], tok0);
 582                                tok1 = read_token(&ft[1], tok1);
 583
 584                                if (((tok0 ^ tok1) & TOK_EMPTY) != 0 /* one is empty (not both) */
 585                                 || (!(tok0 & TOK_EMPTY) && TOK2CHAR(tok0) != TOK2CHAR(tok1))
 586                                ) {
 587                                        J[i] = 0; /* Break the correspondence */
 588                                }
 589                        } while (!(tok0 & tok1 & TOK_EMPTY));
 590                }
 591        }
 592
 593        return J;
 594}
 595
 596static bool diff(FILE* fp[2], char *file[2])
 597{
 598        int nlen[2];
 599        off_t *ix[2];
 600        FILE_and_pos_t ft[2];
 601        typedef struct { int a, b; } vec_t[2];
 602        vec_t *vec = NULL;
 603        int i = 1, j, k, idx = -1;
 604        bool anychange = false;
 605        int *J;
 606
 607        ft[0].ft_fp = fp[0];
 608        ft[1].ft_fp = fp[1];
 609        /* note that ft[i].ft_pos is unintitalized, create_J()
 610         * must not assume otherwise */
 611        J = create_J(ft, nlen, ix);
 612
 613        do {
 614                bool nonempty = false;
 615
 616                while (1) {
 617                        vec_t v;
 618
 619                        for (v[0].a = i; v[0].a <= nlen[0] && J[v[0].a] == J[v[0].a - 1] + 1; v[0].a++)
 620                                continue;
 621                        v[1].a = J[v[0].a - 1] + 1;
 622
 623                        for (v[0].b = v[0].a - 1; v[0].b < nlen[0] && !J[v[0].b + 1]; v[0].b++)
 624                                continue;
 625                        v[1].b = J[v[0].b + 1] - 1;
 626                        /*
 627                         * Indicate that there is a difference between lines a and b of the 'from' file
 628                         * to get to lines c to d of the 'to' file. If a is greater than b then there
 629                         * are no lines in the 'from' file involved and this means that there were
 630                         * lines appended (beginning at b).  If c is greater than d then there are
 631                         * lines missing from the 'to' file.
 632                         */
 633                        if (v[0].a <= v[0].b || v[1].a <= v[1].b) {
 634                                /*
 635                                 * If this change is more than 'context' lines from the
 636                                 * previous change, dump the record and reset it.
 637                                 */
 638                                int ct = (2 * opt_U_context) + 1;
 639                                if (idx >= 0
 640                                 && v[0].a > vec[idx][0].b + ct
 641                                 && v[1].a > vec[idx][1].b + ct
 642                                ) {
 643                                        break;
 644                                }
 645
 646                                for (j = 0; j < 2; j++)
 647                                        for (k = v[j].a; k <= v[j].b; k++)
 648                                                nonempty |= (ix[j][k] - ix[j][k - 1] != 1);
 649
 650                                vec = xrealloc_vector(vec, 6, ++idx);
 651                                memcpy(vec[idx], v, sizeof(v));
 652                        }
 653
 654                        i = v[0].b + 1;
 655                        if (i > nlen[0])
 656                                break;
 657                        J[v[0].b] = v[1].b;
 658                }
 659                if (idx < 0 || ((option_mask32 & FLAG(B)) && !nonempty))
 660                        goto cont;
 661                if (!(option_mask32 & FLAG(q))) {
 662                        int lowa;
 663                        vec_t span, *cvp = vec;
 664
 665                        if (!anychange) {
 666                                /* Print the context/unidiff header first time through */
 667                                printf("--- %s\n", label[0] ? label[0] : file[0]);
 668                                printf("+++ %s\n", label[1] ? label[1] : file[1]);
 669                        }
 670
 671                        printf("@@");
 672                        for (j = 0; j < 2; j++) {
 673                                int a = span[j].a = MAX(1, (*cvp)[j].a - opt_U_context);
 674                                int b = span[j].b = MIN(nlen[j], vec[idx][j].b + opt_U_context);
 675
 676                                printf(" %c%d", j ? '+' : '-', MIN(a, b));
 677                                if (a == b)
 678                                        continue;
 679                                printf(",%d", (a < b) ? b - a + 1 : 0);
 680                        }
 681                        puts(" @@");
 682                        /*
 683                         * Output changes in "unified" diff format--the old and new lines
 684                         * are printed together.
 685                         */
 686                        for (lowa = span[0].a; ; lowa = (*cvp++)[0].b + 1) {
 687                                bool end = cvp > &vec[idx];
 688                                fetch(&ft[0], ix[0], lowa, end ? span[0].b : (*cvp)[0].a - 1, ' ');
 689                                if (end)
 690                                        break;
 691                                for (j = 0; j < 2; j++)
 692                                        fetch(&ft[j], ix[j], (*cvp)[j].a, (*cvp)[j].b, j ? '+' : '-');
 693                        }
 694                }
 695                anychange = true;
 696 cont:
 697                idx = -1;
 698        } while (i <= nlen[0]);
 699
 700        free(vec);
 701        free(ix[0]);
 702        free(ix[1]);
 703        free(J);
 704        return anychange;
 705}
 706
 707static int diffreg(char *file[2])
 708{
 709        FILE *fp[2];
 710        bool binary = false, differ = false;
 711        int status = STATUS_SAME, i;
 712
 713        fp[0] = stdin;
 714        fp[1] = stdin;
 715        for (i = 0; i < 2; i++) {
 716                int fd = STDIN_FILENO;
 717                if (!LONE_DASH(file[i])) {
 718                        if (!(option_mask32 & FLAG(N))) {
 719                                fd = open_or_warn(file[i], O_RDONLY);
 720                                if (fd == -1)
 721                                        goto out;
 722                        } else {
 723                                /* -N: if some file does not exist compare it like empty */
 724                                fd = open(file[i], O_RDONLY);
 725                                if (fd == -1)
 726                                        fd = xopen("/dev/null", O_RDONLY);
 727                        }
 728                }
 729                /* Our diff implementation is using seek.
 730                 * When we meet non-seekable file, we must make a temp copy.
 731                 */
 732                if (lseek(fd, 0, SEEK_SET) == -1 && errno == ESPIPE) {
 733                        char name[] = "/tmp/difXXXXXX";
 734                        int fd_tmp = xmkstemp(name);
 735
 736                        unlink(name);
 737                        if (bb_copyfd_eof(fd, fd_tmp) < 0)
 738                                xfunc_die();
 739                        if (fd != STDIN_FILENO)
 740                                close(fd);
 741                        fd = fd_tmp;
 742                        xlseek(fd, 0, SEEK_SET);
 743                }
 744                fp[i] = fdopen(fd, "r");
 745        }
 746
 747        setup_common_bufsiz();
 748        while (1) {
 749                const size_t sz = COMMON_BUFSIZE / 2;
 750                char *const buf0 = bb_common_bufsiz1;
 751                char *const buf1 = buf0 + sz;
 752                int j, k;
 753                i = fread(buf0, 1, sz, fp[0]);
 754                j = fread(buf1, 1, sz, fp[1]);
 755                if (i != j) {
 756                        differ = true;
 757                        i = MIN(i, j);
 758                }
 759                if (i == 0)
 760                        break;
 761                for (k = 0; k < i; k++) {
 762                        if (!buf0[k] || !buf1[k])
 763                                binary = true;
 764                        if (buf0[k] != buf1[k])
 765                                differ = true;
 766                }
 767        }
 768        if (differ) {
 769                if (binary && !(option_mask32 & FLAG(a)))
 770                        status = STATUS_BINARY;
 771                else if (diff(fp, file))
 772                        status = STATUS_DIFFER;
 773        }
 774        if (status != STATUS_SAME)
 775                exit_status |= 1;
 776out:
 777        fclose_if_not_stdin(fp[0]);
 778        fclose_if_not_stdin(fp[1]);
 779
 780        return status;
 781}
 782
 783static void print_status(int status, char *path[2])
 784{
 785        switch (status) {
 786        case STATUS_BINARY:
 787        case STATUS_DIFFER:
 788                if ((option_mask32 & FLAG(q)) || status == STATUS_BINARY)
 789                        printf("Files %s and %s differ\n", path[0], path[1]);
 790                break;
 791        case STATUS_SAME:
 792                if (option_mask32 & FLAG(s))
 793                        printf("Files %s and %s are identical\n", path[0], path[1]);
 794                break;
 795        }
 796}
 797
 798#if ENABLE_FEATURE_DIFF_DIR
 799struct dlist {
 800        size_t len;
 801        int s, e;
 802        char **dl;
 803};
 804
 805/* This function adds a filename to dl, the directory listing. */
 806static int FAST_FUNC add_to_dirlist(const char *filename,
 807                struct stat *sb UNUSED_PARAM,
 808                void *userdata, int depth UNUSED_PARAM)
 809{
 810        struct dlist *const l = userdata;
 811        const char *file = filename + l->len;
 812        while (*file == '/')
 813                file++;
 814        l->dl = xrealloc_vector(l->dl, 6, l->e);
 815        l->dl[l->e] = xstrdup(file);
 816        l->e++;
 817        return TRUE;
 818}
 819
 820/* If recursion is not set, this function adds the directory
 821 * to the list and prevents recursive_action from recursing into it.
 822 */
 823static int FAST_FUNC skip_dir(const char *filename,
 824                struct stat *sb, void *userdata,
 825                int depth)
 826{
 827        if (!(option_mask32 & FLAG(r)) && depth) {
 828                add_to_dirlist(filename, sb, userdata, depth);
 829                return SKIP;
 830        }
 831        if (!(option_mask32 & FLAG(N))) {
 832                /* -r without -N: no need to recurse into dirs
 833                 * which do not exist on the "other side".
 834                 * Testcase: diff -r /tmp /
 835                 * (it would recurse deep into /proc without this code) */
 836                struct dlist *const l = userdata;
 837                filename += l->len;
 838                if (filename[0]) {
 839                        struct stat osb;
 840                        char *othername = concat_path_file(G.other_dir, filename);
 841                        int r = stat(othername, &osb);
 842                        free(othername);
 843                        if (r != 0 || !S_ISDIR(osb.st_mode)) {
 844                                /* other dir doesn't have similarly named
 845                                 * directory, don't recurse; return 1 upon
 846                                 * exit, just like diffutils' diff */
 847                                exit_status |= 1;
 848                                return SKIP;
 849                        }
 850                }
 851        }
 852        return TRUE;
 853}
 854
 855static void diffdir(char *p[2], const char *s_start)
 856{
 857        struct dlist list[2];
 858        int i;
 859
 860        memset(&list, 0, sizeof(list));
 861        for (i = 0; i < 2; i++) {
 862                /*list[i].s = list[i].e = 0; - memset did it */
 863                /*list[i].dl = NULL; */
 864
 865                G.other_dir = p[1 - i];
 866                /* We need to trim root directory prefix.
 867                 * Using list.len to specify its length,
 868                 * add_to_dirlist will remove it. */
 869                list[i].len = strlen(p[i]);
 870                recursive_action(p[i], ACTION_RECURSE | ACTION_FOLLOWLINKS,
 871                                add_to_dirlist, skip_dir, &list[i], 0);
 872                /* Sort dl alphabetically.
 873                 * GNU diff does this ignoring any number of trailing dots.
 874                 * We don't, so for us dotted files almost always are
 875                 * first on the list.
 876                 */
 877                qsort_string_vector(list[i].dl, list[i].e);
 878                /* If -S was set, find the starting point. */
 879                if (!s_start)
 880                        continue;
 881                while (list[i].s < list[i].e && strcmp(list[i].dl[list[i].s], s_start) < 0)
 882                        list[i].s++;
 883        }
 884        /* Now that both dirlist1 and dirlist2 contain sorted directory
 885         * listings, we can start to go through dirlist1. If both listings
 886         * contain the same file, then do a normal diff. Otherwise, behaviour
 887         * is determined by whether the -N flag is set. */
 888        while (1) {
 889                char *dp[2];
 890                int pos;
 891                int k;
 892
 893                dp[0] = list[0].s < list[0].e ? list[0].dl[list[0].s] : NULL;
 894                dp[1] = list[1].s < list[1].e ? list[1].dl[list[1].s] : NULL;
 895                if (!dp[0] && !dp[1])
 896                        break;
 897                pos = !dp[0] ? 1 : (!dp[1] ? -1 : strcmp(dp[0], dp[1]));
 898                k = pos > 0;
 899                if (pos && !(option_mask32 & FLAG(N))) {
 900                        printf("Only in %s: %s\n", p[k], dp[k]);
 901                        exit_status |= 1;
 902                } else {
 903                        char *fullpath[2], *path[2]; /* if -N */
 904
 905                        for (i = 0; i < 2; i++) {
 906                                if (pos == 0 || i == k) {
 907                                        path[i] = fullpath[i] = concat_path_file(p[i], dp[i]);
 908                                        stat(fullpath[i], &stb[i]);
 909                                } else {
 910                                        fullpath[i] = concat_path_file(p[i], dp[1 - i]);
 911                                        path[i] = (char *)bb_dev_null;
 912                                }
 913                        }
 914                        if (pos)
 915                                stat(fullpath[k], &stb[1 - k]);
 916
 917                        if (S_ISDIR(stb[0].st_mode) && S_ISDIR(stb[1].st_mode))
 918                                printf("Common subdirectories: %s and %s\n", fullpath[0], fullpath[1]);
 919                        else if (!S_ISREG(stb[0].st_mode) && !S_ISDIR(stb[0].st_mode))
 920                                printf("File %s is not a regular file or directory and was skipped\n", fullpath[0]);
 921                        else if (!S_ISREG(stb[1].st_mode) && !S_ISDIR(stb[1].st_mode))
 922                                printf("File %s is not a regular file or directory and was skipped\n", fullpath[1]);
 923                        else if (S_ISDIR(stb[0].st_mode) != S_ISDIR(stb[1].st_mode)) {
 924                                if (S_ISDIR(stb[0].st_mode))
 925                                        printf("File %s is a %s while file %s is a %s\n", fullpath[0], "directory", fullpath[1], "regular file");
 926                                else
 927                                        printf("File %s is a %s while file %s is a %s\n", fullpath[0], "regular file", fullpath[1], "directory");
 928                        } else
 929                                print_status(diffreg(path), fullpath);
 930
 931                        free(fullpath[0]);
 932                        free(fullpath[1]);
 933                }
 934                free(dp[k]);
 935                list[k].s++;
 936                if (pos == 0) {
 937                        free(dp[1 - k]);
 938                        list[1 - k].s++;
 939                }
 940        }
 941        if (ENABLE_FEATURE_CLEAN_UP) {
 942                free(list[0].dl);
 943                free(list[1].dl);
 944        }
 945}
 946#endif
 947
 948#if ENABLE_FEATURE_DIFF_LONG_OPTIONS
 949static const char diff_longopts[] ALIGN1 =
 950        "ignore-case\0"              No_argument       "i"
 951        "ignore-tab-expansion\0"     No_argument       "E"
 952        "ignore-space-change\0"      No_argument       "b"
 953        "ignore-all-space\0"         No_argument       "w"
 954        "ignore-blank-lines\0"       No_argument       "B"
 955        "text\0"                     No_argument       "a"
 956        "unified\0"                  Required_argument "U"
 957        "label\0"                    Required_argument "L"
 958        "show-c-function\0"          No_argument       "p"
 959        "brief\0"                    No_argument       "q"
 960        "expand-tabs\0"              No_argument       "t"
 961        "initial-tab\0"              No_argument       "T"
 962        "recursive\0"                No_argument       "r"
 963        "new-file\0"                 No_argument       "N"
 964        "report-identical-files\0"   No_argument       "s"
 965        "starting-file\0"            Required_argument "S"
 966        "minimal\0"                  No_argument       "d"
 967        ;
 968# define GETOPT32 getopt32long
 969# define LONGOPTS ,diff_longopts
 970#else
 971# define GETOPT32 getopt32
 972# define LONGOPTS
 973#endif
 974
 975int diff_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
 976int diff_main(int argc UNUSED_PARAM, char **argv)
 977{
 978        int gotstdin = 0, i;
 979        char *file[2], *s_start = NULL;
 980        llist_t *L_arg = NULL;
 981
 982        INIT_G();
 983
 984        /* exactly 2 params; collect multiple -L <label>; -U N */
 985        GETOPT32(argv, "^" "abdiL:*NqrsS:tTU:+wupBE" "\0" "=2"
 986                        LONGOPTS,
 987                        &L_arg, &s_start, &opt_U_context);
 988        argv += optind;
 989        while (L_arg)
 990                label[!!label[0]] = llist_pop(&L_arg);
 991
 992        /* Compat: "diff file name_which_doesnt_exist" exits with 2 */
 993        xfunc_error_retval = 2;
 994        for (i = 0; i < 2; i++) {
 995                file[i] = argv[i];
 996                if (LONE_DASH(file[i])) {
 997                        fstat(STDIN_FILENO, &stb[i]);
 998                        gotstdin++;
 999                } else if (option_mask32 & FLAG(N)) {
1000                        if (stat(file[i], &stb[i]))
1001                                xstat("/dev/null", &stb[i]);
1002                } else {
1003                        xstat(file[i], &stb[i]);
1004                }
1005        }
1006        xfunc_error_retval = 1;
1007
1008        if (gotstdin && (S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode)))
1009                bb_error_msg_and_die("can't compare stdin to a directory");
1010
1011        /* Compare metadata to check if the files are the same physical file.
1012         *
1013         * Comment from diffutils source says:
1014         * POSIX says that two files are identical if st_ino and st_dev are
1015         * the same, but many file systems incorrectly assign the same (device,
1016         * inode) pair to two distinct files, including:
1017         * GNU/Linux NFS servers that export all local file systems as a
1018         * single NFS file system, if a local device number (st_dev) exceeds
1019         * 255, or if a local inode number (st_ino) exceeds 16777215.
1020         */
1021        if (ENABLE_DESKTOP
1022         && stb[0].st_ino == stb[1].st_ino
1023         && stb[0].st_dev == stb[1].st_dev
1024         && stb[0].st_size == stb[1].st_size
1025         && stb[0].st_mtime == stb[1].st_mtime
1026         && stb[0].st_ctime == stb[1].st_ctime
1027         && stb[0].st_mode == stb[1].st_mode
1028         && stb[0].st_nlink == stb[1].st_nlink
1029         && stb[0].st_uid == stb[1].st_uid
1030         && stb[0].st_gid == stb[1].st_gid
1031        ) {
1032                /* files are physically the same; no need to compare them */
1033                return STATUS_SAME;
1034        }
1035
1036        if (S_ISDIR(stb[0].st_mode) && S_ISDIR(stb[1].st_mode)) {
1037#if ENABLE_FEATURE_DIFF_DIR
1038                diffdir(file, s_start);
1039#else
1040                bb_error_msg_and_die("no support for directory comparison");
1041#endif
1042        } else {
1043                bool dirfile = S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode);
1044                bool dir = S_ISDIR(stb[1].st_mode);
1045                if (dirfile) {
1046                        const char *slash = strrchr(file[!dir], '/');
1047                        file[dir] = concat_path_file(file[dir], slash ? slash + 1 : file[!dir]);
1048                        xstat(file[dir], &stb[dir]);
1049                }
1050                /* diffreg can get non-regular files here */
1051                print_status(gotstdin > 1 ? STATUS_SAME : diffreg(file), file);
1052
1053                if (dirfile)
1054                        free(file[dir]);
1055        }
1056
1057        return exit_status;
1058}
1059