busybox/archival/bz/blocksort.c
<<
>>
Prefs
   1/*
   2 * bzip2 is written by Julian Seward <jseward@bzip.org>.
   3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
   4 * See README and LICENSE files in this directory for more information.
   5 */
   6
   7/*-------------------------------------------------------------*/
   8/*--- Block sorting machinery                               ---*/
   9/*---                                           blocksort.c ---*/
  10/*-------------------------------------------------------------*/
  11
  12/* ------------------------------------------------------------------
  13This file is part of bzip2/libbzip2, a program and library for
  14lossless, block-sorting data compression.
  15
  16bzip2/libbzip2 version 1.0.4 of 20 December 2006
  17Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
  18
  19Please read the WARNING, DISCLAIMER and PATENTS sections in the
  20README file.
  21
  22This program is released under the terms of the license contained
  23in the file LICENSE.
  24------------------------------------------------------------------ */
  25
  26/* #include "bzlib_private.h" */
  27
  28#define mswap(zz1, zz2) \
  29{ \
  30        int32_t zztmp = zz1; \
  31        zz1 = zz2; \
  32        zz2 = zztmp; \
  33}
  34
  35static
  36/* No measurable speed gain with inlining */
  37/* ALWAYS_INLINE */
  38void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
  39{
  40        while (zzn > 0) {
  41                mswap(ptr[zzp1], ptr[zzp2]);
  42                zzp1++;
  43                zzp2++;
  44                zzn--;
  45        }
  46}
  47
  48static
  49ALWAYS_INLINE
  50int32_t mmin(int32_t a, int32_t b)
  51{
  52        return (a < b) ? a : b;
  53}
  54
  55
  56/*---------------------------------------------*/
  57/*--- Fallback O(N log(N)^2) sorting        ---*/
  58/*--- algorithm, for repetitive blocks      ---*/
  59/*---------------------------------------------*/
  60
  61/*---------------------------------------------*/
  62static
  63inline
  64void fallbackSimpleSort(uint32_t* fmap,
  65                uint32_t* eclass,
  66                int32_t   lo,
  67                int32_t   hi)
  68{
  69        int32_t i, j, tmp;
  70        uint32_t ec_tmp;
  71
  72        if (lo == hi) return;
  73
  74        if (hi - lo > 3) {
  75                for (i = hi-4; i >= lo; i--) {
  76                        tmp = fmap[i];
  77                        ec_tmp = eclass[tmp];
  78                        for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
  79                                fmap[j-4] = fmap[j];
  80                        fmap[j-4] = tmp;
  81                }
  82        }
  83
  84        for (i = hi-1; i >= lo; i--) {
  85                tmp = fmap[i];
  86                ec_tmp = eclass[tmp];
  87                for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
  88                        fmap[j-1] = fmap[j];
  89                fmap[j-1] = tmp;
  90        }
  91}
  92
  93
  94/*---------------------------------------------*/
  95#define fpush(lz,hz) { \
  96        stackLo[sp] = lz; \
  97        stackHi[sp] = hz; \
  98        sp++; \
  99}
 100
 101#define fpop(lz,hz) { \
 102        sp--; \
 103        lz = stackLo[sp]; \
 104        hz = stackHi[sp]; \
 105}
 106
 107#define FALLBACK_QSORT_SMALL_THRESH 10
 108#define FALLBACK_QSORT_STACK_SIZE   100
 109
 110static
 111void fallbackQSort3(uint32_t* fmap,
 112                uint32_t* eclass,
 113                int32_t   loSt,
 114                int32_t   hiSt)
 115{
 116        int32_t unLo, unHi, ltLo, gtHi, n, m;
 117        int32_t sp, lo, hi;
 118        uint32_t med, r, r3;
 119        int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
 120        int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
 121
 122        r = 0;
 123
 124        sp = 0;
 125        fpush(loSt, hiSt);
 126
 127        while (sp > 0) {
 128                AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
 129
 130                fpop(lo, hi);
 131                if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 132                        fallbackSimpleSort(fmap, eclass, lo, hi);
 133                        continue;
 134                }
 135
 136                /* Random partitioning.  Median of 3 sometimes fails to
 137                 * avoid bad cases.  Median of 9 seems to help but
 138                 * looks rather expensive.  This too seems to work but
 139                 * is cheaper.  Guidance for the magic constants
 140                 * 7621 and 32768 is taken from Sedgewick's algorithms
 141                 * book, chapter 35.
 142                 */
 143                r = ((r * 7621) + 1) % 32768;
 144                r3 = r % 3;
 145                if (r3 == 0)
 146                        med = eclass[fmap[lo]];
 147                else if (r3 == 1)
 148                        med = eclass[fmap[(lo+hi)>>1]];
 149                else
 150                        med = eclass[fmap[hi]];
 151
 152                unLo = ltLo = lo;
 153                unHi = gtHi = hi;
 154
 155                while (1) {
 156                        while (1) {
 157                                if (unLo > unHi) break;
 158                                n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
 159                                if (n == 0) {
 160                                        mswap(fmap[unLo], fmap[ltLo]);
 161                                        ltLo++;
 162                                        unLo++;
 163                                        continue;
 164                                };
 165                                if (n > 0) break;
 166                                unLo++;
 167                        }
 168                        while (1) {
 169                                if (unLo > unHi) break;
 170                                n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
 171                                if (n == 0) {
 172                                        mswap(fmap[unHi], fmap[gtHi]);
 173                                        gtHi--; unHi--;
 174                                        continue;
 175                                };
 176                                if (n < 0) break;
 177                                unHi--;
 178                        }
 179                        if (unLo > unHi) break;
 180                        mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 181                }
 182
 183                AssertD(unHi == unLo-1, "fallbackQSort3(2)");
 184
 185                if (gtHi < ltLo) continue;
 186
 187                n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
 188                m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
 189
 190                n = lo + unLo - ltLo - 1;
 191                m = hi - (gtHi - unHi) + 1;
 192
 193                if (n - lo > hi - m) {
 194                        fpush(lo, n);
 195                        fpush(m, hi);
 196                } else {
 197                        fpush(m, hi);
 198                        fpush(lo, n);
 199                }
 200        }
 201}
 202
 203#undef fpush
 204#undef fpop
 205#undef FALLBACK_QSORT_SMALL_THRESH
 206#undef FALLBACK_QSORT_STACK_SIZE
 207
 208
 209/*---------------------------------------------*/
 210/* Pre:
 211 *      nblock > 0
 212 *      eclass exists for [0 .. nblock-1]
 213 *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
 214 *      ptr exists for [0 .. nblock-1]
 215 *
 216 * Post:
 217 *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
 218 *      All other areas of eclass destroyed
 219 *      fmap [0 .. nblock-1] holds sorted order
 220 *      bhtab[0 .. 2+(nblock/32)] destroyed
 221*/
 222
 223#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 224#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 225#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 226#define      WORD_BH(zz)  bhtab[(zz) >> 5]
 227#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 228
 229static
 230void fallbackSort(uint32_t* fmap,
 231                uint32_t* eclass,
 232                uint32_t* bhtab,
 233                int32_t   nblock)
 234{
 235        int32_t ftab[257];
 236        int32_t ftabCopy[256];
 237        int32_t H, i, j, k, l, r, cc, cc1;
 238        int32_t nNotDone;
 239        int32_t nBhtab;
 240        uint8_t* eclass8 = (uint8_t*)eclass;
 241
 242        /*
 243         * Initial 1-char radix sort to generate
 244         * initial fmap and initial BH bits.
 245         */
 246        for (i = 0; i < 257;    i++) ftab[i] = 0;
 247        for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 248        for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 249
 250        j = ftab[0];  /* bbox: optimized */
 251        for (i = 1; i < 257;    i++) {
 252                j += ftab[i];
 253                ftab[i] = j;
 254        }
 255
 256        for (i = 0; i < nblock; i++) {
 257                j = eclass8[i];
 258                k = ftab[j] - 1;
 259                ftab[j] = k;
 260                fmap[k] = i;
 261        }
 262
 263        nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
 264        for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 265        for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 266
 267        /*
 268         * Inductively refine the buckets.  Kind-of an
 269         * "exponential radix sort" (!), inspired by the
 270         * Manber-Myers suffix array construction algorithm.
 271         */
 272
 273        /*-- set sentinel bits for block-end detection --*/
 274        for (i = 0; i < 32; i++) {
 275                SET_BH(nblock + 2*i);
 276                CLEAR_BH(nblock + 2*i + 1);
 277        }
 278
 279        /*-- the log(N) loop --*/
 280        H = 1;
 281        while (1) {
 282                j = 0;
 283                for (i = 0; i < nblock; i++) {
 284                        if (ISSET_BH(i))
 285                                j = i;
 286                        k = fmap[i] - H;
 287                        if (k < 0)
 288                                k += nblock;
 289                        eclass[k] = j;
 290                }
 291
 292                nNotDone = 0;
 293                r = -1;
 294                while (1) {
 295
 296                /*-- find the next non-singleton bucket --*/
 297                        k = r + 1;
 298                        while (ISSET_BH(k) && UNALIGNED_BH(k))
 299                                k++;
 300                        if (ISSET_BH(k)) {
 301                                while (WORD_BH(k) == 0xffffffff) k += 32;
 302                                while (ISSET_BH(k)) k++;
 303                        }
 304                        l = k - 1;
 305                        if (l >= nblock)
 306                                break;
 307                        while (!ISSET_BH(k) && UNALIGNED_BH(k))
 308                                k++;
 309                        if (!ISSET_BH(k)) {
 310                                while (WORD_BH(k) == 0x00000000) k += 32;
 311                                while (!ISSET_BH(k)) k++;
 312                        }
 313                        r = k - 1;
 314                        if (r >= nblock)
 315                                break;
 316
 317                        /*-- now [l, r] bracket current bucket --*/
 318                        if (r > l) {
 319                                nNotDone += (r - l + 1);
 320                                fallbackQSort3(fmap, eclass, l, r);
 321
 322                                /*-- scan bucket and generate header bits-- */
 323                                cc = -1;
 324                                for (i = l; i <= r; i++) {
 325                                        cc1 = eclass[fmap[i]];
 326                                        if (cc != cc1) {
 327                                                SET_BH(i);
 328                                                cc = cc1;
 329                                        };
 330                                }
 331                        }
 332                }
 333
 334                H *= 2;
 335                if (H > nblock || nNotDone == 0)
 336                        break;
 337        }
 338
 339        /*
 340         * Reconstruct the original block in
 341         * eclass8 [0 .. nblock-1], since the
 342         * previous phase destroyed it.
 343         */
 344        j = 0;
 345        for (i = 0; i < nblock; i++) {
 346                while (ftabCopy[j] == 0)
 347                        j++;
 348                ftabCopy[j]--;
 349                eclass8[fmap[i]] = (uint8_t)j;
 350        }
 351        AssertH(j < 256, 1005);
 352}
 353
 354#undef       SET_BH
 355#undef     CLEAR_BH
 356#undef     ISSET_BH
 357#undef      WORD_BH
 358#undef UNALIGNED_BH
 359
 360
 361/*---------------------------------------------*/
 362/*--- The main, O(N^2 log(N)) sorting       ---*/
 363/*--- algorithm.  Faster for "normal"       ---*/
 364/*--- non-repetitive blocks.                ---*/
 365/*---------------------------------------------*/
 366
 367/*---------------------------------------------*/
 368static
 369NOINLINE
 370int mainGtU(
 371                uint32_t  i1,
 372                uint32_t  i2,
 373                uint8_t*  block,
 374                uint16_t* quadrant,
 375                uint32_t  nblock,
 376                int32_t*  budget)
 377{
 378        int32_t  k;
 379        uint8_t  c1, c2;
 380        uint16_t s1, s2;
 381
 382/* Loop unrolling here is actually very useful
 383 * (generated code is much simpler),
 384 * code size increase is only 270 bytes (i386)
 385 * but speeds up compression 10% overall
 386 */
 387
 388#if CONFIG_BZIP2_FEATURE_SPEED >= 1
 389
 390#define TIMES_8(code) \
 391        code; code; code; code; \
 392        code; code; code; code;
 393#define TIMES_12(code) \
 394        code; code; code; code; \
 395        code; code; code; code; \
 396        code; code; code; code;
 397
 398#else
 399
 400#define TIMES_8(code) \
 401{ \
 402        int nn = 8; \
 403        do { \
 404                code; \
 405        } while (--nn); \
 406}
 407#define TIMES_12(code) \
 408{ \
 409        int nn = 12; \
 410        do { \
 411                code; \
 412        } while (--nn); \
 413}
 414
 415#endif
 416
 417        AssertD(i1 != i2, "mainGtU");
 418        TIMES_12(
 419                c1 = block[i1]; c2 = block[i2];
 420                if (c1 != c2) return (c1 > c2);
 421                i1++; i2++;
 422        )
 423
 424        k = nblock + 8;
 425
 426        do {
 427                TIMES_8(
 428                        c1 = block[i1]; c2 = block[i2];
 429                        if (c1 != c2) return (c1 > c2);
 430                        s1 = quadrant[i1]; s2 = quadrant[i2];
 431                        if (s1 != s2) return (s1 > s2);
 432                        i1++; i2++;
 433                )
 434
 435                if (i1 >= nblock) i1 -= nblock;
 436                if (i2 >= nblock) i2 -= nblock;
 437
 438                (*budget)--;
 439                k -= 8;
 440        } while (k >= 0);
 441
 442        return False;
 443}
 444#undef TIMES_8
 445#undef TIMES_12
 446
 447/*---------------------------------------------*/
 448/*
 449 * Knuth's increments seem to work better
 450 * than Incerpi-Sedgewick here.  Possibly
 451 * because the number of elems to sort is
 452 * usually small, typically <= 20.
 453 */
 454static
 455const int32_t incs[14] = {
 456        1, 4, 13, 40, 121, 364, 1093, 3280,
 457        9841, 29524, 88573, 265720,
 458        797161, 2391484
 459};
 460
 461static
 462void mainSimpleSort(uint32_t* ptr,
 463                uint8_t*  block,
 464                uint16_t* quadrant,
 465                int32_t   nblock,
 466                int32_t   lo,
 467                int32_t   hi,
 468                int32_t   d,
 469                int32_t*  budget)
 470{
 471        int32_t i, j, h, bigN, hp;
 472        uint32_t v;
 473
 474        bigN = hi - lo + 1;
 475        if (bigN < 2) return;
 476
 477        hp = 0;
 478        while (incs[hp] < bigN) hp++;
 479        hp--;
 480
 481        for (; hp >= 0; hp--) {
 482                h = incs[hp];
 483
 484                i = lo + h;
 485                while (1) {
 486                        /*-- copy 1 --*/
 487                        if (i > hi) break;
 488                        v = ptr[i];
 489                        j = i;
 490                        while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
 491                                ptr[j] = ptr[j-h];
 492                                j = j - h;
 493                                if (j <= (lo + h - 1)) break;
 494                        }
 495                        ptr[j] = v;
 496                        i++;
 497
 498/* 1.5% overall speedup, +290 bytes */
 499#if CONFIG_BZIP2_FEATURE_SPEED >= 3
 500                        /*-- copy 2 --*/
 501                        if (i > hi) break;
 502                        v = ptr[i];
 503                        j = i;
 504                        while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
 505                                ptr[j] = ptr[j-h];
 506                                j = j - h;
 507                                if (j <= (lo + h - 1)) break;
 508                        }
 509                        ptr[j] = v;
 510                        i++;
 511
 512                        /*-- copy 3 --*/
 513                        if (i > hi) break;
 514                        v = ptr[i];
 515                        j = i;
 516                        while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
 517                                ptr[j] = ptr[j-h];
 518                                j = j - h;
 519                                if (j <= (lo + h - 1)) break;
 520                        }
 521                        ptr[j] = v;
 522                        i++;
 523#endif
 524                        if (*budget < 0) return;
 525                }
 526        }
 527}
 528
 529
 530/*---------------------------------------------*/
 531/*
 532 * The following is an implementation of
 533 * an elegant 3-way quicksort for strings,
 534 * described in a paper "Fast Algorithms for
 535 * Sorting and Searching Strings", by Robert
 536 * Sedgewick and Jon L. Bentley.
 537 */
 538
 539static
 540ALWAYS_INLINE
 541uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
 542{
 543        uint8_t t;
 544        if (a > b) {
 545                t = a;
 546                a = b;
 547                b = t;
 548        };
 549        /* here b >= a */
 550        if (b > c) {
 551                b = c;
 552                if (a > b)
 553                        b = a;
 554        }
 555        return b;
 556}
 557
 558#define mpush(lz,hz,dz) \
 559{ \
 560        stackLo[sp] = lz; \
 561        stackHi[sp] = hz; \
 562        stackD [sp] = dz; \
 563        sp++; \
 564}
 565
 566#define mpop(lz,hz,dz) \
 567{ \
 568        sp--; \
 569        lz = stackLo[sp]; \
 570        hz = stackHi[sp]; \
 571        dz = stackD [sp]; \
 572}
 573
 574#define mnextsize(az) (nextHi[az] - nextLo[az])
 575
 576#define mnextswap(az,bz) \
 577{ \
 578        int32_t tz; \
 579        tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 580        tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 581        tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
 582}
 583
 584#define MAIN_QSORT_SMALL_THRESH 20
 585#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 586#define MAIN_QSORT_STACK_SIZE   100
 587
 588static
 589void mainQSort3(uint32_t* ptr,
 590                uint8_t*  block,
 591                uint16_t* quadrant,
 592                int32_t   nblock,
 593                int32_t   loSt,
 594                int32_t   hiSt,
 595                int32_t   dSt,
 596                int32_t*  budget)
 597{
 598        int32_t unLo, unHi, ltLo, gtHi, n, m, med;
 599        int32_t sp, lo, hi, d;
 600
 601        int32_t stackLo[MAIN_QSORT_STACK_SIZE];
 602        int32_t stackHi[MAIN_QSORT_STACK_SIZE];
 603        int32_t stackD [MAIN_QSORT_STACK_SIZE];
 604
 605        int32_t nextLo[3];
 606        int32_t nextHi[3];
 607        int32_t nextD [3];
 608
 609        sp = 0;
 610        mpush(loSt, hiSt, dSt);
 611
 612        while (sp > 0) {
 613                AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
 614
 615                mpop(lo, hi, d);
 616                if (hi - lo < MAIN_QSORT_SMALL_THRESH
 617                 || d > MAIN_QSORT_DEPTH_THRESH
 618                ) {
 619                        mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
 620                        if (*budget < 0)
 621                                return;
 622                        continue;
 623                }
 624                med = (int32_t) mmed3(block[ptr[lo          ] + d],
 625                                      block[ptr[hi          ] + d],
 626                                      block[ptr[(lo+hi) >> 1] + d]);
 627
 628                unLo = ltLo = lo;
 629                unHi = gtHi = hi;
 630
 631                while (1) {
 632                        while (1) {
 633                                if (unLo > unHi)
 634                                        break;
 635                                n = ((int32_t)block[ptr[unLo]+d]) - med;
 636                                if (n == 0) {
 637                                        mswap(ptr[unLo], ptr[ltLo]);
 638                                        ltLo++;
 639                                        unLo++;
 640                                        continue;
 641                                };
 642                                if (n >  0) break;
 643                                unLo++;
 644                        }
 645                        while (1) {
 646                                if (unLo > unHi)
 647                                        break;
 648                                n = ((int32_t)block[ptr[unHi]+d]) - med;
 649                                if (n == 0) {
 650                                        mswap(ptr[unHi], ptr[gtHi]);
 651                                        gtHi--;
 652                                        unHi--;
 653                                        continue;
 654                                };
 655                                if (n <  0) break;
 656                                unHi--;
 657                        }
 658                        if (unLo > unHi)
 659                                break;
 660                        mswap(ptr[unLo], ptr[unHi]);
 661                        unLo++;
 662                        unHi--;
 663                }
 664
 665                AssertD(unHi == unLo-1, "mainQSort3(2)");
 666
 667                if (gtHi < ltLo) {
 668                        mpush(lo, hi, d + 1);
 669                        continue;
 670                }
 671
 672                n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
 673                m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
 674
 675                n = lo + unLo - ltLo - 1;
 676                m = hi - (gtHi - unHi) + 1;
 677
 678                nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 679                nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 680                nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 681
 682                if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
 683                if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
 684                if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
 685
 686                AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
 687                AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
 688
 689                mpush(nextLo[0], nextHi[0], nextD[0]);
 690                mpush(nextLo[1], nextHi[1], nextD[1]);
 691                mpush(nextLo[2], nextHi[2], nextD[2]);
 692        }
 693}
 694
 695#undef mpush
 696#undef mpop
 697#undef mnextsize
 698#undef mnextswap
 699#undef MAIN_QSORT_SMALL_THRESH
 700#undef MAIN_QSORT_DEPTH_THRESH
 701#undef MAIN_QSORT_STACK_SIZE
 702
 703
 704/*---------------------------------------------*/
 705/* Pre:
 706 *      nblock > N_OVERSHOOT
 707 *      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 708 *      ((uint8_t*)block32) [0 .. nblock-1] holds block
 709 *      ptr exists for [0 .. nblock-1]
 710 *
 711 * Post:
 712 *      ((uint8_t*)block32) [0 .. nblock-1] holds block
 713 *      All other areas of block32 destroyed
 714 *      ftab[0 .. 65536] destroyed
 715 *      ptr [0 .. nblock-1] holds sorted order
 716 *      if (*budget < 0), sorting was abandoned
 717 */
 718
 719#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 720#define SETMASK (1 << 21)
 721#define CLEARMASK (~(SETMASK))
 722
 723static NOINLINE
 724void mainSort(EState* state,
 725                uint32_t* ptr,
 726                uint8_t*  block,
 727                uint16_t* quadrant,
 728                uint32_t* ftab,
 729                int32_t   nblock,
 730                int32_t*  budget)
 731{
 732        int32_t  i, j, k, ss, sb;
 733        uint8_t  c1;
 734        int32_t  numQSorted;
 735        uint16_t s;
 736        Bool     bigDone[256];
 737        /* bbox: moved to EState to save stack
 738        int32_t  runningOrder[256];
 739        int32_t  copyStart[256];
 740        int32_t  copyEnd  [256];
 741        */
 742#define runningOrder (state->mainSort__runningOrder)
 743#define copyStart    (state->mainSort__copyStart)
 744#define copyEnd      (state->mainSort__copyEnd)
 745
 746        /*-- set up the 2-byte frequency table --*/
 747        /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
 748        memset(ftab, 0, 65537 * sizeof(ftab[0]));
 749
 750        j = block[0] << 8;
 751        i = nblock - 1;
 752/* 3%, +300 bytes */
 753#if CONFIG_BZIP2_FEATURE_SPEED >= 2
 754        for (; i >= 3; i -= 4) {
 755                quadrant[i] = 0;
 756                j = (j >> 8) | (((uint16_t)block[i]) << 8);
 757                ftab[j]++;
 758                quadrant[i-1] = 0;
 759                j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
 760                ftab[j]++;
 761                quadrant[i-2] = 0;
 762                j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
 763                ftab[j]++;
 764                quadrant[i-3] = 0;
 765                j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
 766                ftab[j]++;
 767        }
 768#endif
 769        for (; i >= 0; i--) {
 770                quadrant[i] = 0;
 771                j = (j >> 8) | (((uint16_t)block[i]) << 8);
 772                ftab[j]++;
 773        }
 774
 775        /*-- (emphasises close relationship of block & quadrant) --*/
 776        for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 777                block   [nblock+i] = block[i];
 778                quadrant[nblock+i] = 0;
 779        }
 780
 781        /*-- Complete the initial radix sort --*/
 782        j = ftab[0]; /* bbox: optimized */
 783        for (i = 1; i <= 65536; i++) {
 784                j += ftab[i];
 785                ftab[i] = j;
 786        }
 787
 788        s = block[0] << 8;
 789        i = nblock - 1;
 790#if CONFIG_BZIP2_FEATURE_SPEED >= 2
 791        for (; i >= 3; i -= 4) {
 792                s = (s >> 8) | (block[i] << 8);
 793                j = ftab[s] - 1;
 794                ftab[s] = j;
 795                ptr[j] = i;
 796                s = (s >> 8) | (block[i-1] << 8);
 797                j = ftab[s] - 1;
 798                ftab[s] = j;
 799                ptr[j] = i-1;
 800                s = (s >> 8) | (block[i-2] << 8);
 801                j = ftab[s] - 1;
 802                ftab[s] = j;
 803                ptr[j] = i-2;
 804                s = (s >> 8) | (block[i-3] << 8);
 805                j = ftab[s] - 1;
 806                ftab[s] = j;
 807                ptr[j] = i-3;
 808        }
 809#endif
 810        for (; i >= 0; i--) {
 811                s = (s >> 8) | (block[i] << 8);
 812                j = ftab[s] - 1;
 813                ftab[s] = j;
 814                ptr[j] = i;
 815        }
 816
 817        /*
 818         * Now ftab contains the first loc of every small bucket.
 819         * Calculate the running order, from smallest to largest
 820         * big bucket.
 821         */
 822        for (i = 0; i <= 255; i++) {
 823                bigDone     [i] = False;
 824                runningOrder[i] = i;
 825        }
 826
 827        {
 828                int32_t vv;
 829                /* bbox: was: int32_t h = 1; */
 830                /* do h = 3 * h + 1; while (h <= 256); */
 831                uint32_t h = 364;
 832
 833                do {
 834                        /*h = h / 3;*/
 835                        h = (h * 171) >> 9; /* bbox: fast h/3 */
 836                        for (i = h; i <= 255; i++) {
 837                                vv = runningOrder[i];
 838                                j = i;
 839                                while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
 840                                        runningOrder[j] = runningOrder[j-h];
 841                                        j = j - h;
 842                                        if (j <= (h - 1))
 843                                                goto zero;
 844                                }
 845 zero:
 846                                runningOrder[j] = vv;
 847                        }
 848                } while (h != 1);
 849        }
 850
 851        /*
 852         * The main sorting loop.
 853         */
 854
 855        numQSorted = 0;
 856
 857        for (i = 0; i <= 255; i++) {
 858
 859                /*
 860                 * Process big buckets, starting with the least full.
 861                 * Basically this is a 3-step process in which we call
 862                 * mainQSort3 to sort the small buckets [ss, j], but
 863                 * also make a big effort to avoid the calls if we can.
 864                 */
 865                ss = runningOrder[i];
 866
 867                /*
 868                 * Step 1:
 869                 * Complete the big bucket [ss] by quicksorting
 870                 * any unsorted small buckets [ss, j], for j != ss.
 871                 * Hopefully previous pointer-scanning phases have already
 872                 * completed many of the small buckets [ss, j], so
 873                 * we don't have to sort them at all.
 874                 */
 875                for (j = 0; j <= 255; j++) {
 876                        if (j != ss) {
 877                                sb = (ss << 8) + j;
 878                                if (!(ftab[sb] & SETMASK)) {
 879                                        int32_t lo =  ftab[sb]   & CLEARMASK;
 880                                        int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
 881                                        if (hi > lo) {
 882                                                mainQSort3(
 883                                                        ptr, block, quadrant, nblock,
 884                                                        lo, hi, BZ_N_RADIX, budget
 885                                                );
 886                                                if (*budget < 0) return;
 887                                                numQSorted += (hi - lo + 1);
 888                                        }
 889                                }
 890                                ftab[sb] |= SETMASK;
 891                        }
 892                }
 893
 894                AssertH(!bigDone[ss], 1006);
 895
 896                /*
 897                 * Step 2:
 898                 * Now scan this big bucket [ss] so as to synthesise the
 899                 * sorted order for small buckets [t, ss] for all t,
 900                 * including, magically, the bucket [ss,ss] too.
 901                 * This will avoid doing Real Work in subsequent Step 1's.
 902                 */
 903                {
 904                        for (j = 0; j <= 255; j++) {
 905                                copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 906                                copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 907                        }
 908                        for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 909                                k = ptr[j] - 1;
 910                                if (k < 0)
 911                                        k += nblock;
 912                                c1 = block[k];
 913                                if (!bigDone[c1])
 914                                        ptr[copyStart[c1]++] = k;
 915                        }
 916                        for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 917                                k = ptr[j]-1;
 918                                if (k < 0)
 919                                        k += nblock;
 920                                c1 = block[k];
 921                                if (!bigDone[c1])
 922                                        ptr[copyEnd[c1]--] = k;
 923                        }
 924                }
 925
 926                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 927                 * Necessity for this case is demonstrated by compressing
 928                 * a sequence of approximately 48.5 million of character
 929                 * 251; 1.0.0/1.0.1 will then die here. */
 930                AssertH((copyStart[ss]-1 == copyEnd[ss]) \
 931                     || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
 932
 933                for (j = 0; j <= 255; j++)
 934                        ftab[(j << 8) + ss] |= SETMASK;
 935
 936                /*
 937                 * Step 3:
 938                 * The [ss] big bucket is now done.  Record this fact,
 939                 * and update the quadrant descriptors.  Remember to
 940                 * update quadrants in the overshoot area too, if
 941                 * necessary.  The "if (i < 255)" test merely skips
 942                 * this updating for the last bucket processed, since
 943                 * updating for the last bucket is pointless.
 944                 *
 945                 * The quadrant array provides a way to incrementally
 946                 * cache sort orderings, as they appear, so as to
 947                 * make subsequent comparisons in fullGtU() complete
 948                 * faster.  For repetitive blocks this makes a big
 949                 * difference (but not big enough to be able to avoid
 950                 * the fallback sorting mechanism, exponential radix sort).
 951                 *
 952                 * The precise meaning is: at all times:
 953                 *
 954                 *      for 0 <= i < nblock and 0 <= j <= nblock
 955                 *
 956                 *      if block[i] != block[j],
 957                 *
 958                 *              then the relative values of quadrant[i] and
 959                 *                        quadrant[j] are meaningless.
 960                 *
 961                 *              else {
 962                 *                      if quadrant[i] < quadrant[j]
 963                 *                              then the string starting at i lexicographically
 964                 *                              precedes the string starting at j
 965                 *
 966                 *                      else if quadrant[i] > quadrant[j]
 967                 *                              then the string starting at j lexicographically
 968                 *                              precedes the string starting at i
 969                 *
 970                 *                      else
 971                 *                              the relative ordering of the strings starting
 972                 *                              at i and j has not yet been determined.
 973                 *              }
 974                 */
 975                bigDone[ss] = True;
 976
 977                if (i < 255) {
 978                        int32_t bbStart = ftab[ss << 8] & CLEARMASK;
 979                        int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
 980                        int32_t shifts  = 0;
 981
 982                        while ((bbSize >> shifts) > 65534) shifts++;
 983
 984                        for (j = bbSize-1; j >= 0; j--) {
 985                                int32_t a2update   = ptr[bbStart + j];
 986                                uint16_t qVal      = (uint16_t)(j >> shifts);
 987                                quadrant[a2update] = qVal;
 988                                if (a2update < BZ_N_OVERSHOOT)
 989                                        quadrant[a2update + nblock] = qVal;
 990                        }
 991                        AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
 992                }
 993        }
 994#undef runningOrder
 995#undef copyStart
 996#undef copyEnd
 997}
 998
 999#undef BIGFREQ
1000#undef SETMASK
1001#undef CLEARMASK
1002
1003
1004/*---------------------------------------------*/
1005/* Pre:
1006 *      nblock > 0
1007 *      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1008 *        ((uint8_t*)arr2)[0 .. nblock-1] holds block
1009 *      arr1 exists for [0 .. nblock-1]
1010 *
1011 * Post:
1012 *      ((uint8_t*)arr2) [0 .. nblock-1] holds block
1013 *      All other areas of block destroyed
1014 *      ftab[0 .. 65536] destroyed
1015 *      arr1[0 .. nblock-1] holds sorted order
1016 */
1017static NOINLINE
1018void BZ2_blockSort(EState* s)
1019{
1020        /* In original bzip2 1.0.4, it's a parameter, but 30
1021         * (which was the default) should work ok. */
1022        enum { wfact = 30 };
1023
1024        uint32_t* ptr    = s->ptr;
1025        uint8_t*  block  = s->block;
1026        uint32_t* ftab   = s->ftab;
1027        int32_t   nblock = s->nblock;
1028        uint16_t* quadrant;
1029        int32_t   budget;
1030        int32_t   i;
1031
1032        if (nblock < 10000) {
1033                fallbackSort(s->arr1, s->arr2, ftab, nblock);
1034        } else {
1035                /* Calculate the location for quadrant, remembering to get
1036                 * the alignment right.  Assumes that &(block[0]) is at least
1037                 * 2-byte aligned -- this should be ok since block is really
1038                 * the first section of arr2.
1039                 */
1040                i = nblock + BZ_N_OVERSHOOT;
1041                if (i & 1) i++;
1042                quadrant = (uint16_t*)(&(block[i]));
1043
1044                /* (wfact-1) / 3 puts the default-factor-30
1045                 * transition point at very roughly the same place as
1046                 * with v0.1 and v0.9.0.
1047                 * Not that it particularly matters any more, since the
1048                 * resulting compressed stream is now the same regardless
1049                 * of whether or not we use the main sort or fallback sort.
1050                 */
1051                budget = nblock * ((wfact-1) / 3);
1052
1053                mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
1054                if (budget < 0) {
1055                        fallbackSort(s->arr1, s->arr2, ftab, nblock);
1056                }
1057        }
1058
1059        s->origPtr = -1;
1060        for (i = 0; i < s->nblock; i++)
1061                if (ptr[i] == 0) {
1062                        s->origPtr = i;
1063                        break;
1064                };
1065
1066        AssertH(s->origPtr != -1, 1003);
1067}
1068
1069
1070/*-------------------------------------------------------------*/
1071/*--- end                                       blocksort.c ---*/
1072/*-------------------------------------------------------------*/
1073