busybox/archival/libarchive/bz/blocksort.c
<<
>>
Prefs
   1/*
   2 * bzip2 is written by Julian Seward <jseward@bzip.org>.
   3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
   4 * See README and LICENSE files in this directory for more information.
   5 */
   6
   7/*-------------------------------------------------------------*/
   8/*--- Block sorting machinery                               ---*/
   9/*---                                           blocksort.c ---*/
  10/*-------------------------------------------------------------*/
  11
  12/* ------------------------------------------------------------------
  13This file is part of bzip2/libbzip2, a program and library for
  14lossless, block-sorting data compression.
  15
  16bzip2/libbzip2 version 1.0.4 of 20 December 2006
  17Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
  18
  19Please read the WARNING, DISCLAIMER and PATENTS sections in the
  20README file.
  21
  22This program is released under the terms of the license contained
  23in the file LICENSE.
  24------------------------------------------------------------------ */
  25
  26/* #include "bzlib_private.h" */
  27
  28#define mswap(zz1, zz2) \
  29{ \
  30        int32_t zztmp = zz1; \
  31        zz1 = zz2; \
  32        zz2 = zztmp; \
  33}
  34
  35static
  36/* No measurable speed gain with inlining */
  37/* ALWAYS_INLINE */
  38void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
  39{
  40        while (zzn > 0) {
  41                mswap(ptr[zzp1], ptr[zzp2]);
  42                zzp1++;
  43                zzp2++;
  44                zzn--;
  45        }
  46}
  47
  48static
  49ALWAYS_INLINE
  50int32_t mmin(int32_t a, int32_t b)
  51{
  52        return (a < b) ? a : b;
  53}
  54
  55
  56/*---------------------------------------------*/
  57/*--- Fallback O(N log(N)^2) sorting        ---*/
  58/*--- algorithm, for repetitive blocks      ---*/
  59/*---------------------------------------------*/
  60
  61/*---------------------------------------------*/
  62static
  63inline
  64void fallbackSimpleSort(uint32_t* fmap,
  65                uint32_t* eclass,
  66                int32_t   lo,
  67                int32_t   hi)
  68{
  69        int32_t i, j, tmp;
  70        uint32_t ec_tmp;
  71
  72        if (lo == hi) return;
  73
  74        if (hi - lo > 3) {
  75                for (i = hi-4; i >= lo; i--) {
  76                        tmp = fmap[i];
  77                        ec_tmp = eclass[tmp];
  78                        for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
  79                                fmap[j-4] = fmap[j];
  80                        fmap[j-4] = tmp;
  81                }
  82        }
  83
  84        for (i = hi-1; i >= lo; i--) {
  85                tmp = fmap[i];
  86                ec_tmp = eclass[tmp];
  87                for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
  88                        fmap[j-1] = fmap[j];
  89                fmap[j-1] = tmp;
  90        }
  91}
  92
  93
  94/*---------------------------------------------*/
  95#define fpush(lz,hz) { \
  96        stackLo[sp] = lz; \
  97        stackHi[sp] = hz; \
  98        sp++; \
  99}
 100
 101#define fpop(lz,hz) { \
 102        sp--; \
 103        lz = stackLo[sp]; \
 104        hz = stackHi[sp]; \
 105}
 106
 107#define FALLBACK_QSORT_SMALL_THRESH 10
 108#define FALLBACK_QSORT_STACK_SIZE   100
 109
 110static NOINLINE
 111void fallbackQSort3(uint32_t* fmap,
 112                uint32_t* eclass,
 113                int32_t   loSt,
 114                int32_t   hiSt)
 115{
 116        int32_t sp;
 117        uint32_t r;
 118        int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
 119        int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
 120
 121        r = 0;
 122
 123        sp = 0;
 124        fpush(loSt, hiSt);
 125
 126        while (sp > 0) {
 127                int32_t unLo, unHi, ltLo, gtHi, n, m;
 128                int32_t lo, hi;
 129                uint32_t med;
 130                uint32_t r3;
 131
 132                AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
 133
 134                fpop(lo, hi);
 135                if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 136                        fallbackSimpleSort(fmap, eclass, lo, hi);
 137                        continue;
 138                }
 139
 140                /* Random partitioning.  Median of 3 sometimes fails to
 141                 * avoid bad cases.  Median of 9 seems to help but
 142                 * looks rather expensive.  This too seems to work but
 143                 * is cheaper.  Guidance for the magic constants
 144                 * 7621 and 32768 is taken from Sedgewick's algorithms
 145                 * book, chapter 35.
 146                 */
 147                r = ((r * 7621) + 1) % 32768;
 148                r3 = r % 3;
 149                if (r3 == 0)
 150                        med = eclass[fmap[lo]];
 151                else if (r3 == 1)
 152                        med = eclass[fmap[(lo+hi)>>1]];
 153                else
 154                        med = eclass[fmap[hi]];
 155
 156                unLo = ltLo = lo;
 157                unHi = gtHi = hi;
 158
 159                while (1) {
 160                        while (1) {
 161                                if (unLo > unHi) break;
 162                                n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
 163                                if (n == 0) {
 164                                        mswap(fmap[unLo], fmap[ltLo]);
 165                                        ltLo++;
 166                                        unLo++;
 167                                        continue;
 168                                }
 169                                if (n > 0) break;
 170                                unLo++;
 171                        }
 172                        while (1) {
 173                                if (unLo > unHi) break;
 174                                n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
 175                                if (n == 0) {
 176                                        mswap(fmap[unHi], fmap[gtHi]);
 177                                        gtHi--; unHi--;
 178                                        continue;
 179                                }
 180                                if (n < 0) break;
 181                                unHi--;
 182                        }
 183                        if (unLo > unHi) break;
 184                        mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 185                }
 186
 187                AssertD(unHi == unLo-1, "fallbackQSort3(2)");
 188
 189                if (gtHi < ltLo) continue;
 190
 191                n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
 192                m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
 193
 194                n = lo + unLo - ltLo - 1;
 195                m = hi - (gtHi - unHi) + 1;
 196
 197                if (n - lo > hi - m) {
 198                        fpush(lo, n);
 199                        fpush(m, hi);
 200                } else {
 201                        fpush(m, hi);
 202                        fpush(lo, n);
 203                }
 204        }
 205}
 206
 207#undef fpush
 208#undef fpop
 209#undef FALLBACK_QSORT_SMALL_THRESH
 210#undef FALLBACK_QSORT_STACK_SIZE
 211
 212
 213/*---------------------------------------------*/
 214/* Pre:
 215 *      nblock > 0
 216 *      eclass exists for [0 .. nblock-1]
 217 *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
 218 *      ptr exists for [0 .. nblock-1]
 219 *
 220 * Post:
 221 *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
 222 *      All other areas of eclass destroyed
 223 *      fmap [0 .. nblock-1] holds sorted order
 224 *      bhtab[0 .. 2+(nblock/32)] destroyed
 225*/
 226
 227#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 228#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 229#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 230#define      WORD_BH(zz)  bhtab[(zz) >> 5]
 231#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 232
 233static
 234void fallbackSort(EState* state)
 235{
 236        int32_t ftab[257];
 237        int32_t ftabCopy[256];
 238        int32_t H, i, j, k, l, r, cc, cc1;
 239        int32_t nNotDone;
 240        int32_t nBhtab;
 241        /* params */
 242        uint32_t *const fmap    = state->arr1;
 243        uint32_t *const eclass  = state->arr2;
 244#define eclass8 ((uint8_t*)eclass)
 245        uint32_t *const bhtab   = state->ftab;
 246        const int32_t   nblock  = state->nblock;
 247
 248        /*
 249         * Initial 1-char radix sort to generate
 250         * initial fmap and initial BH bits.
 251         */
 252        for (i = 0; i < 257;    i++) ftab[i] = 0;
 253        for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 254        for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 255
 256        j = ftab[0];  /* bbox: optimized */
 257        for (i = 1; i < 257;    i++) {
 258                j += ftab[i];
 259                ftab[i] = j;
 260        }
 261
 262        for (i = 0; i < nblock; i++) {
 263                j = eclass8[i];
 264                k = ftab[j] - 1;
 265                ftab[j] = k;
 266                fmap[k] = i;
 267        }
 268
 269        nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
 270        for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 271        for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 272
 273        /*
 274         * Inductively refine the buckets.  Kind-of an
 275         * "exponential radix sort" (!), inspired by the
 276         * Manber-Myers suffix array construction algorithm.
 277         */
 278
 279        /*-- set sentinel bits for block-end detection --*/
 280        for (i = 0; i < 32; i++) {
 281                SET_BH(nblock + 2*i);
 282                CLEAR_BH(nblock + 2*i + 1);
 283        }
 284
 285        /*-- the log(N) loop --*/
 286        H = 1;
 287        while (1) {
 288                j = 0;
 289                for (i = 0; i < nblock; i++) {
 290                        if (ISSET_BH(i))
 291                                j = i;
 292                        k = fmap[i] - H;
 293                        if (k < 0)
 294                                k += nblock;
 295                        eclass[k] = j;
 296                }
 297
 298                nNotDone = 0;
 299                r = -1;
 300                while (1) {
 301
 302                /*-- find the next non-singleton bucket --*/
 303                        k = r + 1;
 304                        while (ISSET_BH(k) && UNALIGNED_BH(k))
 305                                k++;
 306                        if (ISSET_BH(k)) {
 307                                while (WORD_BH(k) == 0xffffffff) k += 32;
 308                                while (ISSET_BH(k)) k++;
 309                        }
 310                        l = k - 1;
 311                        if (l >= nblock)
 312                                break;
 313                        while (!ISSET_BH(k) && UNALIGNED_BH(k))
 314                                k++;
 315                        if (!ISSET_BH(k)) {
 316                                while (WORD_BH(k) == 0x00000000) k += 32;
 317                                while (!ISSET_BH(k)) k++;
 318                        }
 319                        r = k - 1;
 320                        if (r >= nblock)
 321                                break;
 322
 323                        /*-- now [l, r] bracket current bucket --*/
 324                        if (r > l) {
 325                                nNotDone += (r - l + 1);
 326                                fallbackQSort3(fmap, eclass, l, r);
 327
 328                                /*-- scan bucket and generate header bits-- */
 329                                cc = -1;
 330                                for (i = l; i <= r; i++) {
 331                                        cc1 = eclass[fmap[i]];
 332                                        if (cc != cc1) {
 333                                                SET_BH(i);
 334                                                cc = cc1;
 335                                        }
 336                                }
 337                        }
 338                }
 339
 340                H *= 2;
 341                if (H > nblock || nNotDone == 0)
 342                        break;
 343        }
 344
 345        /*
 346         * Reconstruct the original block in
 347         * eclass8 [0 .. nblock-1], since the
 348         * previous phase destroyed it.
 349         */
 350        j = 0;
 351        for (i = 0; i < nblock; i++) {
 352                while (ftabCopy[j] == 0)
 353                        j++;
 354                ftabCopy[j]--;
 355                eclass8[fmap[i]] = (uint8_t)j;
 356        }
 357        AssertH(j < 256, 1005);
 358#undef eclass8
 359}
 360
 361#undef       SET_BH
 362#undef     CLEAR_BH
 363#undef     ISSET_BH
 364#undef      WORD_BH
 365#undef UNALIGNED_BH
 366
 367
 368/*---------------------------------------------*/
 369/*--- The main, O(N^2 log(N)) sorting       ---*/
 370/*--- algorithm.  Faster for "normal"       ---*/
 371/*--- non-repetitive blocks.                ---*/
 372/*---------------------------------------------*/
 373
 374/*---------------------------------------------*/
 375static
 376NOINLINE
 377int mainGtU(EState* state,
 378                uint32_t  i1,
 379                uint32_t  i2)
 380{
 381        int32_t  k;
 382        uint8_t  c1, c2;
 383        uint16_t s1, s2;
 384
 385        uint8_t  *const block    = state->block;
 386        uint16_t *const quadrant = state->quadrant;
 387        const int32_t   nblock   = state->nblock;
 388
 389/* Loop unrolling here is actually very useful
 390 * (generated code is much simpler),
 391 * code size increase is only 270 bytes (i386)
 392 * but speeds up compression 10% overall
 393 */
 394
 395#if BZIP2_SPEED >= 1
 396
 397#define TIMES_8(code) \
 398        code; code; code; code; \
 399        code; code; code; code;
 400#define TIMES_12(code) \
 401        code; code; code; code; \
 402        code; code; code; code; \
 403        code; code; code; code;
 404
 405#else
 406
 407#define TIMES_8(code) \
 408{ \
 409        int nn = 8; \
 410        do { \
 411                code; \
 412        } while (--nn); \
 413}
 414#define TIMES_12(code) \
 415{ \
 416        int nn = 12; \
 417        do { \
 418                code; \
 419        } while (--nn); \
 420}
 421
 422#endif
 423
 424        AssertD(i1 != i2, "mainGtU");
 425        TIMES_12(
 426                c1 = block[i1]; c2 = block[i2];
 427                if (c1 != c2) return (c1 > c2);
 428                i1++; i2++;
 429        )
 430
 431        k = nblock + 8;
 432
 433        do {
 434                TIMES_8(
 435                        c1 = block[i1]; c2 = block[i2];
 436                        if (c1 != c2) return (c1 > c2);
 437                        s1 = quadrant[i1]; s2 = quadrant[i2];
 438                        if (s1 != s2) return (s1 > s2);
 439                        i1++; i2++;
 440                )
 441
 442                if (i1 >= nblock) i1 -= nblock;
 443                if (i2 >= nblock) i2 -= nblock;
 444
 445                state->budget--;
 446                k -= 8;
 447        } while (k >= 0);
 448
 449        return False;
 450}
 451#undef TIMES_8
 452#undef TIMES_12
 453
 454/*---------------------------------------------*/
 455/*
 456 * Knuth's increments seem to work better
 457 * than Incerpi-Sedgewick here.  Possibly
 458 * because the number of elems to sort is
 459 * usually small, typically <= 20.
 460 */
 461static
 462const uint32_t incs[14] ALIGN4 = {
 463        1, 4, 13, 40, 121, 364, 1093, 3280,
 464        9841, 29524, 88573, 265720,
 465        797161, 2391484
 466};
 467
 468static
 469void mainSimpleSort(EState* state,
 470                int32_t   lo,
 471                int32_t   hi,
 472                int32_t   d)
 473{
 474        uint32_t *const ptr = state->ptr;
 475
 476        /* At which increment to start? */
 477        int hp = 0;
 478        {
 479                int bigN = hi - lo;
 480                if (bigN <= 0)
 481                        return;
 482                while (incs[hp] <= bigN)
 483                        hp++;
 484                hp--;
 485        }
 486
 487        for (; hp >= 0; hp--) {
 488                int32_t i;
 489                unsigned h;
 490
 491                h = incs[hp];
 492                i = lo + h;
 493                while (1) {
 494                        unsigned j;
 495                        unsigned v;
 496
 497                        if (i > hi) break;
 498                        v = ptr[i];
 499                        j = i;
 500                        while (mainGtU(state, ptr[j-h]+d, v+d)) {
 501                                ptr[j] = ptr[j-h];
 502                                j = j - h;
 503                                if (j <= (lo + h - 1)) break;
 504                        }
 505                        ptr[j] = v;
 506                        i++;
 507
 508/* 1.5% overall speedup, +290 bytes */
 509#if BZIP2_SPEED >= 3
 510                        /*-- copy 2 --*/
 511                        if (i > hi) break;
 512                        v = ptr[i];
 513                        j = i;
 514                        while (mainGtU(state, ptr[j-h]+d, v+d)) {
 515                                ptr[j] = ptr[j-h];
 516                                j = j - h;
 517                                if (j <= (lo + h - 1)) break;
 518                        }
 519                        ptr[j] = v;
 520                        i++;
 521                        /*-- copy 3 --*/
 522                        if (i > hi) break;
 523                        v = ptr[i];
 524                        j = i;
 525                        while (mainGtU(state, ptr[j-h]+d, v+d)) {
 526                                ptr[j] = ptr[j-h];
 527                                j = j - h;
 528                                if (j <= (lo + h - 1)) break;
 529                        }
 530                        ptr[j] = v;
 531                        i++;
 532#endif
 533                        if (state->budget < 0) return;
 534                }
 535        }
 536}
 537
 538
 539/*---------------------------------------------*/
 540/*
 541 * The following is an implementation of
 542 * an elegant 3-way quicksort for strings,
 543 * described in a paper "Fast Algorithms for
 544 * Sorting and Searching Strings", by Robert
 545 * Sedgewick and Jon L. Bentley.
 546 */
 547
 548static
 549ALWAYS_INLINE
 550uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
 551{
 552        uint8_t t;
 553        if (a > b) {
 554                t = a;
 555                a = b;
 556                b = t;
 557        }
 558        /* here b >= a */
 559        if (b > c) {
 560                b = c;
 561                if (a > b)
 562                        b = a;
 563        }
 564        return b;
 565}
 566
 567#define mpush(lz,hz,dz) \
 568{ \
 569        stackLo[sp] = lz; \
 570        stackHi[sp] = hz; \
 571        stackD [sp] = dz; \
 572        sp++; \
 573}
 574
 575#define mpop(lz,hz,dz) \
 576{ \
 577        sp--; \
 578        lz = stackLo[sp]; \
 579        hz = stackHi[sp]; \
 580        dz = stackD [sp]; \
 581}
 582
 583#define mnextsize(az) (nextHi[az] - nextLo[az])
 584
 585#define mnextswap(az,bz) \
 586{ \
 587        int32_t tz; \
 588        tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 589        tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 590        tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
 591}
 592
 593#define MAIN_QSORT_SMALL_THRESH 20
 594#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 595#define MAIN_QSORT_STACK_SIZE   100
 596
 597static NOINLINE
 598void mainQSort3(EState* state,
 599                int32_t   loSt,
 600                int32_t   hiSt
 601                /*int32_t dSt*/)
 602{
 603        enum { dSt = BZ_N_RADIX };
 604        int32_t unLo, unHi, ltLo, gtHi, n, m, med;
 605        int32_t sp, lo, hi, d;
 606
 607        int32_t stackLo[MAIN_QSORT_STACK_SIZE];
 608        int32_t stackHi[MAIN_QSORT_STACK_SIZE];
 609        int32_t stackD [MAIN_QSORT_STACK_SIZE];
 610
 611        int32_t nextLo[3];
 612        int32_t nextHi[3];
 613        int32_t nextD [3];
 614
 615        uint32_t *const ptr   = state->ptr;
 616        uint8_t  *const block = state->block;
 617
 618        sp = 0;
 619        mpush(loSt, hiSt, dSt);
 620
 621        while (sp > 0) {
 622                AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
 623
 624                mpop(lo, hi, d);
 625                if (hi - lo < MAIN_QSORT_SMALL_THRESH
 626                 || d > MAIN_QSORT_DEPTH_THRESH
 627                ) {
 628                        mainSimpleSort(state, lo, hi, d);
 629                        if (state->budget < 0)
 630                                return;
 631                        continue;
 632                }
 633                med = (int32_t) mmed3(block[ptr[lo          ] + d],
 634                                      block[ptr[hi          ] + d],
 635                                      block[ptr[(lo+hi) >> 1] + d]);
 636
 637                unLo = ltLo = lo;
 638                unHi = gtHi = hi;
 639
 640                while (1) {
 641                        while (1) {
 642                                if (unLo > unHi)
 643                                        break;
 644                                n = ((int32_t)block[ptr[unLo]+d]) - med;
 645                                if (n == 0) {
 646                                        mswap(ptr[unLo], ptr[ltLo]);
 647                                        ltLo++;
 648                                        unLo++;
 649                                        continue;
 650                                }
 651                                if (n > 0) break;
 652                                unLo++;
 653                        }
 654                        while (1) {
 655                                if (unLo > unHi)
 656                                        break;
 657                                n = ((int32_t)block[ptr[unHi]+d]) - med;
 658                                if (n == 0) {
 659                                        mswap(ptr[unHi], ptr[gtHi]);
 660                                        gtHi--;
 661                                        unHi--;
 662                                        continue;
 663                                }
 664                                if (n < 0) break;
 665                                unHi--;
 666                        }
 667                        if (unLo > unHi)
 668                                break;
 669                        mswap(ptr[unLo], ptr[unHi]);
 670                        unLo++;
 671                        unHi--;
 672                }
 673
 674                AssertD(unHi == unLo-1, "mainQSort3(2)");
 675
 676                if (gtHi < ltLo) {
 677                        mpush(lo, hi, d + 1);
 678                        continue;
 679                }
 680
 681                n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
 682                m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
 683
 684                n = lo + unLo - ltLo - 1;
 685                m = hi - (gtHi - unHi) + 1;
 686
 687                nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 688                nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 689                nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 690
 691                if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
 692                if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
 693                if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
 694
 695                AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
 696                AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
 697
 698                mpush(nextLo[0], nextHi[0], nextD[0]);
 699                mpush(nextLo[1], nextHi[1], nextD[1]);
 700                mpush(nextLo[2], nextHi[2], nextD[2]);
 701        }
 702}
 703
 704#undef mpush
 705#undef mpop
 706#undef mnextsize
 707#undef mnextswap
 708#undef MAIN_QSORT_SMALL_THRESH
 709#undef MAIN_QSORT_DEPTH_THRESH
 710#undef MAIN_QSORT_STACK_SIZE
 711
 712
 713/*---------------------------------------------*/
 714/* Pre:
 715 *      nblock > N_OVERSHOOT
 716 *      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 717 *      ((uint8_t*)block32) [0 .. nblock-1] holds block
 718 *      ptr exists for [0 .. nblock-1]
 719 *
 720 * Post:
 721 *      ((uint8_t*)block32) [0 .. nblock-1] holds block
 722 *      All other areas of block32 destroyed
 723 *      ftab[0 .. 65536] destroyed
 724 *      ptr [0 .. nblock-1] holds sorted order
 725 *      if (*budget < 0), sorting was abandoned
 726 */
 727
 728#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 729#define SETMASK (1 << 21)
 730#define CLEARMASK (~(SETMASK))
 731
 732static NOINLINE
 733void mainSort(EState* state)
 734{
 735        int32_t  i, j;
 736        Bool     bigDone[256];
 737        uint8_t  runningOrder[256];
 738        /* bbox: moved to EState to save stack
 739        int32_t  copyStart[256];
 740        int32_t  copyEnd  [256];
 741        */
 742#define copyStart    (state->mainSort__copyStart)
 743#define copyEnd      (state->mainSort__copyEnd)
 744
 745        uint32_t *const ptr      = state->ptr;
 746        uint8_t  *const block    = state->block;
 747        uint32_t *const ftab     = state->ftab;
 748        const int32_t   nblock   = state->nblock;
 749        uint16_t *const quadrant = state->quadrant;
 750
 751        /*-- set up the 2-byte frequency table --*/
 752        /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
 753        memset(ftab, 0, 65537 * sizeof(ftab[0]));
 754
 755        j = block[0] << 8;
 756        i = nblock - 1;
 757/* 3%, +300 bytes */
 758#if BZIP2_SPEED >= 2
 759        for (; i >= 3; i -= 4) {
 760                quadrant[i] = 0;
 761                j = (j >> 8) | (((unsigned)block[i]) << 8);
 762                ftab[j]++;
 763                quadrant[i-1] = 0;
 764                j = (j >> 8) | (((unsigned)block[i-1]) << 8);
 765                ftab[j]++;
 766                quadrant[i-2] = 0;
 767                j = (j >> 8) | (((unsigned)block[i-2]) << 8);
 768                ftab[j]++;
 769                quadrant[i-3] = 0;
 770                j = (j >> 8) | (((unsigned)block[i-3]) << 8);
 771                ftab[j]++;
 772        }
 773#endif
 774        for (; i >= 0; i--) {
 775                quadrant[i] = 0;
 776                j = (j >> 8) | (((unsigned)block[i]) << 8);
 777                ftab[j]++;
 778        }
 779
 780        /*-- (emphasises close relationship of block & quadrant) --*/
 781        for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 782                block   [nblock+i] = block[i];
 783                quadrant[nblock+i] = 0;
 784        }
 785
 786        /*-- Complete the initial radix sort --*/
 787        j = ftab[0]; /* bbox: optimized */
 788        for (i = 1; i <= 65536; i++) {
 789                j += ftab[i];
 790                ftab[i] = j;
 791        }
 792
 793        {
 794                unsigned s;
 795                s = block[0] << 8;
 796                i = nblock - 1;
 797#if BZIP2_SPEED >= 2
 798                for (; i >= 3; i -= 4) {
 799                        s = (s >> 8) | (block[i] << 8);
 800                        j = ftab[s] - 1;
 801                        ftab[s] = j;
 802                        ptr[j] = i;
 803                        s = (s >> 8) | (block[i-1] << 8);
 804                        j = ftab[s] - 1;
 805                        ftab[s] = j;
 806                        ptr[j] = i-1;
 807                        s = (s >> 8) | (block[i-2] << 8);
 808                        j = ftab[s] - 1;
 809                        ftab[s] = j;
 810                        ptr[j] = i-2;
 811                        s = (s >> 8) | (block[i-3] << 8);
 812                        j = ftab[s] - 1;
 813                        ftab[s] = j;
 814                        ptr[j] = i-3;
 815                }
 816#endif
 817                for (; i >= 0; i--) {
 818                        s = (s >> 8) | (block[i] << 8);
 819                        j = ftab[s] - 1;
 820                        ftab[s] = j;
 821                        ptr[j] = i;
 822                }
 823        }
 824
 825        /*
 826         * Now ftab contains the first loc of every small bucket.
 827         * Calculate the running order, from smallest to largest
 828         * big bucket.
 829         */
 830        for (i = 0; i <= 255; i++) {
 831                bigDone     [i] = False;
 832                runningOrder[i] = i;
 833        }
 834
 835        {
 836                /* bbox: was: int32_t h = 1; */
 837                /* do h = 3 * h + 1; while (h <= 256); */
 838                unsigned h = 364;
 839
 840                do {
 841                        /*h = h / 3;*/
 842                        h = (h * 171) >> 9; /* bbox: fast h/3 */
 843                        for (i = h; i <= 255; i++) {
 844                                unsigned vv, jh;
 845                                vv = runningOrder[i]; /* uint8[] */
 846                                j = i;
 847                                while (jh = j - h, BIGFREQ(runningOrder[jh]) > BIGFREQ(vv)) {
 848                                        runningOrder[j] = runningOrder[jh];
 849                                        j = jh;
 850                                        if (j < h)
 851                                                break;
 852                                }
 853                                runningOrder[j] = vv;
 854                        }
 855                } while (h != 1);
 856        }
 857
 858        /*
 859         * The main sorting loop.
 860         */
 861
 862        for (i = 0; /*i <= 255*/; i++) {
 863                unsigned ss;
 864
 865                /*
 866                 * Process big buckets, starting with the least full.
 867                 * Basically this is a 3-step process in which we call
 868                 * mainQSort3 to sort the small buckets [ss, j], but
 869                 * also make a big effort to avoid the calls if we can.
 870                 */
 871                ss = runningOrder[i];
 872
 873                /*
 874                 * Step 1:
 875                 * Complete the big bucket [ss] by quicksorting
 876                 * any unsorted small buckets [ss, j], for j != ss.
 877                 * Hopefully previous pointer-scanning phases have already
 878                 * completed many of the small buckets [ss, j], so
 879                 * we don't have to sort them at all.
 880                 */
 881                for (j = 0; j <= 255; j++) {
 882                        if (j != ss) {
 883                                unsigned sb;
 884                                sb = (ss << 8) + j;
 885                                if (!(ftab[sb] & SETMASK)) {
 886                                        int32_t lo =  ftab[sb] /*& CLEARMASK (redundant)*/;
 887                                        int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
 888                                        if (hi > lo) {
 889                                                mainQSort3(state, lo, hi /*,BZ_N_RADIX*/);
 890                                                if (state->budget < 0) return;
 891                                        }
 892                                }
 893                                ftab[sb] |= SETMASK;
 894                        }
 895                }
 896
 897                AssertH(!bigDone[ss], 1006);
 898
 899                /*
 900                 * Step 2:
 901                 * Now scan this big bucket [ss] so as to synthesise the
 902                 * sorted order for small buckets [t, ss] for all t,
 903                 * including, magically, the bucket [ss,ss] too.
 904                 * This will avoid doing Real Work in subsequent Step 1's.
 905                 */
 906                {
 907                        for (j = 0; j <= 255; j++) {
 908                                copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 909                                copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 910                        }
 911                        for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 912                                unsigned c1;
 913                                int32_t k;
 914                                k = ptr[j] - 1;
 915                                if (k < 0)
 916                                        k += nblock;
 917                                c1 = block[k];
 918                                if (!bigDone[c1])
 919                                        ptr[copyStart[c1]++] = k;
 920                        }
 921                        for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 922                                unsigned c1;
 923                                int32_t k;
 924                                k = ptr[j]-1;
 925                                if (k < 0)
 926                                        k += nblock;
 927                                c1 = block[k];
 928                                if (!bigDone[c1])
 929                                        ptr[copyEnd[c1]--] = k;
 930                        }
 931                }
 932
 933                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 934                 * Necessity for this case is demonstrated by compressing
 935                 * a sequence of approximately 48.5 million of character
 936                 * 251; 1.0.0/1.0.1 will then die here. */
 937                AssertH((copyStart[ss]-1 == copyEnd[ss]) \
 938                     || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
 939
 940                for (j = 0; j <= 255; j++)
 941                        ftab[(j << 8) + ss] |= SETMASK;
 942
 943                if (i == 255)
 944                        break;
 945
 946                /*
 947                 * Step 3:
 948                 * The [ss] big bucket is now done.  Record this fact,
 949                 * and update the quadrant descriptors.  Remember to
 950                 * update quadrants in the overshoot area too, if
 951                 * necessary.  The "if (i < 255)" test merely skips
 952                 * this updating for the last bucket processed, since
 953                 * updating for the last bucket is pointless.
 954                 *
 955                 * The quadrant array provides a way to incrementally
 956                 * cache sort orderings, as they appear, so as to
 957                 * make subsequent comparisons in fullGtU() complete
 958                 * faster.  For repetitive blocks this makes a big
 959                 * difference (but not big enough to be able to avoid
 960                 * the fallback sorting mechanism, exponential radix sort).
 961                 *
 962                 * The precise meaning is: at all times:
 963                 *
 964                 *      for 0 <= i < nblock and 0 <= j <= nblock
 965                 *
 966                 *      if block[i] != block[j],
 967                 *
 968                 *              then the relative values of quadrant[i] and
 969                 *                        quadrant[j] are meaningless.
 970                 *
 971                 *              else {
 972                 *                      if quadrant[i] < quadrant[j]
 973                 *                              then the string starting at i lexicographically
 974                 *                              precedes the string starting at j
 975                 *
 976                 *                      else if quadrant[i] > quadrant[j]
 977                 *                              then the string starting at j lexicographically
 978                 *                              precedes the string starting at i
 979                 *
 980                 *                      else
 981                 *                              the relative ordering of the strings starting
 982                 *                              at i and j has not yet been determined.
 983                 *              }
 984                 */
 985                bigDone[ss] = True;
 986
 987                {
 988                        unsigned bbStart = ftab[ss << 8] & CLEARMASK;
 989                        unsigned bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
 990                        unsigned shifts  = 0;
 991
 992                        while ((bbSize >> shifts) > 65534) shifts++;
 993
 994                        for (j = bbSize-1; j >= 0; j--) {
 995                                unsigned a2update  = ptr[bbStart + j]; /* uint32[] */
 996                                uint16_t qVal      = (uint16_t)(j >> shifts);
 997                                quadrant[a2update] = qVal;
 998                                if (a2update < BZ_N_OVERSHOOT)
 999                                        quadrant[a2update + nblock] = qVal;
1000                        }
1001                        AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
1002                }
1003        }
1004#undef runningOrder
1005#undef copyStart
1006#undef copyEnd
1007}
1008
1009#undef BIGFREQ
1010#undef SETMASK
1011#undef CLEARMASK
1012
1013
1014/*---------------------------------------------*/
1015/* Pre:
1016 *      nblock > 0
1017 *      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1018 *        ((uint8_t*)arr2)[0 .. nblock-1] holds block
1019 *      arr1 exists for [0 .. nblock-1]
1020 *
1021 * Post:
1022 *      ((uint8_t*)arr2) [0 .. nblock-1] holds block
1023 *      All other areas of block destroyed
1024 *      ftab[0 .. 65536] destroyed
1025 *      arr1[0 .. nblock-1] holds sorted order
1026 */
1027static NOINLINE
1028int32_t BZ2_blockSort(EState* state)
1029{
1030        /* In original bzip2 1.0.4, it's a parameter, but 30
1031         * (which was the default) should work ok. */
1032        enum { wfact = 30 };
1033        unsigned i;
1034        int32_t origPtr = origPtr;
1035
1036        if (state->nblock >= 10000) {
1037                /* Calculate the location for quadrant, remembering to get
1038                 * the alignment right.  Assumes that &(block[0]) is at least
1039                 * 2-byte aligned -- this should be ok since block is really
1040                 * the first section of arr2.
1041                 */
1042                i = state->nblock + BZ_N_OVERSHOOT;
1043                if (i & 1)
1044                        i++;
1045                state->quadrant = (uint16_t*) &(state->block[i]);
1046
1047                /* (wfact-1) / 3 puts the default-factor-30
1048                 * transition point at very roughly the same place as
1049                 * with v0.1 and v0.9.0.
1050                 * Not that it particularly matters any more, since the
1051                 * resulting compressed stream is now the same regardless
1052                 * of whether or not we use the main sort or fallback sort.
1053                 */
1054                state->budget = state->nblock * ((wfact-1) / 3);
1055                mainSort(state);
1056                if (state->budget >= 0)
1057                        goto good;
1058        }
1059        fallbackSort(state);
1060 good:
1061
1062#if BZ_LIGHT_DEBUG
1063        origPtr = -1;
1064#endif
1065        for (i = 0; i < state->nblock; i++) {
1066                if (state->ptr[i] == 0) {
1067                        origPtr = i;
1068                        break;
1069                }
1070        }
1071
1072        AssertH(origPtr != -1, 1003);
1073        return origPtr;
1074}
1075
1076
1077/*-------------------------------------------------------------*/
1078/*--- end                                       blocksort.c ---*/
1079/*-------------------------------------------------------------*/
1080