uboot/lib/bzip2/bzlib_blocksort.c
<<
>>
Prefs
   1
   2/*-------------------------------------------------------------*/
   3/*--- Block sorting machinery                               ---*/
   4/*---                                           blocksort.c ---*/
   5/*-------------------------------------------------------------*/
   6
   7/*--
   8  This file is a part of bzip2 and/or libbzip2, a program and
   9  library for lossless, block-sorting data compression.
  10
  11  Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
  12
  13  Redistribution and use in source and binary forms, with or without
  14  modification, are permitted provided that the following conditions
  15  are met:
  16
  17  1. Redistributions of source code must retain the above copyright
  18     notice, this list of conditions and the following disclaimer.
  19
  20  2. The origin of this software must not be misrepresented; you must
  21     not claim that you wrote the original software.  If you use this
  22     software in a product, an acknowledgment in the product
  23     documentation would be appreciated but is not required.
  24
  25  3. Altered source versions must be plainly marked as such, and must
  26     not be misrepresented as being the original software.
  27
  28  4. The name of the author may not be used to endorse or promote
  29     products derived from this software without specific prior written
  30     permission.
  31
  32  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  33  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44  Julian Seward, Cambridge, UK.
  45  jseward@acm.org
  46  bzip2/libbzip2 version 1.0.6 of 6 September 2010
  47  Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
  48
  49  This program is based on (at least) the work of:
  50     Mike Burrows
  51     David Wheeler
  52     Peter Fenwick
  53     Alistair Moffat
  54     Radford Neal
  55     Ian H. Witten
  56     Robert Sedgewick
  57     Jon L. Bentley
  58
  59  For more information on these sources, see the manual.
  60--*/
  61
  62#include "bzlib_private.h"
  63
  64/*---------------------------------------------*/
  65/*--- Fallback O(N log(N)^2) sorting        ---*/
  66/*--- algorithm, for repetitive blocks      ---*/
  67/*---------------------------------------------*/
  68
  69/*---------------------------------------------*/
  70static 
  71__inline__
  72void fallbackSimpleSort ( UInt32* fmap, 
  73                          UInt32* eclass, 
  74                          Int32   lo, 
  75                          Int32   hi )
  76{
  77   Int32 i, j, tmp;
  78   UInt32 ec_tmp;
  79
  80   if (lo == hi) return;
  81
  82   if (hi - lo > 3) {
  83      for ( i = hi-4; i >= lo; i-- ) {
  84         tmp = fmap[i];
  85         ec_tmp = eclass[tmp];
  86         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  87            fmap[j-4] = fmap[j];
  88         fmap[j-4] = tmp;
  89      }
  90   }
  91
  92   for ( i = hi-1; i >= lo; i-- ) {
  93      tmp = fmap[i];
  94      ec_tmp = eclass[tmp];
  95      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
  96         fmap[j-1] = fmap[j];
  97      fmap[j-1] = tmp;
  98   }
  99}
 100
 101
 102/*---------------------------------------------*/
 103#define fswap(zz1, zz2) \
 104   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 105
 106#define fvswap(zzp1, zzp2, zzn)       \
 107{                                     \
 108   Int32 yyp1 = (zzp1);               \
 109   Int32 yyp2 = (zzp2);               \
 110   Int32 yyn  = (zzn);                \
 111   while (yyn > 0) {                  \
 112      fswap(fmap[yyp1], fmap[yyp2]);  \
 113      yyp1++; yyp2++; yyn--;          \
 114   }                                  \
 115}
 116
 117
 118#define fmin(a,b) ((a) < (b)) ? (a) : (b)
 119
 120#define fpush(lz,hz) { stackLo[sp] = lz; \
 121                       stackHi[sp] = hz; \
 122                       sp++; }
 123
 124#define fpop(lz,hz) { sp--;              \
 125                      lz = stackLo[sp];  \
 126                      hz = stackHi[sp]; }
 127
 128#define FALLBACK_QSORT_SMALL_THRESH 10
 129#define FALLBACK_QSORT_STACK_SIZE   100
 130
 131
 132static
 133void fallbackQSort3 ( UInt32* fmap, 
 134                      UInt32* eclass,
 135                      Int32   loSt, 
 136                      Int32   hiSt )
 137{
 138   Int32 unLo, unHi, ltLo, gtHi, n, m;
 139   Int32 sp, lo, hi;
 140   UInt32 med, r, r3;
 141   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 142   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 143
 144   r = 0;
 145
 146   sp = 0;
 147   fpush ( loSt, hiSt );
 148
 149   while (sp > 0) {
 150
 151      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
 152
 153      fpop ( lo, hi );
 154      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 155         fallbackSimpleSort ( fmap, eclass, lo, hi );
 156         continue;
 157      }
 158
 159      /* Random partitioning.  Median of 3 sometimes fails to
 160         avoid bad cases.  Median of 9 seems to help but 
 161         looks rather expensive.  This too seems to work but
 162         is cheaper.  Guidance for the magic constants 
 163         7621 and 32768 is taken from Sedgewick's algorithms
 164         book, chapter 35.
 165      */
 166      r = ((r * 7621) + 1) % 32768;
 167      r3 = r % 3;
 168      if (r3 == 0) med = eclass[fmap[lo]]; else
 169      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 170                   med = eclass[fmap[hi]];
 171
 172      unLo = ltLo = lo;
 173      unHi = gtHi = hi;
 174
 175      while (1) {
 176         while (1) {
 177            if (unLo > unHi) break;
 178            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 179            if (n == 0) { 
 180               fswap(fmap[unLo], fmap[ltLo]); 
 181               ltLo++; unLo++; 
 182               continue; 
 183            };
 184            if (n > 0) break;
 185            unLo++;
 186         }
 187         while (1) {
 188            if (unLo > unHi) break;
 189            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 190            if (n == 0) { 
 191               fswap(fmap[unHi], fmap[gtHi]); 
 192               gtHi--; unHi--; 
 193               continue; 
 194            };
 195            if (n < 0) break;
 196            unHi--;
 197         }
 198         if (unLo > unHi) break;
 199         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 200      }
 201
 202      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 203
 204      if (gtHi < ltLo) continue;
 205
 206      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 207      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 208
 209      n = lo + unLo - ltLo - 1;
 210      m = hi - (gtHi - unHi) + 1;
 211
 212      if (n - lo > hi - m) {
 213         fpush ( lo, n );
 214         fpush ( m, hi );
 215      } else {
 216         fpush ( m, hi );
 217         fpush ( lo, n );
 218      }
 219   }
 220}
 221
 222#undef fmin
 223#undef fpush
 224#undef fpop
 225#undef fswap
 226#undef fvswap
 227#undef FALLBACK_QSORT_SMALL_THRESH
 228#undef FALLBACK_QSORT_STACK_SIZE
 229
 230
 231/*---------------------------------------------*/
 232/* Pre:
 233      nblock > 0
 234      eclass exists for [0 .. nblock-1]
 235      ((UChar*)eclass) [0 .. nblock-1] holds block
 236      ptr exists for [0 .. nblock-1]
 237
 238   Post:
 239      ((UChar*)eclass) [0 .. nblock-1] holds block
 240      All other areas of eclass destroyed
 241      fmap [0 .. nblock-1] holds sorted order
 242      bhtab [ 0 .. 2+(nblock/32) ] destroyed
 243*/
 244
 245#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 246#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 247#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 248#define      WORD_BH(zz)  bhtab[(zz) >> 5]
 249#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 250
 251static
 252void fallbackSort ( UInt32* fmap, 
 253                    UInt32* eclass, 
 254                    UInt32* bhtab,
 255                    Int32   nblock,
 256                    Int32   verb )
 257{
 258   Int32 ftab[257];
 259   Int32 ftabCopy[256];
 260   Int32 H, i, j, k, l, r, cc, cc1;
 261   Int32 nNotDone;
 262   Int32 nBhtab;
 263   UChar* eclass8 = (UChar*)eclass;
 264
 265   /*--
 266      Initial 1-char radix sort to generate
 267      initial fmap and initial BH bits.
 268   --*/
 269   if (verb >= 4)
 270      VPrintf0 ( "        bucket sorting ...\n" );
 271   for (i = 0; i < 257;    i++) ftab[i] = 0;
 272   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 273   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 274   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 275
 276   for (i = 0; i < nblock; i++) {
 277      j = eclass8[i];
 278      k = ftab[j] - 1;
 279      ftab[j] = k;
 280      fmap[k] = i;
 281   }
 282
 283   nBhtab = 2 + (nblock / 32);
 284   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 285   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 286
 287   /*--
 288      Inductively refine the buckets.  Kind-of an
 289      "exponential radix sort" (!), inspired by the
 290      Manber-Myers suffix array construction algorithm.
 291   --*/
 292
 293   /*-- set sentinel bits for block-end detection --*/
 294   for (i = 0; i < 32; i++) { 
 295      SET_BH(nblock + 2*i);
 296      CLEAR_BH(nblock + 2*i + 1);
 297   }
 298
 299   /*-- the log(N) loop --*/
 300   H = 1;
 301   while (1) {
 302
 303      if (verb >= 4) 
 304         VPrintf1 ( "        depth %6d has ", H );
 305
 306      j = 0;
 307      for (i = 0; i < nblock; i++) {
 308         if (ISSET_BH(i)) j = i;
 309         k = fmap[i] - H; if (k < 0) k += nblock;
 310         eclass[k] = j;
 311      }
 312
 313      nNotDone = 0;
 314      r = -1;
 315      while (1) {
 316
 317         /*-- find the next non-singleton bucket --*/
 318         k = r + 1;
 319         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 320         if (ISSET_BH(k)) {
 321            while (WORD_BH(k) == 0xffffffff) k += 32;
 322            while (ISSET_BH(k)) k++;
 323         }
 324         l = k - 1;
 325         if (l >= nblock) break;
 326         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 327         if (!ISSET_BH(k)) {
 328            while (WORD_BH(k) == 0x00000000) k += 32;
 329            while (!ISSET_BH(k)) k++;
 330         }
 331         r = k - 1;
 332         if (r >= nblock) break;
 333
 334         /*-- now [l, r] bracket current bucket --*/
 335         if (r > l) {
 336            nNotDone += (r - l + 1);
 337            fallbackQSort3 ( fmap, eclass, l, r );
 338
 339            /*-- scan bucket and generate header bits-- */
 340            cc = -1;
 341            for (i = l; i <= r; i++) {
 342               cc1 = eclass[fmap[i]];
 343               if (cc != cc1) { SET_BH(i); cc = cc1; };
 344            }
 345         }
 346      }
 347
 348      if (verb >= 4) 
 349         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 350
 351      H *= 2;
 352      if (H > nblock || nNotDone == 0) break;
 353   }
 354
 355   /*-- 
 356      Reconstruct the original block in
 357      eclass8 [0 .. nblock-1], since the
 358      previous phase destroyed it.
 359   --*/
 360   if (verb >= 4)
 361      VPrintf0 ( "        reconstructing block ...\n" );
 362   j = 0;
 363   for (i = 0; i < nblock; i++) {
 364      while (ftabCopy[j] == 0) j++;
 365      ftabCopy[j]--;
 366      eclass8[fmap[i]] = (UChar)j;
 367   }
 368   AssertH ( j < 256, 1005 );
 369}
 370
 371#undef       SET_BH
 372#undef     CLEAR_BH
 373#undef     ISSET_BH
 374#undef      WORD_BH
 375#undef UNALIGNED_BH
 376
 377
 378/*---------------------------------------------*/
 379/*--- The main, O(N^2 log(N)) sorting       ---*/
 380/*--- algorithm.  Faster for "normal"       ---*/
 381/*--- non-repetitive blocks.                ---*/
 382/*---------------------------------------------*/
 383
 384/*---------------------------------------------*/
 385static
 386__inline__
 387Bool mainGtU ( UInt32  i1, 
 388               UInt32  i2,
 389               UChar*  block, 
 390               UInt16* quadrant,
 391               UInt32  nblock,
 392               Int32*  budget )
 393{
 394   Int32  k;
 395   UChar  c1, c2;
 396   UInt16 s1, s2;
 397
 398   AssertD ( i1 != i2, "mainGtU" );
 399   /* 1 */
 400   c1 = block[i1]; c2 = block[i2];
 401   if (c1 != c2) return (c1 > c2);
 402   i1++; i2++;
 403   /* 2 */
 404   c1 = block[i1]; c2 = block[i2];
 405   if (c1 != c2) return (c1 > c2);
 406   i1++; i2++;
 407   /* 3 */
 408   c1 = block[i1]; c2 = block[i2];
 409   if (c1 != c2) return (c1 > c2);
 410   i1++; i2++;
 411   /* 4 */
 412   c1 = block[i1]; c2 = block[i2];
 413   if (c1 != c2) return (c1 > c2);
 414   i1++; i2++;
 415   /* 5 */
 416   c1 = block[i1]; c2 = block[i2];
 417   if (c1 != c2) return (c1 > c2);
 418   i1++; i2++;
 419   /* 6 */
 420   c1 = block[i1]; c2 = block[i2];
 421   if (c1 != c2) return (c1 > c2);
 422   i1++; i2++;
 423   /* 7 */
 424   c1 = block[i1]; c2 = block[i2];
 425   if (c1 != c2) return (c1 > c2);
 426   i1++; i2++;
 427   /* 8 */
 428   c1 = block[i1]; c2 = block[i2];
 429   if (c1 != c2) return (c1 > c2);
 430   i1++; i2++;
 431   /* 9 */
 432   c1 = block[i1]; c2 = block[i2];
 433   if (c1 != c2) return (c1 > c2);
 434   i1++; i2++;
 435   /* 10 */
 436   c1 = block[i1]; c2 = block[i2];
 437   if (c1 != c2) return (c1 > c2);
 438   i1++; i2++;
 439   /* 11 */
 440   c1 = block[i1]; c2 = block[i2];
 441   if (c1 != c2) return (c1 > c2);
 442   i1++; i2++;
 443   /* 12 */
 444   c1 = block[i1]; c2 = block[i2];
 445   if (c1 != c2) return (c1 > c2);
 446   i1++; i2++;
 447
 448   k = nblock + 8;
 449
 450   do {
 451      /* 1 */
 452      c1 = block[i1]; c2 = block[i2];
 453      if (c1 != c2) return (c1 > c2);
 454      s1 = quadrant[i1]; s2 = quadrant[i2];
 455      if (s1 != s2) return (s1 > s2);
 456      i1++; i2++;
 457      /* 2 */
 458      c1 = block[i1]; c2 = block[i2];
 459      if (c1 != c2) return (c1 > c2);
 460      s1 = quadrant[i1]; s2 = quadrant[i2];
 461      if (s1 != s2) return (s1 > s2);
 462      i1++; i2++;
 463      /* 3 */
 464      c1 = block[i1]; c2 = block[i2];
 465      if (c1 != c2) return (c1 > c2);
 466      s1 = quadrant[i1]; s2 = quadrant[i2];
 467      if (s1 != s2) return (s1 > s2);
 468      i1++; i2++;
 469      /* 4 */
 470      c1 = block[i1]; c2 = block[i2];
 471      if (c1 != c2) return (c1 > c2);
 472      s1 = quadrant[i1]; s2 = quadrant[i2];
 473      if (s1 != s2) return (s1 > s2);
 474      i1++; i2++;
 475      /* 5 */
 476      c1 = block[i1]; c2 = block[i2];
 477      if (c1 != c2) return (c1 > c2);
 478      s1 = quadrant[i1]; s2 = quadrant[i2];
 479      if (s1 != s2) return (s1 > s2);
 480      i1++; i2++;
 481      /* 6 */
 482      c1 = block[i1]; c2 = block[i2];
 483      if (c1 != c2) return (c1 > c2);
 484      s1 = quadrant[i1]; s2 = quadrant[i2];
 485      if (s1 != s2) return (s1 > s2);
 486      i1++; i2++;
 487      /* 7 */
 488      c1 = block[i1]; c2 = block[i2];
 489      if (c1 != c2) return (c1 > c2);
 490      s1 = quadrant[i1]; s2 = quadrant[i2];
 491      if (s1 != s2) return (s1 > s2);
 492      i1++; i2++;
 493      /* 8 */
 494      c1 = block[i1]; c2 = block[i2];
 495      if (c1 != c2) return (c1 > c2);
 496      s1 = quadrant[i1]; s2 = quadrant[i2];
 497      if (s1 != s2) return (s1 > s2);
 498      i1++; i2++;
 499
 500      if (i1 >= nblock) i1 -= nblock;
 501      if (i2 >= nblock) i2 -= nblock;
 502
 503      k -= 8;
 504      (*budget)--;
 505   }
 506      while (k >= 0);
 507
 508   return False;
 509}
 510
 511
 512/*---------------------------------------------*/
 513/*--
 514   Knuth's increments seem to work better
 515   than Incerpi-Sedgewick here.  Possibly
 516   because the number of elems to sort is
 517   usually small, typically <= 20.
 518--*/
 519static
 520Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 521                   9841, 29524, 88573, 265720,
 522                   797161, 2391484 };
 523
 524static
 525void mainSimpleSort ( UInt32* ptr,
 526                      UChar*  block,
 527                      UInt16* quadrant,
 528                      Int32   nblock,
 529                      Int32   lo, 
 530                      Int32   hi, 
 531                      Int32   d,
 532                      Int32*  budget )
 533{
 534   Int32 i, j, h, bigN, hp;
 535   UInt32 v;
 536
 537   bigN = hi - lo + 1;
 538   if (bigN < 2) return;
 539
 540   hp = 0;
 541   while (incs[hp] < bigN) hp++;
 542   hp--;
 543
 544   for (; hp >= 0; hp--) {
 545      h = incs[hp];
 546
 547      i = lo + h;
 548      while (True) {
 549
 550         /*-- copy 1 --*/
 551         if (i > hi) break;
 552         v = ptr[i];
 553         j = i;
 554         while ( mainGtU ( 
 555                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 556                 ) ) {
 557            ptr[j] = ptr[j-h];
 558            j = j - h;
 559            if (j <= (lo + h - 1)) break;
 560         }
 561         ptr[j] = v;
 562         i++;
 563
 564         /*-- copy 2 --*/
 565         if (i > hi) break;
 566         v = ptr[i];
 567         j = i;
 568         while ( mainGtU ( 
 569                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 570                 ) ) {
 571            ptr[j] = ptr[j-h];
 572            j = j - h;
 573            if (j <= (lo + h - 1)) break;
 574         }
 575         ptr[j] = v;
 576         i++;
 577
 578         /*-- copy 3 --*/
 579         if (i > hi) break;
 580         v = ptr[i];
 581         j = i;
 582         while ( mainGtU ( 
 583                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
 584                 ) ) {
 585            ptr[j] = ptr[j-h];
 586            j = j - h;
 587            if (j <= (lo + h - 1)) break;
 588         }
 589         ptr[j] = v;
 590         i++;
 591
 592         if (*budget < 0) return;
 593      }
 594   }
 595}
 596
 597
 598/*---------------------------------------------*/
 599/*--
 600   The following is an implementation of
 601   an elegant 3-way quicksort for strings,
 602   described in a paper "Fast Algorithms for
 603   Sorting and Searching Strings", by Robert
 604   Sedgewick and Jon L. Bentley.
 605--*/
 606
 607#define mswap(zz1, zz2) \
 608   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 609
 610#define mvswap(zzp1, zzp2, zzn)       \
 611{                                     \
 612   Int32 yyp1 = (zzp1);               \
 613   Int32 yyp2 = (zzp2);               \
 614   Int32 yyn  = (zzn);                \
 615   while (yyn > 0) {                  \
 616      mswap(ptr[yyp1], ptr[yyp2]);    \
 617      yyp1++; yyp2++; yyn--;          \
 618   }                                  \
 619}
 620
 621static 
 622__inline__
 623UChar mmed3 ( UChar a, UChar b, UChar c )
 624{
 625   UChar t;
 626   if (a > b) { t = a; a = b; b = t; };
 627   if (b > c) { 
 628      b = c;
 629      if (a > b) b = a;
 630   }
 631   return b;
 632}
 633
 634#define mmin(a,b) ((a) < (b)) ? (a) : (b)
 635
 636#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 637                          stackHi[sp] = hz; \
 638                          stackD [sp] = dz; \
 639                          sp++; }
 640
 641#define mpop(lz,hz,dz) { sp--;             \
 642                         lz = stackLo[sp]; \
 643                         hz = stackHi[sp]; \
 644                         dz = stackD [sp]; }
 645
 646
 647#define mnextsize(az) (nextHi[az]-nextLo[az])
 648
 649#define mnextswap(az,bz)                                        \
 650   { Int32 tz;                                                  \
 651     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 652     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 653     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 654
 655
 656#define MAIN_QSORT_SMALL_THRESH 20
 657#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 658#define MAIN_QSORT_STACK_SIZE 100
 659
 660static
 661void mainQSort3 ( UInt32* ptr,
 662                  UChar*  block,
 663                  UInt16* quadrant,
 664                  Int32   nblock,
 665                  Int32   loSt, 
 666                  Int32   hiSt, 
 667                  Int32   dSt,
 668                  Int32*  budget )
 669{
 670   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 671   Int32 sp, lo, hi, d;
 672
 673   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 674   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 675   Int32 stackD [MAIN_QSORT_STACK_SIZE];
 676
 677   Int32 nextLo[3];
 678   Int32 nextHi[3];
 679   Int32 nextD [3];
 680
 681   sp = 0;
 682   mpush ( loSt, hiSt, dSt );
 683
 684   while (sp > 0) {
 685
 686      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
 687
 688      mpop ( lo, hi, d );
 689      if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
 690          d > MAIN_QSORT_DEPTH_THRESH) {
 691         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 692         if (*budget < 0) return;
 693         continue;
 694      }
 695
 696      med = (Int32) 
 697            mmed3 ( block[ptr[ lo         ]+d],
 698                    block[ptr[ hi         ]+d],
 699                    block[ptr[ (lo+hi)>>1 ]+d] );
 700
 701      unLo = ltLo = lo;
 702      unHi = gtHi = hi;
 703
 704      while (True) {
 705         while (True) {
 706            if (unLo > unHi) break;
 707            n = ((Int32)block[ptr[unLo]+d]) - med;
 708            if (n == 0) { 
 709               mswap(ptr[unLo], ptr[ltLo]); 
 710               ltLo++; unLo++; continue; 
 711            };
 712            if (n >  0) break;
 713            unLo++;
 714         }
 715         while (True) {
 716            if (unLo > unHi) break;
 717            n = ((Int32)block[ptr[unHi]+d]) - med;
 718            if (n == 0) { 
 719               mswap(ptr[unHi], ptr[gtHi]); 
 720               gtHi--; unHi--; continue; 
 721            };
 722            if (n <  0) break;
 723            unHi--;
 724         }
 725         if (unLo > unHi) break;
 726         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 727      }
 728
 729      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 730
 731      if (gtHi < ltLo) {
 732         mpush(lo, hi, d+1 );
 733         continue;
 734      }
 735
 736      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 737      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 738
 739      n = lo + unLo - ltLo - 1;
 740      m = hi - (gtHi - unHi) + 1;
 741
 742      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 743      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 744      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 745
 746      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 747      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 748      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 749
 750      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 751      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 752
 753      mpush (nextLo[0], nextHi[0], nextD[0]);
 754      mpush (nextLo[1], nextHi[1], nextD[1]);
 755      mpush (nextLo[2], nextHi[2], nextD[2]);
 756   }
 757}
 758
 759#undef mswap
 760#undef mvswap
 761#undef mpush
 762#undef mpop
 763#undef mmin
 764#undef mnextsize
 765#undef mnextswap
 766#undef MAIN_QSORT_SMALL_THRESH
 767#undef MAIN_QSORT_DEPTH_THRESH
 768#undef MAIN_QSORT_STACK_SIZE
 769
 770
 771/*---------------------------------------------*/
 772/* Pre:
 773      nblock > N_OVERSHOOT
 774      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 775      ((UChar*)block32) [0 .. nblock-1] holds block
 776      ptr exists for [0 .. nblock-1]
 777
 778   Post:
 779      ((UChar*)block32) [0 .. nblock-1] holds block
 780      All other areas of block32 destroyed
 781      ftab [0 .. 65536 ] destroyed
 782      ptr [0 .. nblock-1] holds sorted order
 783      if (*budget < 0), sorting was abandoned
 784*/
 785
 786#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 787#define SETMASK (1 << 21)
 788#define CLEARMASK (~(SETMASK))
 789
 790static
 791void mainSort ( UInt32* ptr, 
 792                UChar*  block,
 793                UInt16* quadrant, 
 794                UInt32* ftab,
 795                Int32   nblock,
 796                Int32   verb,
 797                Int32*  budget )
 798{
 799   Int32  i, j, k, ss, sb;
 800   Int32  runningOrder[256];
 801   Bool   bigDone[256];
 802   Int32  copyStart[256];
 803   Int32  copyEnd  [256];
 804   UChar  c1;
 805   Int32  numQSorted;
 806   UInt16 s;
 807   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 808
 809   /*-- set up the 2-byte frequency table --*/
 810   for (i = 65536; i >= 0; i--) ftab[i] = 0;
 811
 812   j = block[0] << 8;
 813   i = nblock-1;
 814   for (; i >= 3; i -= 4) {
 815      quadrant[i] = 0;
 816      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 817      ftab[j]++;
 818      quadrant[i-1] = 0;
 819      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 820      ftab[j]++;
 821      quadrant[i-2] = 0;
 822      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 823      ftab[j]++;
 824      quadrant[i-3] = 0;
 825      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 826      ftab[j]++;
 827   }
 828   for (; i >= 0; i--) {
 829      quadrant[i] = 0;
 830      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 831      ftab[j]++;
 832   }
 833
 834   /*-- (emphasises close relationship of block & quadrant) --*/
 835   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 836      block   [nblock+i] = block[i];
 837      quadrant[nblock+i] = 0;
 838   }
 839
 840   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 841
 842   /*-- Complete the initial radix sort --*/
 843   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 844
 845   s = block[0] << 8;
 846   i = nblock-1;
 847   for (; i >= 3; i -= 4) {
 848      s = (s >> 8) | (block[i] << 8);
 849      j = ftab[s] -1;
 850      ftab[s] = j;
 851      ptr[j] = i;
 852      s = (s >> 8) | (block[i-1] << 8);
 853      j = ftab[s] -1;
 854      ftab[s] = j;
 855      ptr[j] = i-1;
 856      s = (s >> 8) | (block[i-2] << 8);
 857      j = ftab[s] -1;
 858      ftab[s] = j;
 859      ptr[j] = i-2;
 860      s = (s >> 8) | (block[i-3] << 8);
 861      j = ftab[s] -1;
 862      ftab[s] = j;
 863      ptr[j] = i-3;
 864   }
 865   for (; i >= 0; i--) {
 866      s = (s >> 8) | (block[i] << 8);
 867      j = ftab[s] -1;
 868      ftab[s] = j;
 869      ptr[j] = i;
 870   }
 871
 872   /*--
 873      Now ftab contains the first loc of every small bucket.
 874      Calculate the running order, from smallest to largest
 875      big bucket.
 876   --*/
 877   for (i = 0; i <= 255; i++) {
 878      bigDone     [i] = False;
 879      runningOrder[i] = i;
 880   }
 881
 882   {
 883      Int32 vv;
 884      Int32 h = 1;
 885      do h = 3 * h + 1; while (h <= 256);
 886      do {
 887         h = h / 3;
 888         for (i = h; i <= 255; i++) {
 889            vv = runningOrder[i];
 890            j = i;
 891            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 892               runningOrder[j] = runningOrder[j-h];
 893               j = j - h;
 894               if (j <= (h - 1)) goto zero;
 895            }
 896            zero:
 897            runningOrder[j] = vv;
 898         }
 899      } while (h != 1);
 900   }
 901
 902   /*--
 903      The main sorting loop.
 904   --*/
 905
 906   numQSorted = 0;
 907
 908   for (i = 0; i <= 255; i++) {
 909
 910      /*--
 911         Process big buckets, starting with the least full.
 912         Basically this is a 3-step process in which we call
 913         mainQSort3 to sort the small buckets [ss, j], but
 914         also make a big effort to avoid the calls if we can.
 915      --*/
 916      ss = runningOrder[i];
 917
 918      /*--
 919         Step 1:
 920         Complete the big bucket [ss] by quicksorting
 921         any unsorted small buckets [ss, j], for j != ss.  
 922         Hopefully previous pointer-scanning phases have already
 923         completed many of the small buckets [ss, j], so
 924         we don't have to sort them at all.
 925      --*/
 926      for (j = 0; j <= 255; j++) {
 927         if (j != ss) {
 928            sb = (ss << 8) + j;
 929            if ( ! (ftab[sb] & SETMASK) ) {
 930               Int32 lo = ftab[sb]   & CLEARMASK;
 931               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 932               if (hi > lo) {
 933                  if (verb >= 4)
 934                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 935                                "done %d   this %d\n",
 936                                ss, j, numQSorted, hi - lo + 1 );
 937                  mainQSort3 ( 
 938                     ptr, block, quadrant, nblock, 
 939                     lo, hi, BZ_N_RADIX, budget 
 940                  );   
 941                  numQSorted += (hi - lo + 1);
 942                  if (*budget < 0) return;
 943               }
 944            }
 945            ftab[sb] |= SETMASK;
 946         }
 947      }
 948
 949      AssertH ( !bigDone[ss], 1006 );
 950
 951      /*--
 952         Step 2:
 953         Now scan this big bucket [ss] so as to synthesise the
 954         sorted order for small buckets [t, ss] for all t,
 955         including, magically, the bucket [ss,ss] too.
 956         This will avoid doing Real Work in subsequent Step 1's.
 957      --*/
 958      {
 959         for (j = 0; j <= 255; j++) {
 960            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 961            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 962         }
 963         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 964            k = ptr[j]-1; if (k < 0) k += nblock;
 965            c1 = block[k];
 966            if (!bigDone[c1])
 967               ptr[ copyStart[c1]++ ] = k;
 968         }
 969         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 970            k = ptr[j]-1; if (k < 0) k += nblock;
 971            c1 = block[k];
 972            if (!bigDone[c1]) 
 973               ptr[ copyEnd[c1]-- ] = k;
 974         }
 975      }
 976
 977      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 978                || 
 979                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 980                   Necessity for this case is demonstrated by compressing 
 981                   a sequence of approximately 48.5 million of character 
 982                   251; 1.0.0/1.0.1 will then die here. */
 983                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 984                1007 )
 985
 986      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 987
 988      /*--
 989         Step 3:
 990         The [ss] big bucket is now done.  Record this fact,
 991         and update the quadrant descriptors.  Remember to
 992         update quadrants in the overshoot area too, if
 993         necessary.  The "if (i < 255)" test merely skips
 994         this updating for the last bucket processed, since
 995         updating for the last bucket is pointless.
 996
 997         The quadrant array provides a way to incrementally
 998         cache sort orderings, as they appear, so as to 
 999         make subsequent comparisons in fullGtU() complete
1000         faster.  For repetitive blocks this makes a big
1001         difference (but not big enough to be able to avoid
1002         the fallback sorting mechanism, exponential radix sort).
1003
1004         The precise meaning is: at all times:
1005
1006            for 0 <= i < nblock and 0 <= j <= nblock
1007
1008            if block[i] != block[j], 
1009
1010               then the relative values of quadrant[i] and 
1011                    quadrant[j] are meaningless.
1012
1013               else {
1014                  if quadrant[i] < quadrant[j]
1015                     then the string starting at i lexicographically
1016                     precedes the string starting at j
1017
1018                  else if quadrant[i] > quadrant[j]
1019                     then the string starting at j lexicographically
1020                     precedes the string starting at i
1021
1022                  else
1023                     the relative ordering of the strings starting
1024                     at i and j has not yet been determined.
1025               }
1026      --*/
1027      bigDone[ss] = True;
1028
1029      if (i < 255) {
1030         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1031         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1032         Int32 shifts   = 0;
1033
1034         while ((bbSize >> shifts) > 65534) shifts++;
1035
1036         for (j = bbSize-1; j >= 0; j--) {
1037            Int32 a2update     = ptr[bbStart + j];
1038            UInt16 qVal        = (UInt16)(j >> shifts);
1039            quadrant[a2update] = qVal;
1040            if (a2update < BZ_N_OVERSHOOT)
1041               quadrant[a2update + nblock] = qVal;
1042         }
1043         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1044      }
1045
1046   }
1047
1048   if (verb >= 4)
1049      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1050                 nblock, numQSorted, nblock - numQSorted );
1051}
1052
1053#undef BIGFREQ
1054#undef SETMASK
1055#undef CLEARMASK
1056
1057
1058/*---------------------------------------------*/
1059/* Pre:
1060      nblock > 0
1061      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1062      ((UChar*)arr2)  [0 .. nblock-1] holds block
1063      arr1 exists for [0 .. nblock-1]
1064
1065   Post:
1066      ((UChar*)arr2) [0 .. nblock-1] holds block
1067      All other areas of block destroyed
1068      ftab [ 0 .. 65536 ] destroyed
1069      arr1 [0 .. nblock-1] holds sorted order
1070*/
1071void BZ2_blockSort ( EState* s )
1072{
1073   UInt32* ptr    = s->ptr; 
1074   UChar*  block  = s->block;
1075   UInt32* ftab   = s->ftab;
1076   Int32   nblock = s->nblock;
1077   Int32   verb   = s->verbosity;
1078   Int32   wfact  = s->workFactor;
1079   UInt16* quadrant;
1080   Int32   budget;
1081   Int32   budgetInit;
1082   Int32   i;
1083
1084   if (nblock < 10000) {
1085      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1086   } else {
1087      /* Calculate the location for quadrant, remembering to get
1088         the alignment right.  Assumes that &(block[0]) is at least
1089         2-byte aligned -- this should be ok since block is really
1090         the first section of arr2.
1091      */
1092      i = nblock+BZ_N_OVERSHOOT;
1093      if (i & 1) i++;
1094      quadrant = (UInt16*)(&(block[i]));
1095
1096      /* (wfact-1) / 3 puts the default-factor-30
1097         transition point at very roughly the same place as 
1098         with v0.1 and v0.9.0.  
1099         Not that it particularly matters any more, since the
1100         resulting compressed stream is now the same regardless
1101         of whether or not we use the main sort or fallback sort.
1102      */
1103      if (wfact < 1  ) wfact = 1;
1104      if (wfact > 100) wfact = 100;
1105      budgetInit = nblock * ((wfact-1) / 3);
1106      budget = budgetInit;
1107
1108      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1109      if (verb >= 3) 
1110         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1111                    budgetInit - budget,
1112                    nblock, 
1113                    (float)(budgetInit - budget) /
1114                    (float)(nblock==0 ? 1 : nblock) ); 
1115      if (budget < 0) {
1116         if (verb >= 2) 
1117            VPrintf0 ( "    too repetitive; using fallback"
1118                       " sorting algorithm\n" );
1119         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1120      }
1121   }
1122
1123   s->origPtr = -1;
1124   for (i = 0; i < s->nblock; i++)
1125      if (ptr[i] == 0)
1126         { s->origPtr = i; break; };
1127
1128   AssertH( s->origPtr != -1, 1003 );
1129}
1130
1131
1132/*-------------------------------------------------------------*/
1133/*--- end                                       blocksort.c ---*/
1134/*-------------------------------------------------------------*/
1135