toybox/lib/deflate.c
<<
>>
Prefs
   1/* deflate.c - deflate/inflate code for gzip and friends
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
   6 * LSB 4.1 has gzip, gunzip, and zcat
   7 *
   8 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
   9 */
  10
  11#include "toys.h"
  12
  13struct deflate {
  14  // Huffman codes: base offset and extra bits tables (length and distance)
  15  char lenbits[29], distbits[30];
  16  unsigned short lenbase[29], distbase[30];
  17  void *fixdisthuff, *fixlithuff;
  18
  19  // CRC
  20  void (*crcfunc)(struct deflate *dd, char *data, int len);
  21  unsigned crctable[256], crc;
  22
  23
  24  // Tables only used for deflation
  25  unsigned short *hashhead, *hashchain;
  26
  27  // Compressed data buffer (extra space malloced at end)
  28  unsigned pos, len;
  29  int infd, outfd;
  30  char data[];
  31};
  32
  33// little endian bit buffer
  34struct bitbuf {
  35  int fd, bitpos, len, max;
  36  char buf[];
  37};
  38
  39// malloc a struct bitbuf
  40static struct bitbuf *bitbuf_init(int fd, int size)
  41{
  42  struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
  43
  44  bb->max = size;
  45  bb->fd = fd;
  46
  47  return bb;
  48}
  49
  50// Advance bitpos without the overhead of recording bits
  51// Loads more data when input buffer empty
  52static void bitbuf_skip(struct bitbuf *bb, int bits)
  53{
  54  int pos = bb->bitpos + bits, len = bb->len << 3;
  55
  56  while (pos >= len) {
  57    pos -= len;
  58    len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
  59    if (bb->len < 1) perror_exit("inflate EOF");
  60  }
  61  bb->bitpos = pos;
  62}
  63
  64// Optimized single bit inlined version
  65static inline int bitbuf_bit(struct bitbuf *bb)
  66{
  67  int bufpos = bb->bitpos>>3;
  68
  69  if (bufpos == bb->len) {
  70    bitbuf_skip(bb, 0);
  71    bufpos = 0;
  72  }
  73
  74  return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
  75}
  76
  77// Fetch the next X bits from the bitbuf, little endian
  78static unsigned bitbuf_get(struct bitbuf *bb, int bits)
  79{
  80  int result = 0, offset = 0;
  81
  82  while (bits) {
  83    int click = bb->bitpos >> 3, blow, blen;
  84
  85    // Load more data if buffer empty
  86    if (click == bb->len) bitbuf_skip(bb, click = 0);
  87
  88    // grab bits from next byte
  89    blow = bb->bitpos & 7;
  90    blen = 8-blow;
  91    if (blen > bits) blen = bits;
  92    result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
  93    offset += blen;
  94    bits -= blen;
  95    bb->bitpos += blen;
  96  }
  97
  98  return result;
  99}
 100
 101static void bitbuf_flush(struct bitbuf *bb)
 102{
 103  if (!bb->bitpos) return;
 104
 105  xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
 106  memset(bb->buf, 0, bb->max);
 107  bb->bitpos = 0;
 108}
 109
 110static void bitbuf_put(struct bitbuf *bb, int data, int len)
 111{
 112  while (len) {
 113    int click = bb->bitpos >> 3, blow, blen;
 114
 115    // Flush buffer if necessary
 116    if (click == bb->max) {
 117      bitbuf_flush(bb);
 118      click = 0;
 119    }
 120    blow = bb->bitpos & 7;
 121    blen = 8-blow;
 122    if (blen > len) blen = len;
 123    bb->buf[click] |= data << blow;
 124    bb->bitpos += blen;
 125    data >>= blen;
 126    len -= blen;
 127  }
 128}
 129
 130static void output_byte(struct deflate *dd, char sym)
 131{
 132  int pos = dd->pos++ & 32767;
 133
 134  dd->data[pos] = sym;
 135
 136  if (pos == 32767) {
 137    xwrite(dd->outfd, dd->data, 32768);
 138    if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
 139  }
 140}
 141
 142// Huffman coding uses bits to traverse a binary tree to a leaf node,
 143// By placing frequently occurring symbols at shorter paths, frequently
 144// used symbols may be represented in fewer bits than uncommon symbols.
 145// (length[0] isn't used but code's clearer if it's there.)
 146
 147struct huff {
 148  unsigned short length[16];  // How many symbols have this bit length?
 149  unsigned short symbol[288]; // sorted by bit length, then ascending order
 150};
 151
 152// Create simple huffman tree from array of bit lengths.
 153
 154// The symbols in the huffman trees are sorted (first by bit length
 155// of the code to reach them, then by symbol number). This means that given
 156// the bit length of each symbol, we can construct a unique tree.
 157static void len2huff(struct huff *huff, char bitlen[], int len)
 158{
 159  int offset[16];
 160  int i;
 161
 162  // Count number of codes at each bit length
 163  memset(huff, 0, sizeof(struct huff));
 164  for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
 165
 166  // Sort symbols by bit length, then symbol. Get list of starting positions
 167  // for each group, then write each symbol to next position within its group.
 168  *huff->length = *offset = 0;
 169  for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
 170  for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
 171}
 172
 173// Fetch and decode next huffman coded symbol from bitbuf.
 174// This takes advantage of the sorting to navigate the tree as an array:
 175// each time we fetch a bit we have all the codes at that bit level in
 176// order with no gaps.
 177static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
 178{
 179  unsigned short *length = huff->length;
 180  int start = 0, offset = 0;
 181
 182  // Traverse through the bit lengths until our code is in this range
 183  for (;;) {
 184    offset = (offset << 1) | bitbuf_bit(bb);
 185    start += *++length;
 186    if ((offset -= *length) < 0) break;
 187    if ((length - huff->length) & 16) error_exit("bad symbol");
 188  }
 189
 190  return huff->symbol[start + offset];
 191}
 192
 193// Decompress deflated data from bitbuf to dd->outfd.
 194static void inflate(struct deflate *dd, struct bitbuf *bb)
 195{
 196  dd->crc = ~0;
 197  // repeat until spanked
 198  for (;;) {
 199    int final, type;
 200
 201    final = bitbuf_get(bb, 1);
 202    type = bitbuf_get(bb, 2);
 203
 204    if (type == 3) error_exit("bad type");
 205
 206    // Uncompressed block?
 207    if (!type) {
 208      int len, nlen;
 209
 210      // Align to byte, read length
 211      bitbuf_skip(bb, (8-bb->bitpos)&7);
 212      len = bitbuf_get(bb, 16);
 213      nlen = bitbuf_get(bb, 16);
 214      if (len != (0xffff & ~nlen)) error_exit("bad len");
 215
 216      // Dump literal output data
 217      while (len) {
 218        int pos = bb->bitpos >> 3, bblen = bb->len - pos;
 219        char *p = bb->buf+pos;
 220
 221        // dump bytes until done or end of current bitbuf contents
 222        if (bblen > len) bblen = len;
 223        pos = bblen;
 224        while (pos--) output_byte(dd, *(p++));
 225        bitbuf_skip(bb, bblen << 3);
 226        len -= bblen;
 227      }
 228
 229    // Compressed block
 230    } else {
 231      struct huff *disthuff, *lithuff;
 232
 233      // Dynamic huffman codes?
 234      if (type == 2) {
 235        struct huff *h2 = ((struct huff *)libbuf)+1;
 236        int i, litlen, distlen, hufflen;
 237        char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
 238                              "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
 239
 240        // The huffman trees are stored as a series of bit lengths
 241        litlen = bitbuf_get(bb, 5)+257;  // max 288
 242        distlen = bitbuf_get(bb, 5)+1;   // max 32
 243        hufflen = bitbuf_get(bb, 4)+4;   // max 19
 244
 245        // The literal and distance codes are themselves compressed, in
 246        // a complicated way: an array of bit lengths (hufflen many
 247        // entries, each 3 bits) is used to fill out an array of 19 entries
 248        // in a magic order, leaving the rest 0. Then make a tree out of it:
 249        memset(bits = libbuf+1, 0, 19);
 250        for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
 251        len2huff(h2, bits, 19);
 252
 253        // Use that tree to read in the literal and distance bit lengths
 254        for (i = 0; i < litlen + distlen;) {
 255          int sym = huff_and_puff(bb, h2);
 256
 257          // 0-15 are literals, 16 = repeat previous code 3-6 times,
 258          // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
 259          if (sym < 16) bits[i++] = sym;
 260          else {
 261            int len = sym & 2;
 262
 263            len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
 264            memset(bits+i, bits[i-1] * !(sym&3), len);
 265            i += len;
 266          }
 267        }
 268        if (i > litlen+distlen) error_exit("bad tree");
 269
 270        len2huff(lithuff = h2, bits, litlen);
 271        len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
 272
 273      // Static huffman codes
 274      } else {
 275        lithuff = dd->fixlithuff;
 276        disthuff = dd->fixdisthuff;
 277      }
 278
 279      // Use huffman tables to decode block of compressed symbols
 280      for (;;) {
 281        int sym = huff_and_puff(bb, lithuff);
 282
 283        // Literal?
 284        if (sym < 256) output_byte(dd, sym);
 285
 286        // Copy range?
 287        else if (sym > 256) {
 288          int len, dist;
 289
 290          sym -= 257;
 291          len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
 292          sym = huff_and_puff(bb, disthuff);
 293          dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
 294          sym = dd->pos & 32767;
 295
 296          while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
 297
 298        // End of block
 299        } else break;
 300      }
 301    }
 302
 303    // Was that the last block?
 304    if (final) break;
 305  }
 306
 307  if (dd->pos & 32767) {
 308    xwrite(dd->outfd, dd->data, dd->pos&32767);
 309    if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
 310  }
 311}
 312
 313// Deflate from dd->infd to bitbuf
 314// For deflate, dd->len = input read, dd->pos = input consumed
 315static void deflate(struct deflate *dd, struct bitbuf *bb)
 316{
 317  char *data = dd->data;
 318  int len, final = 0;
 319
 320  dd->crc = ~0;
 321
 322  while (!final) {
 323    // Read next half-window of data if we haven't hit EOF yet.
 324    len = readall(dd->infd, data+(dd->len&32768), 32768);
 325    if (len < 0) perror_exit("read"); // todo: add filename
 326    if (len != 32768) final++;
 327    if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
 328    // dd->len += len;  crcfunc advances len TODO
 329
 330    // store block as literal
 331    bitbuf_put(bb, final, 1);
 332    bitbuf_put(bb, 0, 1);
 333
 334    bitbuf_put(bb, 0, (8-bb->bitpos)&7);
 335    bitbuf_put(bb, len, 16);
 336    bitbuf_put(bb, 0xffff & ~len, 16);
 337
 338    // repeat until spanked
 339    while (dd->pos != dd->len) {
 340      unsigned pos = dd->pos&65535;
 341
 342      bitbuf_put(bb, data[pos], 8);
 343
 344      // need to refill buffer?
 345      if (!(32767 & ++dd->pos) && !final) break;
 346    }
 347  }
 348  bitbuf_flush(bb);
 349}
 350
 351// Allocate memory for deflate/inflate.
 352static struct deflate *init_deflate(int compress)
 353{
 354  int i, n = 1;
 355  struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
 356
 357  memset(dd, 0, sizeof(struct deflate));
 358  // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
 359  if (compress) {
 360    dd->hashhead = (unsigned short *)(dd->data+65536);
 361    dd->hashchain = (unsigned short *)(dd->data+65536+32768);
 362  }
 363
 364  // Calculate lenbits, lenbase, distbits, distbase
 365  *dd->lenbase = 3;
 366  for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
 367    if (i>4) {
 368      if (!(i&3)) {
 369        dd->lenbits[i]++;
 370        n <<= 1;
 371      }
 372      if (i == 27) n--;
 373      else dd->lenbits[i+1] = dd->lenbits[i];
 374    }
 375    dd->lenbase[i+1] = n + dd->lenbase[i];
 376  }
 377  n = 0;
 378  for (i = 0; i<sizeof(dd->distbits); i++) {
 379    dd->distbase[i] = 1<<n;
 380    if (i) dd->distbase[i] += dd->distbase[i-1];
 381    if (i>3 && !(i&1)) n++;
 382    dd->distbits[i] = n;
 383  }
 384
 385// TODO layout and lifetime of this?
 386  // Init fixed huffman tables
 387  for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
 388  len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
 389  memset(libbuf, 5, 30);
 390  len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
 391
 392  return dd;
 393}
 394
 395// Return true/false whether we consumed a gzip header.
 396static int is_gzip(struct bitbuf *bb)
 397{
 398  int flags;
 399
 400  // Confirm signature
 401  if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
 402    return 0;
 403  bitbuf_skip(bb, 6*8);
 404
 405  // Skip extra, name, comment, header CRC fields
 406  if (flags & 4) bitbuf_skip(bb, bitbuf_get(bb, 16) * 8);
 407  if (flags & 8) while (bitbuf_get(bb, 8));
 408  if (flags & 16) while (bitbuf_get(bb, 8));
 409  if (flags & 2) bitbuf_skip(bb, 16);
 410
 411  return 1;
 412}
 413
 414static void gzip_crc(struct deflate *dd, char *data, int len)
 415{
 416  int i;
 417  unsigned crc, *crc_table = dd->crctable;
 418
 419  crc = dd->crc;
 420  for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
 421  dd->crc = crc;
 422  dd->len += len;
 423}
 424
 425long long gzip_fd(int infd, int outfd)
 426{
 427  struct bitbuf *bb = bitbuf_init(outfd, 4096);
 428  struct deflate *dd = init_deflate(1);
 429  long long rc;
 430
 431  // Header from RFC 1952 section 2.2:
 432  // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
 433  // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
 434  // Operating System (FF=unknown)
 435 
 436  dd->infd = infd;
 437  xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
 438
 439  // Little endian crc table
 440  crc_init(dd->crctable, 1);
 441  dd->crcfunc = gzip_crc;
 442
 443  deflate(dd, bb);
 444
 445  // tail: crc32, len32
 446
 447  bitbuf_put(bb, 0, (8-bb->bitpos)&7);
 448  bitbuf_put(bb, ~dd->crc, 32);
 449  bitbuf_put(bb, dd->len, 32);
 450  rc = dd->len;
 451
 452  bitbuf_flush(bb);
 453  free(bb);
 454  free(dd);
 455
 456  return rc;
 457}
 458
 459long long gunzip_fd(int infd, int outfd)
 460{
 461  struct bitbuf *bb = bitbuf_init(infd, 4096);
 462  struct deflate *dd = init_deflate(0);
 463  long long rc;
 464
 465  if (!is_gzip(bb)) error_exit("not gzip");
 466  dd->outfd = outfd;
 467
 468  // Little endian crc table
 469  crc_init(dd->crctable, 1);
 470  dd->crcfunc = gzip_crc;
 471
 472  inflate(dd, bb);
 473
 474  // tail: crc32, len32
 475
 476  bitbuf_skip(bb, (8-bb->bitpos)&7);
 477  if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
 478    error_exit("bad crc");
 479
 480  rc = dd->len;
 481  free(bb);
 482  free(dd);
 483
 484  return rc;
 485}
 486