toybox/toys/pending/compress.c
<<
>>
Prefs
   1/* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
   2 *
   3 * Copyright 2014 Rob Landley <rob@landley.net>
   4 *
   5 * The inflate/deflate code lives here, so the various things that use it
   6 * either live here or call these commands to pipe data through them.
   7 *
   8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
   9 * (gzip already replaces "uncompress".)
  10 *
  11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
  12 * LSB 4.1 has gzip, gunzip, and zcat
  13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
  14
  15// Accept many different kinds of command line argument.
  16// Leave Lrg at end so flag values line up.
  17
  18USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
  19
  20//zip unzip gzip gunzip zcat
  21
  22config COMPRESS
  23  bool "compress"
  24  default n
  25  help
  26    usage: compress [-zgLR19] [FILE]
  27
  28    Compress or decompress file (or stdin) using "deflate" algorithm.
  29
  30    -1  min compression
  31    -9  max compression (default)
  32    -g  gzip (default)
  33    -L  zlib
  34    -R  raw
  35    -z  zip
  36
  37config DECOMPRESS
  38  bool "decompress"
  39  default n
  40  help
  41    usage: compress [-zglrcd9] [FILE]
  42
  43    Compress or decompress file (or stdin) using "deflate" algorithm.
  44
  45    -c  compress with -g gzip (default)  -l zlib  -r raw  -z zip
  46    -d  decompress (autodetects type)
  47*/
  48
  49#define FOR_compress
  50#include "toys.h"
  51
  52GLOBALS(
  53  // Huffman codes: base offset and extra bits tables (length and distance)
  54  char lenbits[29], distbits[30];
  55  unsigned short lenbase[29], distbase[30];
  56  void *fixdisthuff, *fixlithuff;
  57
  58  // CRC
  59  void (*crcfunc)(char *data, int len);
  60  unsigned crc;
  61
  62  // Compressed data buffer
  63  char *data;
  64  unsigned pos, len;
  65  int infd, outfd;
  66
  67  // Tables only used for deflation
  68  unsigned short *hashhead, *hashchain;
  69)
  70
  71// little endian bit buffer
  72struct bitbuf {
  73  int fd, bitpos, len, max;
  74  char buf[];
  75};
  76
  77// malloc a struct bitbuf
  78struct bitbuf *bitbuf_init(int fd, int size)
  79{
  80  struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
  81
  82  bb->max = size;
  83  bb->fd = fd;
  84
  85  return bb;
  86}
  87
  88// Advance bitpos without the overhead of recording bits
  89void bitbuf_skip(struct bitbuf *bb, int bits)
  90{
  91  int pos = bb->bitpos + bits, len = bb->len << 3;
  92
  93  while (pos >= len) {
  94    pos -= len;
  95    len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
  96    if (bb->len < 1) perror_exit("inflate EOF");
  97  }
  98  bb->bitpos = pos;
  99}
 100
 101// Optimized single bit inlined version
 102static inline int bitbuf_bit(struct bitbuf *bb)
 103{
 104  int bufpos = bb->bitpos>>3;
 105
 106  if (bufpos == bb->len) {
 107    bitbuf_skip(bb, 0);
 108    bufpos = 0;
 109  }
 110
 111  return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
 112}
 113
 114// Fetch the next X bits from the bitbuf, little endian
 115unsigned bitbuf_get(struct bitbuf *bb, int bits)
 116{
 117  int result = 0, offset = 0;
 118
 119  while (bits) {
 120    int click = bb->bitpos >> 3, blow, blen;
 121
 122    // Load more data if buffer empty
 123    if (click == bb->len) bitbuf_skip(bb, click = 0);
 124
 125    // grab bits from next byte
 126    blow = bb->bitpos & 7;
 127    blen = 8-blow;
 128    if (blen > bits) blen = bits;
 129    result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
 130    offset += blen;
 131    bits -= blen;
 132    bb->bitpos += blen;
 133  }
 134
 135  return result;
 136}
 137
 138void bitbuf_flush(struct bitbuf *bb)
 139{
 140  if (!bb->bitpos) return;
 141
 142  xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
 143  memset(bb->buf, 0, bb->max);
 144  bb->bitpos = 0;
 145}
 146
 147void bitbuf_put(struct bitbuf *bb, int data, int len)
 148{
 149  while (len) {
 150    int click = bb->bitpos >> 3, blow, blen;
 151
 152    // Flush buffer if necessary
 153    if (click == bb->max) {
 154      bitbuf_flush(bb);
 155      click = 0;
 156    }
 157    blow = bb->bitpos & 7;
 158    blen = 8-blow;
 159    if (blen > len) blen = len;
 160    bb->buf[click] |= data << blow;
 161    bb->bitpos += blen;
 162    data >>= blen;
 163    len -= blen;
 164  }
 165}
 166
 167static void output_byte(char sym)
 168{
 169  int pos = TT.pos++ & 32767;
 170
 171  TT.data[pos] = sym;
 172
 173  if (pos == 32767) {
 174    xwrite(TT.outfd, TT.data, 32768);
 175    if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
 176  }
 177}
 178
 179// Huffman coding uses bits to traverse a binary tree to a leaf node,
 180// By placing frequently occurring symbols at shorter paths, frequently
 181// used symbols may be represented in fewer bits than uncommon symbols.
 182
 183struct huff {
 184  unsigned short length[16];
 185  unsigned short symbol[288];
 186};
 187
 188// Create simple huffman tree from array of bit lengths.
 189
 190// The symbols in the huffman trees are sorted (first by bit length
 191// of the code to reach them, then by symbol number). This means that given
 192// the bit length of each symbol, we can construct a unique tree.
 193static void len2huff(struct huff *huff, char bitlen[], int len)
 194{
 195  int offset[16];
 196  int i;
 197
 198  // Count number of codes at each bit length
 199  memset(huff, 0, sizeof(struct huff));
 200  for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
 201
 202  // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
 203  *huff->length = *offset = 0;
 204  for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
 205
 206  for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
 207}
 208
 209// Fetch and decode next huffman coded symbol from bitbuf.
 210// This takes advantage of the sorting to navigate the tree as an array:
 211// each time we fetch a bit we have all the codes at that bit level in
 212// order with no gaps.
 213static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
 214{
 215  unsigned short *length = huff->length;
 216  int start = 0, offset = 0;
 217
 218  // Traverse through the bit lengths until our code is in this range
 219  for (;;) {
 220    offset = (offset << 1) | bitbuf_bit(bb);
 221    start += *++length;
 222    if ((offset -= *length) < 0) break;
 223    if ((length - huff->length) & 16) error_exit("bad symbol");
 224  }
 225
 226  return huff->symbol[start + offset];
 227}
 228
 229// Decompress deflated data from bitbuf to TT.outfd.
 230static void inflate(struct bitbuf *bb)
 231{
 232  TT.crc = ~0;
 233  // repeat until spanked
 234  for (;;) {
 235    int final, type;
 236
 237    final = bitbuf_get(bb, 1);
 238    type = bitbuf_get(bb, 2);
 239
 240    if (type == 3) error_exit("bad type");
 241
 242    // Uncompressed block?
 243    if (!type) {
 244      int len, nlen;
 245
 246      // Align to byte, read length
 247      bitbuf_skip(bb, (8-bb->bitpos)&7);
 248      len = bitbuf_get(bb, 16);
 249      nlen = bitbuf_get(bb, 16);
 250      if (len != (0xffff & ~nlen)) error_exit("bad len");
 251
 252      // Dump literal output data
 253      while (len) {
 254        int pos = bb->bitpos >> 3, bblen = bb->len - pos;
 255        char *p = bb->buf+pos;
 256
 257        // dump bytes until done or end of current bitbuf contents
 258        if (bblen > len) bblen = len;
 259        pos = bblen;
 260        while (pos--) output_byte(*(p++));
 261        bitbuf_skip(bb, bblen << 3);
 262        len -= bblen;
 263      }
 264
 265    // Compressed block
 266    } else {
 267      struct huff *disthuff, *lithuff;
 268
 269      // Dynamic huffman codes?
 270      if (type == 2) {
 271        struct huff *h2 = ((struct huff *)toybuf)+1;
 272        int i, litlen, distlen, hufflen;
 273        char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
 274                              "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
 275
 276        // The huffman trees are stored as a series of bit lengths
 277        litlen = bitbuf_get(bb, 5)+257;  // max 288
 278        distlen = bitbuf_get(bb, 5)+1;   // max 32
 279        hufflen = bitbuf_get(bb, 4)+4;   // max 19
 280
 281        // The literal and distance codes are themselves compressed, in
 282        // a complicated way: an array of bit lengths (hufflen many
 283        // entries, each 3 bits) is used to fill out an array of 19 entries
 284        // in a magic order, leaving the rest 0. Then make a tree out of it:
 285        memset(bits = toybuf+1, 0, 19);
 286        for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
 287        len2huff(h2, bits, 19);
 288
 289        // Use that tree to read in the literal and distance bit lengths
 290        for (i = 0; i < litlen + distlen;) {
 291          int sym = huff_and_puff(bb, h2);
 292
 293          // 0-15 are literals, 16 = repeat previous code 3-6 times,
 294          // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
 295          if (sym < 16) bits[i++] = sym;
 296          else {
 297            int len = sym & 2;
 298
 299            len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
 300            memset(bits+i, bits[i-1] * !(sym&3), len);
 301            i += len;
 302          }
 303        }
 304        if (i > litlen+distlen) error_exit("bad tree");
 305
 306        len2huff(lithuff = h2, bits, litlen);
 307        len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
 308
 309      // Static huffman codes
 310      } else {
 311        lithuff = TT.fixlithuff;
 312        disthuff = TT.fixdisthuff;
 313      }
 314
 315      // Use huffman tables to decode block of compressed symbols
 316      for (;;) {
 317        int sym = huff_and_puff(bb, lithuff);
 318
 319        // Literal?
 320        if (sym < 256) output_byte(sym);
 321
 322        // Copy range?
 323        else if (sym > 256) {
 324          int len, dist;
 325
 326          sym -= 257;
 327          len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
 328          sym = huff_and_puff(bb, disthuff);
 329          dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
 330          sym = TT.pos & 32767;
 331
 332          while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
 333
 334        // End of block
 335        } else break;
 336      }
 337    }
 338
 339    // Was that the last block?
 340    if (final) break;
 341  }
 342
 343  if (TT.pos & 32767) {
 344    xwrite(TT.outfd, TT.data, TT.pos & 32767);
 345    if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
 346  }
 347}
 348
 349// Deflate from TT.infd to bitbuf
 350// For deflate, TT.len = input read, TT.pos = input consumed
 351static void deflate(struct bitbuf *bb)
 352{
 353  char *data = TT.data;
 354  int len, final = 0;
 355
 356  TT.crc = ~0;
 357
 358  while (!final) {
 359    // Read next half-window of data if we haven't hit EOF yet.
 360    len = readall(TT.infd, data+(TT.len&32768), 32768);
 361    if (len < 0) perror_exit("read"); // todo: add filename
 362    if (len != 32768) final++;
 363    if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
 364    // TT.len += len;  crcfunc advances len
 365
 366    // store block as literal
 367    bitbuf_put(bb, final, 1);
 368    bitbuf_put(bb, 0, 1);
 369
 370    bitbuf_put(bb, 0, (8-bb->bitpos)&7);
 371    bitbuf_put(bb, len, 16);
 372    bitbuf_put(bb, 0xffff & ~len, 16);
 373
 374    // repeat until spanked
 375    while (TT.pos != TT.len) {
 376      unsigned pos = TT.pos & 65535;
 377
 378      bitbuf_put(bb, data[pos], 8);
 379
 380      // need to refill buffer?
 381      if (!(32767 & ++TT.pos) && !final) break;
 382    }
 383  }
 384  bitbuf_flush(bb);
 385}
 386
 387// Allocate memory for deflate/inflate.
 388static void init_deflate(int compress)
 389{
 390  int i, n = 1;
 391
 392  // compress needs 64k data and 32k each for hashhead and hashchain.
 393  // decompress just needs 32k data.
 394  TT.data = xmalloc(32768*(compress ? 4 : 1));
 395  if (compress) {
 396    TT.hashhead = (unsigned short *)(TT.data + 65536);
 397    TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
 398  }
 399
 400  // Calculate lenbits, lenbase, distbits, distbase
 401  *TT.lenbase = 3;
 402  for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
 403    if (i>4) {
 404      if (!(i&3)) {
 405        TT.lenbits[i]++;
 406        n <<= 1;
 407      }
 408      if (i == 27) n--;
 409      else TT.lenbits[i+1] = TT.lenbits[i];
 410    }
 411    TT.lenbase[i+1] = n + TT.lenbase[i];
 412  }
 413  n = 0;
 414  for (i = 0; i<sizeof(TT.distbits); i++) {
 415    TT.distbase[i] = 1<<n;
 416    if (i) TT.distbase[i] += TT.distbase[i-1];
 417    if (i>3 && !(i&1)) n++;
 418    TT.distbits[i] = n;
 419  }
 420
 421  // Init fixed huffman tables
 422  for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
 423  len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
 424  memset(toybuf, 5, 30);
 425  len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
 426}
 427
 428// Return true/false whether we consumed a gzip header.
 429static int is_gzip(struct bitbuf *bb)
 430{
 431  int flags;
 432
 433  // Confirm signature
 434  if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
 435    return 0;
 436  bitbuf_skip(bb, 6*8);
 437
 438  // Skip extra, name, comment, header CRC fields
 439  if (flags & 4) bitbuf_skip(bb, 16);
 440  if (flags & 8) while (bitbuf_get(bb, 8));
 441  if (flags & 16) while (bitbuf_get(bb, 8));
 442  if (flags & 2) bitbuf_skip(bb, 16);
 443
 444  return 1;
 445}
 446
 447void gzip_crc(char *data, int len)
 448{
 449  int i;
 450  unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
 451
 452  crc = TT.crc;
 453  for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
 454  TT.crc = crc;
 455  TT.len += len;
 456}
 457
 458static void do_gzip(int fd, char *name)
 459{
 460  struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
 461
 462  // Header from RFC 1952 section 2.2:
 463  // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
 464  // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
 465  // Operating System (FF=unknown)
 466 
 467  TT.infd = fd;
 468  xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
 469
 470  // Use last 1k of toybuf for little endian crc table
 471  crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
 472  TT.crcfunc = gzip_crc;
 473
 474  deflate(bb);
 475
 476  // tail: crc32, len32
 477
 478  bitbuf_put(bb, 0, (8-bb->bitpos)&7);
 479  bitbuf_put(bb, ~TT.crc, 32);
 480  bitbuf_put(bb, TT.len, 32);
 481
 482  bitbuf_flush(bb);
 483  free(bb);
 484}
 485
 486static void do_zcat(int fd, char *name)
 487{
 488  struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
 489
 490  if (!is_gzip(bb)) error_exit("not gzip");
 491  TT.outfd = 1;
 492
 493  // Use last 1k of toybuf for little endian crc table
 494  crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
 495  TT.crcfunc = gzip_crc;
 496
 497  inflate(bb);
 498
 499  // tail: crc32, len32
 500
 501  bitbuf_skip(bb, (8-bb->bitpos)&7);
 502  if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
 503    error_exit("bad crc");
 504  free(bb);
 505}
 506
 507// Parse many different kinds of command line argument:
 508
 509void compress_main(void)
 510{
 511  // todo: this
 512  printf("hello world");
 513}
 514