linux/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2
   3/* P9 gunzip sample code for demonstrating the P9 NX hardware
   4 * interface.  Not intended for productive uses or for performance or
   5 * compression ratio measurements.  Note also that /dev/crypto/gzip,
   6 * VAS and skiboot support are required
   7 *
   8 * Copyright 2020 IBM Corp.
   9 *
  10 * Author: Bulent Abali <abali@us.ibm.com>
  11 *
  12 * https://github.com/libnxz/power-gzip for zlib api and other utils
  13 * Definitions of acronyms used here.  See
  14 * P9 NX Gzip Accelerator User's Manual for details:
  15 * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
  16 *
  17 * adler/crc: 32 bit checksums appended to stream tail
  18 * ce:       completion extension
  19 * cpb:      coprocessor parameter block (metadata)
  20 * crb:      coprocessor request block (command)
  21 * csb:      coprocessor status block (status)
  22 * dht:      dynamic huffman table
  23 * dde:      data descriptor element (address, length)
  24 * ddl:      list of ddes
  25 * dh/fh:    dynamic and fixed huffman types
  26 * fc:       coprocessor function code
  27 * histlen:  history/dictionary length
  28 * history:  sliding window of up to 32KB of data
  29 * lzcount:  Deflate LZ symbol counts
  30 * rembytecnt: remaining byte count
  31 * sfbt:     source final block type; last block's type during decomp
  32 * spbc:     source processed byte count
  33 * subc:     source unprocessed bit count
  34 * tebc:     target ending bit count; valid bits in the last byte
  35 * tpbc:     target processed byte count
  36 * vas:      virtual accelerator switch; the user mode interface
  37 */
  38
  39#define _ISOC11_SOURCE  // For aligned_alloc()
  40#define _DEFAULT_SOURCE // For endian.h
  41
  42#include <stdio.h>
  43#include <stdlib.h>
  44#include <string.h>
  45#include <unistd.h>
  46#include <stdint.h>
  47#include <sys/types.h>
  48#include <sys/stat.h>
  49#include <sys/time.h>
  50#include <sys/fcntl.h>
  51#include <sys/mman.h>
  52#include <endian.h>
  53#include <bits/endian.h>
  54#include <sys/ioctl.h>
  55#include <assert.h>
  56#include <errno.h>
  57#include <signal.h>
  58#include "nxu.h"
  59#include "nx.h"
  60#include "crb.h"
  61
  62int nx_dbg;
  63FILE *nx_gzip_log;
  64
  65#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
  66#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
  67
  68#define GETINPC(X) fgetc(X)
  69#define FNAME_MAX 1024
  70
  71/* fifo queue management */
  72#define fifo_used_bytes(used) (used)
  73#define fifo_free_bytes(used, len) ((len)-(used))
  74/* amount of free bytes in the first and last parts */
  75#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
  76                                                  ? (len)-((cur)+(used)) : 0)
  77#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
  78                                                  ? (cur) : (len)-(used))
  79/* amount of used bytes in the first and last parts */
  80#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
  81                                                  ? (used) : (len)-(cur))
  82#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
  83                                                  ? 0 : ((used)+(cur))-(len))
  84/* first and last free parts start here */
  85#define fifo_free_first_offset(cur, used)      ((cur)+(used))
  86#define fifo_free_last_offset(cur, used, len)  \
  87                                           fifo_used_last_bytes(cur, used, len)
  88/* first and last used parts start here */
  89#define fifo_used_first_offset(cur)            (cur)
  90#define fifo_used_last_offset(cur)             (0)
  91
  92const int fifo_in_len = 1<<24;
  93const int fifo_out_len = 1<<24;
  94const int page_sz = 1<<16;
  95const int line_sz = 1<<7;
  96const int window_max = 1<<15;
  97
  98/*
  99 * Adds an (address, len) pair to the list of ddes (ddl) and updates
 100 * the base dde.  ddl[0] is the only dde in a direct dde which
 101 * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
 102 * the indirect (base) dde that points to a list of direct ddes.
 103 * See Section 6.4 of the NX-gzip user manual for DDE description.
 104 * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
 105 * bytes in ddl.  Caller is responsible for allocting the array of
 106 * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
 107 * list, the ddl array must have N+1 entries minimum.
 108 */
 109static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
 110                                        uint32_t len)
 111{
 112        uint32_t ddecnt;
 113        uint32_t bytes;
 114
 115        if (addr == NULL && len == 0) {
 116                clearp_dde(ddl);
 117                return 0;
 118        }
 119
 120        NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr,
 121                        __func__, len));
 122
 123        /* Number of ddes in the dde list ; == 0 when it is a direct dde */
 124        ddecnt = getpnn(ddl, dde_count);
 125        bytes = getp32(ddl, ddebc);
 126
 127        if (ddecnt == 0 && bytes == 0) {
 128                /* First dde is unused; make it a direct dde */
 129                bytes = len;
 130                putp32(ddl, ddebc, bytes);
 131                putp64(ddl, ddead, (uint64_t) addr);
 132        } else if (ddecnt == 0) {
 133                /* Converting direct to indirect dde
 134                 * ddl[0] becomes head dde of ddl
 135                 * copy direct to indirect first.
 136                 */
 137                ddl[1] = ddl[0];
 138
 139                /* Add the new dde next */
 140                clear_dde(ddl[2]);
 141                put32(ddl[2], ddebc, len);
 142                put64(ddl[2], ddead, (uint64_t) addr);
 143
 144                /* Ddl head points to 2 direct ddes */
 145                ddecnt = 2;
 146                putpnn(ddl, dde_count, ddecnt);
 147                bytes = bytes + len;
 148                putp32(ddl, ddebc, bytes);
 149                /* Pointer to the first direct dde */
 150                putp64(ddl, ddead, (uint64_t) &ddl[1]);
 151        } else {
 152                /* Append a dde to an existing indirect ddl */
 153                ++ddecnt;
 154                clear_dde(ddl[ddecnt]);
 155                put64(ddl[ddecnt], ddead, (uint64_t) addr);
 156                put32(ddl[ddecnt], ddebc, len);
 157
 158                putpnn(ddl, dde_count, ddecnt);
 159                bytes = bytes + len;
 160                putp32(ddl, ddebc, bytes); /* byte sum of all dde */
 161        }
 162        return bytes;
 163}
 164
 165/*
 166 * Touch specified number of pages represented in number bytes
 167 * beginning from the first buffer in a dde list.
 168 * Do not touch the pages past buf_sz-th byte's page.
 169 *
 170 * Set buf_sz = 0 to touch all pages described by the ddep.
 171 */
 172static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
 173                                int wr)
 174{
 175        uint32_t indirect_count;
 176        uint32_t buf_len;
 177        long total;
 178        uint64_t buf_addr;
 179        struct nx_dde_t *dde_list;
 180        int i;
 181
 182        assert(!!ddep);
 183
 184        indirect_count = getpnn(ddep, dde_count);
 185
 186        NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
 187                        indirect_count));
 188        NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
 189
 190        if (indirect_count == 0) {
 191                /* Direct dde */
 192                buf_len = getp32(ddep, ddebc);
 193                buf_addr = getp64(ddep, ddead);
 194
 195                NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
 196                                buf_len, (void *)buf_addr));
 197
 198                if (buf_sz == 0)
 199                        nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
 200                else
 201                        nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len,
 202                                        buf_sz), page_sz, wr);
 203
 204                return ERR_NX_OK;
 205        }
 206
 207        /* Indirect dde */
 208        if (indirect_count > MAX_DDE_COUNT)
 209                return ERR_NX_EXCESSIVE_DDE;
 210
 211        /* First address of the list */
 212        dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
 213
 214        if (buf_sz == 0)
 215                buf_sz = getp32(ddep, ddebc);
 216
 217        total = 0;
 218        for (i = 0; i < indirect_count; i++) {
 219                buf_len = get32(dde_list[i], ddebc);
 220                buf_addr = get64(dde_list[i], ddead);
 221                total += buf_len;
 222
 223                NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
 224                                buf_len, (void *)buf_addr));
 225                NXPRT(fprintf(stderr, "0x%lx\n", total));
 226
 227                /* Touching fewer pages than encoded in the ddebc */
 228                if (total > buf_sz) {
 229                        buf_len = NX_MIN(buf_len, total - buf_sz);
 230                        nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
 231                        NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
 232                                      buf_len));
 233                        NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
 234                        break;
 235                }
 236                nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
 237        }
 238        return ERR_NX_OK;
 239}
 240
 241/*
 242 * Src and dst buffers are supplied in scatter gather lists.
 243 * NX function code and other parameters supplied in cmdp.
 244 */
 245static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
 246                         struct nx_gzip_crb_cpb_t *cmdp, void *handle)
 247{
 248        uint64_t csbaddr;
 249
 250        memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
 251
 252        cmdp->crb.source_dde = *src;
 253        cmdp->crb.target_dde = *dst;
 254
 255        /* Status, output byte count in tpbc */
 256        csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
 257        put64(cmdp->crb, csb_address, csbaddr);
 258
 259        /* NX reports input bytes in spbc; cleared */
 260        cmdp->cpb.out_spbc_comp_wrap = 0;
 261        cmdp->cpb.out_spbc_comp_with_count = 0;
 262        cmdp->cpb.out_spbc_decomp = 0;
 263
 264        /* Clear output */
 265        put32(cmdp->cpb, out_crc, INIT_CRC);
 266        put32(cmdp->cpb, out_adler, INIT_ADLER);
 267
 268        /* Submit the crb, the job descriptor, to the accelerator. */
 269        return nxu_submit_job(cmdp, handle);
 270}
 271
 272int decompress_file(int argc, char **argv, void *devhandle)
 273{
 274        FILE *inpf = NULL;
 275        FILE *outf = NULL;
 276
 277        int c, expect, i, cc, rc = 0;
 278        char gzfname[FNAME_MAX];
 279
 280        /* Queuing, file ops, byte counting */
 281        char *fifo_in, *fifo_out;
 282        int used_in, cur_in, used_out, cur_out, read_sz, n;
 283        int first_free, last_free, first_used, last_used;
 284        int first_offset, last_offset;
 285        int write_sz, free_space, source_sz;
 286        int source_sz_estimate, target_sz_estimate;
 287        uint64_t last_comp_ratio = 0; /* 1000 max */
 288        uint64_t total_out = 0;
 289        int is_final, is_eof;
 290
 291        /* nx hardware */
 292        int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
 293        int history_len = 0;
 294        struct nx_gzip_crb_cpb_t cmd, *cmdp;
 295        struct nx_dde_t *ddl_in;
 296        struct nx_dde_t dde_in[6] __aligned(128);
 297        struct nx_dde_t *ddl_out;
 298        struct nx_dde_t dde_out[6] __aligned(128);
 299        int pgfault_retries;
 300
 301        /* when using mmap'ed files */
 302        off_t input_file_offset;
 303
 304        if (argc > 2) {
 305                fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
 306                fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
 307                return -1;
 308        }
 309
 310        if (argc == 1) {
 311                inpf = stdin;
 312                outf = stdout;
 313        } else if (argc == 2) {
 314                char w[1024];
 315                char *wp;
 316
 317                inpf = fopen(argv[1], "r");
 318                if (inpf == NULL) {
 319                        perror(argv[1]);
 320                        return -1;
 321                }
 322
 323                /* Make a new file name to write to.  Ignoring '.gz' */
 324                wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1];
 325                strcpy(w, wp);
 326                strcat(w, ".nx.gunzip");
 327
 328                outf = fopen(w, "w");
 329                if (outf == NULL) {
 330                        perror(w);
 331                        return -1;
 332                }
 333        }
 334
 335        /* Decode the gzip header */
 336        c = GETINPC(inpf); expect = 0x1f; /* ID1 */
 337        if (c != expect)
 338                goto err1;
 339
 340        c = GETINPC(inpf); expect = 0x8b; /* ID2 */
 341        if (c != expect)
 342                goto err1;
 343
 344        c = GETINPC(inpf); expect = 0x08; /* CM */
 345        if (c != expect)
 346                goto err1;
 347
 348        int flg = GETINPC(inpf); /* FLG */
 349
 350        if (flg & 0xE0 || flg & 0x4 || flg == EOF)
 351                goto err2;
 352
 353        fprintf(stderr, "gzHeader FLG %x\n", flg);
 354
 355        /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
 356         * sample code.
 357         */
 358        for (i = 0; i < 6; i++) {
 359                char tmp[10];
 360
 361                tmp[i] = GETINPC(inpf);
 362                if (tmp[i] == EOF)
 363                        goto err3;
 364                fprintf(stderr, "%02x ", tmp[i]);
 365                if (i == 5)
 366                        fprintf(stderr, "\n");
 367        }
 368        fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
 369
 370        /* FNAME */
 371        if (flg & 0x8) {
 372                int k = 0;
 373
 374                do {
 375                        c = GETINPC(inpf);
 376                        if (c == EOF || k >= FNAME_MAX)
 377                                goto err3;
 378                        gzfname[k++] = c;
 379                } while (c);
 380                fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
 381        }
 382
 383        /* FHCRC */
 384        if (flg & 0x2) {
 385                c = GETINPC(inpf);
 386                if (c == EOF)
 387                        goto err3;
 388                c = GETINPC(inpf);
 389                if (c == EOF)
 390                        goto err3;
 391                fprintf(stderr, "gzHeader FHCRC: ignored\n");
 392        }
 393
 394        used_in = cur_in = used_out = cur_out = 0;
 395        is_final = is_eof = 0;
 396
 397        /* Allocate one page larger to prevent page faults due to NX
 398         * overfetching.
 399         * Either do this (char*)(uintptr_t)aligned_alloc or use
 400         * -std=c11 flag to make the int-to-pointer warning go away.
 401         */
 402        assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
 403                                   fifo_in_len + page_sz)) != NULL);
 404        assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
 405                                   fifo_out_len + page_sz + line_sz)) != NULL);
 406        /* Leave unused space due to history rounding rules */
 407        fifo_out = fifo_out + line_sz;
 408        nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
 409
 410        ddl_in  = &dde_in[0];
 411        ddl_out = &dde_out[0];
 412        cmdp = &cmd;
 413        memset(&cmdp->crb, 0, sizeof(cmdp->crb));
 414
 415read_state:
 416
 417        /* Read from .gz file */
 418
 419        NXPRT(fprintf(stderr, "read_state:\n"));
 420
 421        if (is_eof != 0)
 422                goto write_state;
 423
 424        /* We read in to fifo_in in two steps: first: read in to from
 425         * cur_in to the end of the buffer.  last: if free space wrapped
 426         * around, read from fifo_in offset 0 to offset cur_in.
 427         */
 428
 429        /* Reset fifo head to reduce unnecessary wrap arounds */
 430        cur_in = (used_in == 0) ? 0 : cur_in;
 431
 432        /* Free space total is reduced by a gap */
 433        free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
 434                            - line_sz);
 435
 436        /* Free space may wrap around as first and last */
 437        first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
 438        last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
 439
 440        /* Start offsets of the free memory */
 441        first_offset = fifo_free_first_offset(cur_in, used_in);
 442        last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
 443
 444        /* Reduce read_sz because of the line_sz gap */
 445        read_sz = NX_MIN(free_space, first_free);
 446        n = 0;
 447        if (read_sz > 0) {
 448                /* Read in to offset cur_in + used_in */
 449                n = fread(fifo_in + first_offset, 1, read_sz, inpf);
 450                used_in = used_in + n;
 451                free_space = free_space - n;
 452                assert(n <= read_sz);
 453                if (n != read_sz) {
 454                        /* Either EOF or error; exit the read loop */
 455                        is_eof = 1;
 456                        goto write_state;
 457                }
 458        }
 459
 460        /* If free space wrapped around */
 461        if (last_free > 0) {
 462                /* Reduce read_sz because of the line_sz gap */
 463                read_sz = NX_MIN(free_space, last_free);
 464                n = 0;
 465                if (read_sz > 0) {
 466                        n = fread(fifo_in + last_offset, 1, read_sz, inpf);
 467                        used_in = used_in + n;       /* Increase used space */
 468                        free_space = free_space - n; /* Decrease free space */
 469                        assert(n <= read_sz);
 470                        if (n != read_sz) {
 471                                /* Either EOF or error; exit the read loop */
 472                                is_eof = 1;
 473                                goto write_state;
 474                        }
 475                }
 476        }
 477
 478        /* At this point we have used_in bytes in fifo_in with the
 479         * data head starting at cur_in and possibly wrapping around.
 480         */
 481
 482write_state:
 483
 484        /* Write decompressed data to output file */
 485
 486        NXPRT(fprintf(stderr, "write_state:\n"));
 487
 488        if (used_out == 0)
 489                goto decomp_state;
 490
 491        /* If fifo_out has data waiting, write it out to the file to
 492         * make free target space for the accelerator used bytes in
 493         * the first and last parts of fifo_out.
 494         */
 495
 496        first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
 497        last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
 498
 499        write_sz = first_used;
 500
 501        n = 0;
 502        if (write_sz > 0) {
 503                n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
 504                used_out = used_out - n;
 505                /* Move head of the fifo */
 506                cur_out = (cur_out + n) % fifo_out_len;
 507                assert(n <= write_sz);
 508                if (n != write_sz) {
 509                        fprintf(stderr, "error: write\n");
 510                        rc = -1;
 511                        goto err5;
 512                }
 513        }
 514
 515        if (last_used > 0) { /* If more data available in the last part */
 516                write_sz = last_used; /* Keep it here for later */
 517                n = 0;
 518                if (write_sz > 0) {
 519                        n = fwrite(fifo_out, 1, write_sz, outf);
 520                        used_out = used_out - n;
 521                        cur_out = (cur_out + n) % fifo_out_len;
 522                        assert(n <= write_sz);
 523                        if (n != write_sz) {
 524                                fprintf(stderr, "error: write\n");
 525                                rc = -1;
 526                                goto err5;
 527                        }
 528                }
 529        }
 530
 531decomp_state:
 532
 533        /* NX decompresses input data */
 534
 535        NXPRT(fprintf(stderr, "decomp_state:\n"));
 536
 537        if (is_final)
 538                goto finish_state;
 539
 540        /* Address/len lists */
 541        clearp_dde(ddl_in);
 542        clearp_dde(ddl_out);
 543
 544        /* FC, CRC, HistLen, Table 6-6 */
 545        if (resuming) {
 546                /* Resuming a partially decompressed input.
 547                 * The key to resume is supplying the 32KB
 548                 * dictionary (history) to NX, which is basically
 549                 * the last 32KB of output produced.
 550                 */
 551                fc = GZIP_FC_DECOMPRESS_RESUME;
 552
 553                cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
 554                cmdp->cpb.in_adler = cmdp->cpb.out_adler;
 555
 556                /* Round up the history size to quadword.  Section 2.10 */
 557                history_len = (history_len + 15) / 16;
 558                putnn(cmdp->cpb, in_histlen, history_len);
 559                history_len = history_len * 16; /* bytes */
 560
 561                if (history_len > 0) {
 562                        /* Chain in the history buffer to the DDE list */
 563                        if (cur_out >= history_len) {
 564                                nx_append_dde(ddl_in, fifo_out
 565                                              + (cur_out - history_len),
 566                                              history_len);
 567                        } else {
 568                                nx_append_dde(ddl_in, fifo_out
 569                                              + ((fifo_out_len + cur_out)
 570                                              - history_len),
 571                                              history_len - cur_out);
 572                                /* Up to 32KB history wraps around fifo_out */
 573                                nx_append_dde(ddl_in, fifo_out, cur_out);
 574                        }
 575
 576                }
 577        } else {
 578                /* First decompress job */
 579                fc = GZIP_FC_DECOMPRESS;
 580
 581                history_len = 0;
 582                /* Writing 0 clears out subc as well */
 583                cmdp->cpb.in_histlen = 0;
 584                total_out = 0;
 585
 586                put32(cmdp->cpb, in_crc, INIT_CRC);
 587                put32(cmdp->cpb, in_adler, INIT_ADLER);
 588                put32(cmdp->cpb, out_crc, INIT_CRC);
 589                put32(cmdp->cpb, out_adler, INIT_ADLER);
 590
 591                /* Assuming 10% compression ratio initially; use the
 592                 * most recently measured compression ratio as a
 593                 * heuristic to estimate the input and output
 594                 * sizes.  If we give too much input, the target buffer
 595                 * overflows and NX cycles are wasted, and then we
 596                 * must retry with smaller input size.  1000 is 100%.
 597                 */
 598                last_comp_ratio = 100UL;
 599        }
 600        cmdp->crb.gzip_fc = 0;
 601        putnn(cmdp->crb, gzip_fc, fc);
 602
 603        /*
 604         * NX source buffers
 605         */
 606        first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
 607        last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
 608
 609        if (first_used > 0)
 610                nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
 611
 612        if (last_used > 0)
 613                nx_append_dde(ddl_in, fifo_in, last_used);
 614
 615        /*
 616         * NX target buffers
 617         */
 618        first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
 619        last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
 620
 621        /* Reduce output free space amount not to overwrite the history */
 622        int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
 623                                - (1<<16));
 624
 625        NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
 626                      target_max));
 627
 628        first_free = NX_MIN(target_max, first_free);
 629        if (first_free > 0) {
 630                first_offset = fifo_free_first_offset(cur_out, used_out);
 631                nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
 632        }
 633
 634        if (last_free > 0) {
 635                last_free = NX_MIN(target_max - first_free, last_free);
 636                if (last_free > 0) {
 637                        last_offset = fifo_free_last_offset(cur_out, used_out,
 638                                                            fifo_out_len);
 639                        nx_append_dde(ddl_out, fifo_out + last_offset,
 640                                      last_free);
 641                }
 642        }
 643
 644        /* Target buffer size is used to limit the source data size
 645         * based on previous measurements of compression ratio.
 646         */
 647
 648        /* source_sz includes history */
 649        source_sz = getp32(ddl_in, ddebc);
 650        assert(source_sz > history_len);
 651        source_sz = source_sz - history_len;
 652
 653        /* Estimating how much source is needed to 3/4 fill a
 654         * target_max size target buffer.  If we overshoot, then NX
 655         * must repeat the job with smaller input and we waste
 656         * bandwidth.  If we undershoot then we use more NX calls than
 657         * necessary.
 658         */
 659
 660        source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
 661                                / 4000;
 662
 663        if (source_sz_estimate < source_sz) {
 664                /* Target might be small, therefore limiting the
 665                 * source data.
 666                 */
 667                source_sz = source_sz_estimate;
 668                target_sz_estimate = target_max;
 669        } else {
 670                /* Source file might be small, therefore limiting target
 671                 * touch pages to a smaller value to save processor cycles.
 672                 */
 673                target_sz_estimate = ((uint64_t)source_sz * 1000UL)
 674                                        / (last_comp_ratio + 1);
 675                target_sz_estimate = NX_MIN(2 * target_sz_estimate,
 676                                            target_max);
 677        }
 678
 679        source_sz = source_sz + history_len;
 680
 681        /* Some NX condition codes require submitting the NX job again.
 682         * Kernel doesn't handle NX page faults. Expects user code to
 683         * touch pages.
 684         */
 685        pgfault_retries = NX_MAX_FAULTS;
 686
 687restart_nx:
 688
 689        putp32(ddl_in, ddebc, source_sz);
 690
 691        /* Fault in pages */
 692        nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1);
 693        nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
 694        nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
 695
 696        /* Send job to NX */
 697        cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
 698
 699        switch (cc) {
 700
 701        case ERR_NX_AT_FAULT:
 702
 703                /* We touched the pages ahead of time.  In the most common case
 704                 * we shouldn't be here.  But may be some pages were paged out.
 705                 * Kernel should have placed the faulting address to fsaddr.
 706                 */
 707                NXPRT(fprintf(stderr, "ERR_NX_AT_FAULT %p\n",
 708                              (void *)cmdp->crb.csb.fsaddr));
 709
 710                if (pgfault_retries == NX_MAX_FAULTS) {
 711                        /* Try once with exact number of pages */
 712                        --pgfault_retries;
 713                        goto restart_nx;
 714                } else if (pgfault_retries > 0) {
 715                        /* If still faulting try fewer input pages
 716                         * assuming memory outage
 717                         */
 718                        if (source_sz > page_sz)
 719                                source_sz = NX_MAX(source_sz / 2, page_sz);
 720                        --pgfault_retries;
 721                        goto restart_nx;
 722                } else {
 723                        fprintf(stderr, "cannot make progress; too many ");
 724                        fprintf(stderr, "page fault retries cc= %d\n", cc);
 725                        rc = -1;
 726                        goto err5;
 727                }
 728
 729        case ERR_NX_DATA_LENGTH:
 730
 731                NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; "));
 732                NXPRT(fprintf(stderr, "stream may have trailing data\n"));
 733
 734                /* Not an error in the most common case; it just says
 735                 * there is trailing data that we must examine.
 736                 *
 737                 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
 738                 * Fig.6-7 and Table 6-8.
 739                 */
 740                nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
 741
 742                if (!csb_ce_termination(nx_ce) &&
 743                    csb_ce_partial_completion(nx_ce)) {
 744                        /* Check CPB for more information
 745                         * spbc and tpbc are valid
 746                         */
 747                        sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
 748                        subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
 749                        spbc = get32(cmdp->cpb, out_spbc_decomp);
 750                        tpbc = get32(cmdp->crb.csb, tpbc);
 751                        assert(target_max >= tpbc);
 752
 753                        goto ok_cc3; /* not an error */
 754                } else {
 755                        /* History length error when CE(1)=1 CE(0)=0. */
 756                        rc = -1;
 757                        fprintf(stderr, "history length error cc= %d\n", cc);
 758                        goto err5;
 759                }
 760
 761        case ERR_NX_TARGET_SPACE:
 762
 763                /* Target buffer not large enough; retry smaller input
 764                 * data; give at least 1 byte.  SPBC/TPBC are not valid.
 765                 */
 766                assert(source_sz > history_len);
 767                source_sz = ((source_sz - history_len + 2) / 2) + history_len;
 768                NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "));
 769                NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n",
 770                              source_sz, history_len));
 771                goto restart_nx;
 772
 773        case ERR_NX_OK:
 774
 775                /* This should not happen for gzip formatted data;
 776                 * we need trailing crc and isize
 777                 */
 778                fprintf(stderr, "ERR_NX_OK\n");
 779                spbc = get32(cmdp->cpb, out_spbc_decomp);
 780                tpbc = get32(cmdp->crb.csb, tpbc);
 781                assert(target_max >= tpbc);
 782                assert(spbc >= history_len);
 783                source_sz = spbc - history_len;
 784                goto offsets_state;
 785
 786        default:
 787                fprintf(stderr, "error: cc= %d\n", cc);
 788                rc = -1;
 789                goto err5;
 790        }
 791
 792ok_cc3:
 793
 794        NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
 795
 796        assert(spbc > history_len);
 797        source_sz = spbc - history_len;
 798
 799        /* Table 6-4: Source Final Block Type (SFBT) describes the
 800         * last processed deflate block and clues the software how to
 801         * resume the next job.  SUBC indicates how many input bits NX
 802         * consumed but did not process.  SPBC indicates how many
 803         * bytes of source were given to the accelerator including
 804         * history bytes.
 805         */
 806
 807        switch (sfbt) {
 808                int dhtlen;
 809
 810        case 0x0: /* Deflate final EOB received */
 811
 812                /* Calculating the checksum start position. */
 813
 814                source_sz = source_sz - subc / 8;
 815                is_final = 1;
 816                break;
 817
 818                /* Resume decompression cases are below. Basically
 819                 * indicates where NX has suspended and how to resume
 820                 * the input stream.
 821                 */
 822
 823        case 0x8: /* Within a literal block; use rembytecount */
 824        case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */
 825
 826                /* Supply the partially processed source byte again */
 827                source_sz = source_sz - ((subc + 7) / 8);
 828
 829                /* SUBC LS 3bits: number of bits in the first source byte need
 830                 * to be processed.
 831                 * 000 means all 8 bits;  Table 6-3
 832                 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
 833                 */
 834                cmdp->cpb.in_subc = 0;
 835                cmdp->cpb.in_sfbt = 0;
 836                putnn(cmdp->cpb, in_subc, subc % 8);
 837                putnn(cmdp->cpb, in_sfbt, sfbt);
 838                putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
 839                                                      out_rembytecnt));
 840                break;
 841
 842        case 0xA: /* Within a FH block; */
 843        case 0xB: /* Within a FH block; bfinal=1 */
 844
 845                source_sz = source_sz - ((subc + 7) / 8);
 846
 847                /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
 848                cmdp->cpb.in_subc = 0;
 849                cmdp->cpb.in_sfbt = 0;
 850                putnn(cmdp->cpb, in_subc, subc % 8);
 851                putnn(cmdp->cpb, in_sfbt, sfbt);
 852                break;
 853
 854        case 0xC: /* Within a DH block; */
 855        case 0xD: /* Within a DH block; bfinal=1 */
 856
 857                source_sz = source_sz - ((subc + 7) / 8);
 858
 859                /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
 860                cmdp->cpb.in_subc = 0;
 861                cmdp->cpb.in_sfbt = 0;
 862                putnn(cmdp->cpb, in_subc, subc % 8);
 863                putnn(cmdp->cpb, in_sfbt, sfbt);
 864
 865                dhtlen = getnn(cmdp->cpb, out_dhtlen);
 866                putnn(cmdp->cpb, in_dhtlen, dhtlen);
 867                assert(dhtlen >= 42);
 868
 869                /* Round up to a qword */
 870                dhtlen = (dhtlen + 127) / 128;
 871
 872                while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
 873                        --dhtlen;
 874                        cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
 875                }
 876                break;
 877
 878        case 0xE: /* Within a block header; bfinal=0; */
 879                     /* Also given if source data exactly ends (SUBC=0) with
 880                      * EOB code with BFINAL=0.  Means the next byte will
 881                      * contain a block header.
 882                      */
 883        case 0xF: /* within a block header with BFINAL=1. */
 884
 885                source_sz = source_sz - ((subc + 7) / 8);
 886
 887                /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
 888                cmdp->cpb.in_subc = 0;
 889                cmdp->cpb.in_sfbt = 0;
 890                putnn(cmdp->cpb, in_subc, subc % 8);
 891                putnn(cmdp->cpb, in_sfbt, sfbt);
 892
 893                /* Engine did not process any data */
 894                if (is_eof && (source_sz == 0))
 895                        is_final = 1;
 896        }
 897
 898offsets_state:
 899
 900        /* Adjust the source and target buffer offsets and lengths  */
 901
 902        NXPRT(fprintf(stderr, "offsets_state:\n"));
 903
 904        /* Delete input data from fifo_in */
 905        used_in = used_in - source_sz;
 906        cur_in = (cur_in + source_sz) % fifo_in_len;
 907        input_file_offset = input_file_offset + source_sz;
 908
 909        /* Add output data to fifo_out */
 910        used_out = used_out + tpbc;
 911
 912        assert(used_out <= fifo_out_len);
 913
 914        total_out = total_out + tpbc;
 915
 916        /* Deflate history is 32KB max.  No need to supply more
 917         * than 32KB on a resume.
 918         */
 919        history_len = (total_out > window_max) ? window_max : total_out;
 920
 921        /* To estimate expected expansion in the next NX job; 500 means 50%.
 922         * Deflate best case is around 1 to 1000.
 923         */
 924        last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
 925                          / ((uint64_t)tpbc + 1);
 926        last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
 927        NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
 928                      last_comp_ratio, source_sz, spbc, tpbc));
 929
 930        resuming = 1;
 931
 932finish_state:
 933
 934        NXPRT(fprintf(stderr, "finish_state:\n"));
 935
 936        if (is_final) {
 937                if (used_out)
 938                        goto write_state; /* More data to write out */
 939                else if (used_in < 8) {
 940                        /* Need at least 8 more bytes containing gzip crc
 941                         * and isize.
 942                         */
 943                        rc = -1;
 944                        goto err4;
 945                } else {
 946                        /* Compare checksums and exit */
 947                        int i;
 948                        unsigned char tail[8];
 949                        uint32_t cksum, isize;
 950
 951                        for (i = 0; i < 8; i++)
 952                                tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
 953                        fprintf(stderr, "computed checksum %08x isize %08x\n",
 954                                cmdp->cpb.out_crc, (uint32_t) (total_out
 955                                % (1ULL<<32)));
 956                        cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
 957                                 | (uint32_t) tail[2]<<16
 958                                 | (uint32_t) tail[3]<<24);
 959                        isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
 960                                 | (uint32_t) tail[6]<<16
 961                                 | (uint32_t) tail[7]<<24);
 962                        fprintf(stderr, "stored   checksum %08x isize %08x\n",
 963                                cksum, isize);
 964
 965                        if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
 966                            (total_out % (1ULL<<32))) {
 967                                rc = 0; goto ok1;
 968                        } else {
 969                                rc = -1; goto err4;
 970                        }
 971                }
 972        } else
 973                goto read_state;
 974
 975        return -1;
 976
 977err1:
 978        fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
 979                expect, c);
 980        return -1;
 981
 982err2:
 983        fprintf(stderr, "error: the FLG byte is wrong or not being handled\n");
 984        return -1;
 985
 986err3:
 987        fprintf(stderr, "error: gzip header\n");
 988        return -1;
 989
 990err4:
 991        fprintf(stderr, "error: checksum missing or mismatch\n");
 992
 993err5:
 994ok1:
 995        fprintf(stderr, "decomp is complete: fclose\n");
 996        fclose(outf);
 997
 998        return rc;
 999}
1000
1001
1002int main(int argc, char **argv)
1003{
1004        int rc;
1005        struct sigaction act;
1006        void *handle;
1007
1008        nx_dbg = 0;
1009        nx_gzip_log = NULL;
1010        act.sa_handler = 0;
1011        act.sa_sigaction = nxu_sigsegv_handler;
1012        act.sa_flags = SA_SIGINFO;
1013        act.sa_restorer = 0;
1014        sigemptyset(&act.sa_mask);
1015        sigaction(SIGSEGV, &act, NULL);
1016
1017        handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
1018        if (!handle) {
1019                fprintf(stderr, "Unable to init NX, errno %d\n", errno);
1020                exit(-1);
1021        }
1022
1023        rc = decompress_file(argc, argv, handle);
1024
1025        nx_function_end(handle);
1026
1027        return rc;
1028}
1029