linux/drivers/media/test-drivers/vicodec/codec-fwht.c
<<
>>
Prefs
   1// SPDX-License-Identifier: LGPL-2.1+
   2/*
   3 * Copyright 2016 Tom aan de Wiel
   4 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
   5 *
   6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
   7 *
   8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
   9 * R.D. Brown, 1977
  10 */
  11
  12#include <linux/string.h>
  13#include <linux/kernel.h>
  14#include <linux/videodev2.h>
  15#include "codec-fwht.h"
  16
/* Returned by derlc() when it would have to read beyond the end of its input. */
#define OVERFLOW_BIT BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
/* Block header flag written by rlc() for PBLOCK blocks. */
#define PFRAME_BIT BIT(15)
/*
 * Header bits 1-12. encode_plane() adds 2 to the header of the previous
 * block for every identical block that follows it, so these bits act as
 * a repeat counter that leaves bit 0 untouched.
 */
#define DUPS_MASK 0x1ffe

/* Block types, as passed to rlc() and returned by decide_blocktype(). */
#define PBLOCK 0
#define IBLOCK 1

/* Run-length value meaning "all remaining coefficients of the block are 0". */
#define ALL_ZEROS 15
  31
  32static const uint8_t zigzag[64] = {
  33        0,
  34        1,  8,
  35        2,  9, 16,
  36        3, 10, 17, 24,
  37        4, 11, 18, 25, 32,
  38        5, 12, 19, 26, 33, 40,
  39        6, 13, 20, 27, 34, 41, 48,
  40        7, 14, 21, 28, 35, 42, 49, 56,
  41        15, 22, 29, 36, 43, 50, 57,
  42        23, 30, 37, 44, 51, 58,
  43        31, 38, 45, 52, 59,
  44        39, 46, 53, 60,
  45        47, 54, 61,
  46        55, 62,
  47        63,
  48};
  49
  50/*
  51 * noinline_for_stack to work around
  52 * https://bugs.llvm.org/show_bug.cgi?id=38809
  53 */
  54static int noinline_for_stack
  55rlc(const s16 *in, __be16 *output, int blocktype)
  56{
  57        s16 block[8 * 8];
  58        s16 *wp = block;
  59        int i = 0;
  60        int x, y;
  61        int ret = 0;
  62
  63        /* read in block from framebuffer */
  64        int lastzero_run = 0;
  65        int to_encode;
  66
  67        for (y = 0; y < 8; y++) {
  68                for (x = 0; x < 8; x++) {
  69                        *wp = in[x + y * 8];
  70                        wp++;
  71                }
  72        }
  73
  74        /* keep track of amount of trailing zeros */
  75        for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
  76                lastzero_run++;
  77
  78        *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
  79        ret++;
  80
  81        to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
  82
  83        i = 0;
  84        while (i < to_encode) {
  85                int cnt = 0;
  86                int tmp;
  87
  88                /* count leading zeros */
  89                while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
  90                        cnt++;
  91                        i++;
  92                        if (i == to_encode) {
  93                                cnt--;
  94                                break;
  95                        }
  96                }
  97                /* 4 bits for run, 12 for coefficient (quantization by 4) */
  98                *output++ = htons((cnt | tmp << 4));
  99                i++;
 100                ret++;
 101        }
 102        if (lastzero_run > 14) {
 103                *output = htons(ALL_ZEROS | 0);
 104                ret++;
 105        }
 106
 107        return ret;
 108}
 109
 110/*
 111 * This function will worst-case increase rlc_in by 65*2 bytes:
 112 * one s16 value for the header and 8 * 8 coefficients of type s16.
 113 */
 114static noinline_for_stack u16
 115derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
 116{
 117        /* header */
 118        const __be16 *input = *rlc_in;
 119        u16 stat;
 120        int dec_count = 0;
 121        s16 block[8 * 8 + 16];
 122        s16 *wp = block;
 123        int i;
 124
 125        if (input > end_of_input)
 126                return OVERFLOW_BIT;
 127        stat = ntohs(*input++);
 128
 129        /*
 130         * Now de-compress, it expands one byte to up to 15 bytes
 131         * (or fills the remainder of the 64 bytes with zeroes if it
 132         * is the last byte to expand).
 133         *
 134         * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
 135         * allow for overflow if the incoming data was malformed.
 136         */
 137        while (dec_count < 8 * 8) {
 138                s16 in;
 139                int length;
 140                int coeff;
 141
 142                if (input > end_of_input)
 143                        return OVERFLOW_BIT;
 144                in = ntohs(*input++);
 145                length = in & 0xf;
 146                coeff = in >> 4;
 147
 148                /* fill remainder with zeros */
 149                if (length == 15) {
 150                        for (i = 0; i < 64 - dec_count; i++)
 151                                *wp++ = 0;
 152                        break;
 153                }
 154
 155                for (i = 0; i < length; i++)
 156                        *wp++ = 0;
 157                *wp++ = coeff;
 158                dec_count += length + 1;
 159        }
 160
 161        wp = block;
 162
 163        for (i = 0; i < 64; i++) {
 164                int pos = zigzag[i];
 165                int y = pos / 8;
 166                int x = pos % 8;
 167
 168                dwht_out[x + y * 8] = *wp++;
 169        }
 170        *rlc_in = input;
 171        return stat;
 172}
 173
 174static const int quant_table[] = {
 175        2, 2, 2, 2, 2, 2,  2,  2,
 176        2, 2, 2, 2, 2, 2,  2,  2,
 177        2, 2, 2, 2, 2, 2,  2,  3,
 178        2, 2, 2, 2, 2, 2,  3,  6,
 179        2, 2, 2, 2, 2, 3,  6,  6,
 180        2, 2, 2, 2, 3, 6,  6,  6,
 181        2, 2, 2, 3, 6, 6,  6,  6,
 182        2, 2, 3, 6, 6, 6,  6,  8,
 183};
 184
 185static const int quant_table_p[] = {
 186        3, 3, 3, 3, 3, 3,  3,  3,
 187        3, 3, 3, 3, 3, 3,  3,  3,
 188        3, 3, 3, 3, 3, 3,  3,  3,
 189        3, 3, 3, 3, 3, 3,  3,  6,
 190        3, 3, 3, 3, 3, 3,  6,  6,
 191        3, 3, 3, 3, 3, 6,  6,  9,
 192        3, 3, 3, 3, 6, 6,  9,  9,
 193        3, 3, 3, 6, 6, 9,  9,  10,
 194};
 195
 196static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
 197{
 198        const int *quant = quant_table;
 199        int i, j;
 200
 201        for (j = 0; j < 8; j++) {
 202                for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
 203                        *coeff >>= *quant;
 204                        if (*coeff >= -qp && *coeff <= qp)
 205                                *coeff = *de_coeff = 0;
 206                        else
 207                                *de_coeff = *coeff << *quant;
 208                }
 209        }
 210}
 211
 212static void dequantize_intra(s16 *coeff)
 213{
 214        const int *quant = quant_table;
 215        int i, j;
 216
 217        for (j = 0; j < 8; j++)
 218                for (i = 0; i < 8; i++, quant++, coeff++)
 219                        *coeff <<= *quant;
 220}
 221
 222static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
 223{
 224        const int *quant = quant_table_p;
 225        int i, j;
 226
 227        for (j = 0; j < 8; j++) {
 228                for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
 229                        *coeff >>= *quant;
 230                        if (*coeff >= -qp && *coeff <= qp)
 231                                *coeff = *de_coeff = 0;
 232                        else
 233                                *de_coeff = *coeff << *quant;
 234                }
 235        }
 236}
 237
 238static void dequantize_inter(s16 *coeff)
 239{
 240        const int *quant = quant_table_p;
 241        int i, j;
 242
 243        for (j = 0; j < 8; j++)
 244                for (i = 0; i < 8; i++, quant++, coeff++)
 245                        *coeff <<= *quant;
 246}
 247
 248static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
 249                                    unsigned int stride,
 250                                    unsigned int input_step, bool intra)
 251{
 252        /* we'll need more than 8 bits for the transformed coefficients */
 253        s32 workspace1[8], workspace2[8];
 254        const u8 *tmp = block;
 255        s16 *out = output_block;
 256        int add = intra ? 256 : 0;
 257        unsigned int i;
 258
 259        /* stage 1 */
 260        for (i = 0; i < 8; i++, tmp += stride, out += 8) {
 261                switch (input_step) {
 262                case 1:
 263                        workspace1[0]  = tmp[0] + tmp[1] - add;
 264                        workspace1[1]  = tmp[0] - tmp[1];
 265
 266                        workspace1[2]  = tmp[2] + tmp[3] - add;
 267                        workspace1[3]  = tmp[2] - tmp[3];
 268
 269                        workspace1[4]  = tmp[4] + tmp[5] - add;
 270                        workspace1[5]  = tmp[4] - tmp[5];
 271
 272                        workspace1[6]  = tmp[6] + tmp[7] - add;
 273                        workspace1[7]  = tmp[6] - tmp[7];
 274                        break;
 275                case 2:
 276                        workspace1[0]  = tmp[0] + tmp[2] - add;
 277                        workspace1[1]  = tmp[0] - tmp[2];
 278
 279                        workspace1[2]  = tmp[4] + tmp[6] - add;
 280                        workspace1[3]  = tmp[4] - tmp[6];
 281
 282                        workspace1[4]  = tmp[8] + tmp[10] - add;
 283                        workspace1[5]  = tmp[8] - tmp[10];
 284
 285                        workspace1[6]  = tmp[12] + tmp[14] - add;
 286                        workspace1[7]  = tmp[12] - tmp[14];
 287                        break;
 288                case 3:
 289                        workspace1[0]  = tmp[0] + tmp[3] - add;
 290                        workspace1[1]  = tmp[0] - tmp[3];
 291
 292                        workspace1[2]  = tmp[6] + tmp[9] - add;
 293                        workspace1[3]  = tmp[6] - tmp[9];
 294
 295                        workspace1[4]  = tmp[12] + tmp[15] - add;
 296                        workspace1[5]  = tmp[12] - tmp[15];
 297
 298                        workspace1[6]  = tmp[18] + tmp[21] - add;
 299                        workspace1[7]  = tmp[18] - tmp[21];
 300                        break;
 301                default:
 302                        workspace1[0]  = tmp[0] + tmp[4] - add;
 303                        workspace1[1]  = tmp[0] - tmp[4];
 304
 305                        workspace1[2]  = tmp[8] + tmp[12] - add;
 306                        workspace1[3]  = tmp[8] - tmp[12];
 307
 308                        workspace1[4]  = tmp[16] + tmp[20] - add;
 309                        workspace1[5]  = tmp[16] - tmp[20];
 310
 311                        workspace1[6]  = tmp[24] + tmp[28] - add;
 312                        workspace1[7]  = tmp[24] - tmp[28];
 313                        break;
 314                }
 315
 316                /* stage 2 */
 317                workspace2[0] = workspace1[0] + workspace1[2];
 318                workspace2[1] = workspace1[0] - workspace1[2];
 319                workspace2[2] = workspace1[1] - workspace1[3];
 320                workspace2[3] = workspace1[1] + workspace1[3];
 321
 322                workspace2[4] = workspace1[4] + workspace1[6];
 323                workspace2[5] = workspace1[4] - workspace1[6];
 324                workspace2[6] = workspace1[5] - workspace1[7];
 325                workspace2[7] = workspace1[5] + workspace1[7];
 326
 327                /* stage 3 */
 328                out[0] = workspace2[0] + workspace2[4];
 329                out[1] = workspace2[0] - workspace2[4];
 330                out[2] = workspace2[1] - workspace2[5];
 331                out[3] = workspace2[1] + workspace2[5];
 332                out[4] = workspace2[2] + workspace2[6];
 333                out[5] = workspace2[2] - workspace2[6];
 334                out[6] = workspace2[3] - workspace2[7];
 335                out[7] = workspace2[3] + workspace2[7];
 336        }
 337
 338        out = output_block;
 339
 340        for (i = 0; i < 8; i++, out++) {
 341                /* stage 1 */
 342                workspace1[0]  = out[0] + out[1 * 8];
 343                workspace1[1]  = out[0] - out[1 * 8];
 344
 345                workspace1[2]  = out[2 * 8] + out[3 * 8];
 346                workspace1[3]  = out[2 * 8] - out[3 * 8];
 347
 348                workspace1[4]  = out[4 * 8] + out[5 * 8];
 349                workspace1[5]  = out[4 * 8] - out[5 * 8];
 350
 351                workspace1[6]  = out[6 * 8] + out[7 * 8];
 352                workspace1[7]  = out[6 * 8] - out[7 * 8];
 353
 354                /* stage 2 */
 355                workspace2[0] = workspace1[0] + workspace1[2];
 356                workspace2[1] = workspace1[0] - workspace1[2];
 357                workspace2[2] = workspace1[1] - workspace1[3];
 358                workspace2[3] = workspace1[1] + workspace1[3];
 359
 360                workspace2[4] = workspace1[4] + workspace1[6];
 361                workspace2[5] = workspace1[4] - workspace1[6];
 362                workspace2[6] = workspace1[5] - workspace1[7];
 363                workspace2[7] = workspace1[5] + workspace1[7];
 364                /* stage 3 */
 365                out[0 * 8] = workspace2[0] + workspace2[4];
 366                out[1 * 8] = workspace2[0] - workspace2[4];
 367                out[2 * 8] = workspace2[1] - workspace2[5];
 368                out[3 * 8] = workspace2[1] + workspace2[5];
 369                out[4 * 8] = workspace2[2] + workspace2[6];
 370                out[5 * 8] = workspace2[2] - workspace2[6];
 371                out[6 * 8] = workspace2[3] - workspace2[7];
 372                out[7 * 8] = workspace2[3] + workspace2[7];
 373        }
 374}
 375
 376/*
 377 * Not the nicest way of doing it, but P-blocks get twice the range of
 378 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 379 * Furthermore values can be negative... This is just a version that
 380 * works with 16 signed data
 381 */
 382static void noinline_for_stack
 383fwht16(const s16 *block, s16 *output_block, int stride, int intra)
 384{
 385        /* we'll need more than 8 bits for the transformed coefficients */
 386        s32 workspace1[8], workspace2[8];
 387        const s16 *tmp = block;
 388        s16 *out = output_block;
 389        int i;
 390
 391        for (i = 0; i < 8; i++, tmp += stride, out += 8) {
 392                /* stage 1 */
 393                workspace1[0]  = tmp[0] + tmp[1];
 394                workspace1[1]  = tmp[0] - tmp[1];
 395
 396                workspace1[2]  = tmp[2] + tmp[3];
 397                workspace1[3]  = tmp[2] - tmp[3];
 398
 399                workspace1[4]  = tmp[4] + tmp[5];
 400                workspace1[5]  = tmp[4] - tmp[5];
 401
 402                workspace1[6]  = tmp[6] + tmp[7];
 403                workspace1[7]  = tmp[6] - tmp[7];
 404
 405                /* stage 2 */
 406                workspace2[0] = workspace1[0] + workspace1[2];
 407                workspace2[1] = workspace1[0] - workspace1[2];
 408                workspace2[2] = workspace1[1] - workspace1[3];
 409                workspace2[3] = workspace1[1] + workspace1[3];
 410
 411                workspace2[4] = workspace1[4] + workspace1[6];
 412                workspace2[5] = workspace1[4] - workspace1[6];
 413                workspace2[6] = workspace1[5] - workspace1[7];
 414                workspace2[7] = workspace1[5] + workspace1[7];
 415
 416                /* stage 3 */
 417                out[0] = workspace2[0] + workspace2[4];
 418                out[1] = workspace2[0] - workspace2[4];
 419                out[2] = workspace2[1] - workspace2[5];
 420                out[3] = workspace2[1] + workspace2[5];
 421                out[4] = workspace2[2] + workspace2[6];
 422                out[5] = workspace2[2] - workspace2[6];
 423                out[6] = workspace2[3] - workspace2[7];
 424                out[7] = workspace2[3] + workspace2[7];
 425        }
 426
 427        out = output_block;
 428
 429        for (i = 0; i < 8; i++, out++) {
 430                /* stage 1 */
 431                workspace1[0]  = out[0] + out[1*8];
 432                workspace1[1]  = out[0] - out[1*8];
 433
 434                workspace1[2]  = out[2*8] + out[3*8];
 435                workspace1[3]  = out[2*8] - out[3*8];
 436
 437                workspace1[4]  = out[4*8] + out[5*8];
 438                workspace1[5]  = out[4*8] - out[5*8];
 439
 440                workspace1[6]  = out[6*8] + out[7*8];
 441                workspace1[7]  = out[6*8] - out[7*8];
 442
 443                /* stage 2 */
 444                workspace2[0] = workspace1[0] + workspace1[2];
 445                workspace2[1] = workspace1[0] - workspace1[2];
 446                workspace2[2] = workspace1[1] - workspace1[3];
 447                workspace2[3] = workspace1[1] + workspace1[3];
 448
 449                workspace2[4] = workspace1[4] + workspace1[6];
 450                workspace2[5] = workspace1[4] - workspace1[6];
 451                workspace2[6] = workspace1[5] - workspace1[7];
 452                workspace2[7] = workspace1[5] + workspace1[7];
 453
 454                /* stage 3 */
 455                out[0*8] = workspace2[0] + workspace2[4];
 456                out[1*8] = workspace2[0] - workspace2[4];
 457                out[2*8] = workspace2[1] - workspace2[5];
 458                out[3*8] = workspace2[1] + workspace2[5];
 459                out[4*8] = workspace2[2] + workspace2[6];
 460                out[5*8] = workspace2[2] - workspace2[6];
 461                out[6*8] = workspace2[3] - workspace2[7];
 462                out[7*8] = workspace2[3] + workspace2[7];
 463        }
 464}
 465
 466static noinline_for_stack void
 467ifwht(const s16 *block, s16 *output_block, int intra)
 468{
 469        /*
 470         * we'll need more than 8 bits for the transformed coefficients
 471         * use native unit of cpu
 472         */
 473        int workspace1[8], workspace2[8];
 474        int inter = intra ? 0 : 1;
 475        const s16 *tmp = block;
 476        s16 *out = output_block;
 477        int i;
 478
 479        for (i = 0; i < 8; i++, tmp += 8, out += 8) {
 480                /* stage 1 */
 481                workspace1[0]  = tmp[0] + tmp[1];
 482                workspace1[1]  = tmp[0] - tmp[1];
 483
 484                workspace1[2]  = tmp[2] + tmp[3];
 485                workspace1[3]  = tmp[2] - tmp[3];
 486
 487                workspace1[4]  = tmp[4] + tmp[5];
 488                workspace1[5]  = tmp[4] - tmp[5];
 489
 490                workspace1[6]  = tmp[6] + tmp[7];
 491                workspace1[7]  = tmp[6] - tmp[7];
 492
 493                /* stage 2 */
 494                workspace2[0] = workspace1[0] + workspace1[2];
 495                workspace2[1] = workspace1[0] - workspace1[2];
 496                workspace2[2] = workspace1[1] - workspace1[3];
 497                workspace2[3] = workspace1[1] + workspace1[3];
 498
 499                workspace2[4] = workspace1[4] + workspace1[6];
 500                workspace2[5] = workspace1[4] - workspace1[6];
 501                workspace2[6] = workspace1[5] - workspace1[7];
 502                workspace2[7] = workspace1[5] + workspace1[7];
 503
 504                /* stage 3 */
 505                out[0] = workspace2[0] + workspace2[4];
 506                out[1] = workspace2[0] - workspace2[4];
 507                out[2] = workspace2[1] - workspace2[5];
 508                out[3] = workspace2[1] + workspace2[5];
 509                out[4] = workspace2[2] + workspace2[6];
 510                out[5] = workspace2[2] - workspace2[6];
 511                out[6] = workspace2[3] - workspace2[7];
 512                out[7] = workspace2[3] + workspace2[7];
 513        }
 514
 515        out = output_block;
 516
 517        for (i = 0; i < 8; i++, out++) {
 518                /* stage 1 */
 519                workspace1[0]  = out[0] + out[1 * 8];
 520                workspace1[1]  = out[0] - out[1 * 8];
 521
 522                workspace1[2]  = out[2 * 8] + out[3 * 8];
 523                workspace1[3]  = out[2 * 8] - out[3 * 8];
 524
 525                workspace1[4]  = out[4 * 8] + out[5 * 8];
 526                workspace1[5]  = out[4 * 8] - out[5 * 8];
 527
 528                workspace1[6]  = out[6 * 8] + out[7 * 8];
 529                workspace1[7]  = out[6 * 8] - out[7 * 8];
 530
 531                /* stage 2 */
 532                workspace2[0] = workspace1[0] + workspace1[2];
 533                workspace2[1] = workspace1[0] - workspace1[2];
 534                workspace2[2] = workspace1[1] - workspace1[3];
 535                workspace2[3] = workspace1[1] + workspace1[3];
 536
 537                workspace2[4] = workspace1[4] + workspace1[6];
 538                workspace2[5] = workspace1[4] - workspace1[6];
 539                workspace2[6] = workspace1[5] - workspace1[7];
 540                workspace2[7] = workspace1[5] + workspace1[7];
 541
 542                /* stage 3 */
 543                if (inter) {
 544                        int d;
 545
 546                        out[0 * 8] = workspace2[0] + workspace2[4];
 547                        out[1 * 8] = workspace2[0] - workspace2[4];
 548                        out[2 * 8] = workspace2[1] - workspace2[5];
 549                        out[3 * 8] = workspace2[1] + workspace2[5];
 550                        out[4 * 8] = workspace2[2] + workspace2[6];
 551                        out[5 * 8] = workspace2[2] - workspace2[6];
 552                        out[6 * 8] = workspace2[3] - workspace2[7];
 553                        out[7 * 8] = workspace2[3] + workspace2[7];
 554
 555                        for (d = 0; d < 8; d++)
 556                                out[8 * d] >>= 6;
 557                } else {
 558                        int d;
 559
 560                        out[0 * 8] = workspace2[0] + workspace2[4];
 561                        out[1 * 8] = workspace2[0] - workspace2[4];
 562                        out[2 * 8] = workspace2[1] - workspace2[5];
 563                        out[3 * 8] = workspace2[1] + workspace2[5];
 564                        out[4 * 8] = workspace2[2] + workspace2[6];
 565                        out[5 * 8] = workspace2[2] - workspace2[6];
 566                        out[6 * 8] = workspace2[3] - workspace2[7];
 567                        out[7 * 8] = workspace2[3] + workspace2[7];
 568
 569                        for (d = 0; d < 8; d++) {
 570                                out[8 * d] >>= 6;
 571                                out[8 * d] += 128;
 572                        }
 573                }
 574        }
 575}
 576
 577static void fill_encoder_block(const u8 *input, s16 *dst,
 578                               unsigned int stride, unsigned int input_step)
 579{
 580        int i, j;
 581
 582        for (i = 0; i < 8; i++) {
 583                for (j = 0; j < 8; j++, input += input_step)
 584                        *dst++ = *input;
 585                input += stride - 8 * input_step;
 586        }
 587}
 588
 589static int var_intra(const s16 *input)
 590{
 591        int32_t mean = 0;
 592        int32_t ret = 0;
 593        const s16 *tmp = input;
 594        int i;
 595
 596        for (i = 0; i < 8 * 8; i++, tmp++)
 597                mean += *tmp;
 598        mean /= 64;
 599        tmp = input;
 600        for (i = 0; i < 8 * 8; i++, tmp++)
 601                ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
 602        return ret;
 603}
 604
 605static int var_inter(const s16 *old, const s16 *new)
 606{
 607        int32_t ret = 0;
 608        int i;
 609
 610        for (i = 0; i < 8 * 8; i++, old++, new++)
 611                ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
 612        return ret;
 613}
 614
 615static noinline_for_stack int
 616decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
 617                 unsigned int stride, unsigned int input_step)
 618{
 619        s16 tmp[64];
 620        s16 old[64];
 621        s16 *work = tmp;
 622        unsigned int k, l;
 623        int vari;
 624        int vard;
 625
 626        fill_encoder_block(cur, tmp, stride, input_step);
 627        fill_encoder_block(reference, old, 8, 1);
 628        vari = var_intra(tmp);
 629
 630        for (k = 0; k < 8; k++) {
 631                for (l = 0; l < 8; l++) {
 632                        *deltablock = *work - *reference;
 633                        deltablock++;
 634                        work++;
 635                        reference++;
 636                }
 637        }
 638        deltablock -= 64;
 639        vard = var_inter(old, tmp);
 640        return vari <= vard ? IBLOCK : PBLOCK;
 641}
 642
 643static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
 644                               unsigned int dst_step)
 645{
 646        int i, j;
 647
 648        for (i = 0; i < 8; i++) {
 649                for (j = 0; j < 8; j++, input++, dst += dst_step) {
 650                        if (*input < 0)
 651                                *dst = 0;
 652                        else if (*input > 255)
 653                                *dst = 255;
 654                        else
 655                                *dst = *input;
 656                }
 657                dst += stride - (8 * dst_step);
 658        }
 659}
 660
 661static void add_deltas(s16 *deltas, const u8 *ref, int stride,
 662                       unsigned int ref_step)
 663{
 664        int k, l;
 665
 666        for (k = 0; k < 8; k++) {
 667                for (l = 0; l < 8; l++) {
 668                        *deltas += *ref;
 669                        ref += ref_step;
 670                        /*
 671                         * Due to quantizing, it might possible that the
 672                         * decoded coefficients are slightly out of range
 673                         */
 674                        if (*deltas < 0)
 675                                *deltas = 0;
 676                        else if (*deltas > 255)
 677                                *deltas = 255;
 678                        deltas++;
 679                }
 680                ref += stride - (8 * ref_step);
 681        }
 682}
 683
 684static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
 685                        struct fwht_cframe *cf, u32 height, u32 width,
 686                        u32 stride, unsigned int input_step,
 687                        bool is_intra, bool next_is_intra)
 688{
 689        u8 *input_start = input;
 690        __be16 *rlco_start = *rlco;
 691        s16 deltablock[64];
 692        __be16 pframe_bit = htons(PFRAME_BIT);
 693        u32 encoding = 0;
 694        unsigned int last_size = 0;
 695        unsigned int i, j;
 696
 697        width = round_up(width, 8);
 698        height = round_up(height, 8);
 699
 700        for (j = 0; j < height / 8; j++) {
 701                input = input_start + j * 8 * stride;
 702                for (i = 0; i < width / 8; i++) {
 703                        /* intra code, first frame is always intra coded. */
 704                        int blocktype = IBLOCK;
 705                        unsigned int size;
 706
 707                        if (!is_intra)
 708                                blocktype = decide_blocktype(input, refp,
 709                                        deltablock, stride, input_step);
 710                        if (blocktype == IBLOCK) {
 711                                fwht(input, cf->coeffs, stride, input_step, 1);
 712                                quantize_intra(cf->coeffs, cf->de_coeffs,
 713                                               cf->i_frame_qp);
 714                        } else {
 715                                /* inter code */
 716                                encoding |= FWHT_FRAME_PCODED;
 717                                fwht16(deltablock, cf->coeffs, 8, 0);
 718                                quantize_inter(cf->coeffs, cf->de_coeffs,
 719                                               cf->p_frame_qp);
 720                        }
 721                        if (!next_is_intra) {
 722                                ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
 723
 724                                if (blocktype == PBLOCK)
 725                                        add_deltas(cf->de_fwht, refp, 8, 1);
 726                                fill_decoder_block(refp, cf->de_fwht, 8, 1);
 727                        }
 728
 729                        input += 8 * input_step;
 730                        refp += 8 * 8;
 731
 732                        size = rlc(cf->coeffs, *rlco, blocktype);
 733                        if (last_size == size &&
 734                            !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
 735                                __be16 *last_rlco = *rlco - size;
 736                                s16 hdr = ntohs(*last_rlco);
 737
 738                                if (!((*last_rlco ^ **rlco) & pframe_bit) &&
 739                                    (hdr & DUPS_MASK) < DUPS_MASK)
 740                                        *last_rlco = htons(hdr + 2);
 741                                else
 742                                        *rlco += size;
 743                        } else {
 744                                *rlco += size;
 745                        }
 746                        if (*rlco >= rlco_max) {
 747                                encoding |= FWHT_FRAME_UNENCODED;
 748                                goto exit_loop;
 749                        }
 750                        last_size = size;
 751                }
 752        }
 753
 754exit_loop:
 755        if (encoding & FWHT_FRAME_UNENCODED) {
 756                u8 *out = (u8 *)rlco_start;
 757                u8 *p;
 758
 759                input = input_start;
 760                /*
 761                 * The compressed stream should never contain the magic
 762                 * header, so when we copy the YUV data we replace 0xff
 763                 * by 0xfe. Since YUV is limited range such values
 764                 * shouldn't appear anyway.
 765                 */
 766                for (j = 0; j < height; j++) {
 767                        for (i = 0, p = input; i < width; i++, p += input_step)
 768                                *out++ = (*p == 0xff) ? 0xfe : *p;
 769                        input += stride;
 770                }
 771                *rlco = (__be16 *)out;
 772                encoding &= ~FWHT_FRAME_PCODED;
 773        }
 774        return encoding;
 775}
 776
 777u32 fwht_encode_frame(struct fwht_raw_frame *frm,
 778                      struct fwht_raw_frame *ref_frm,
 779                      struct fwht_cframe *cf,
 780                      bool is_intra, bool next_is_intra,
 781                      unsigned int width, unsigned int height,
 782                      unsigned int stride, unsigned int chroma_stride)
 783{
 784        unsigned int size = height * width;
 785        __be16 *rlco = cf->rlc_data;
 786        __be16 *rlco_max;
 787        u32 encoding;
 788
 789        rlco_max = rlco + size / 2 - 256;
 790        encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
 791                                height, width, stride,
 792                                frm->luma_alpha_step, is_intra, next_is_intra);
 793        if (encoding & FWHT_FRAME_UNENCODED)
 794                encoding |= FWHT_LUMA_UNENCODED;
 795        encoding &= ~FWHT_FRAME_UNENCODED;
 796
 797        if (frm->components_num >= 3) {
 798                u32 chroma_h = height / frm->height_div;
 799                u32 chroma_w = width / frm->width_div;
 800                unsigned int chroma_size = chroma_h * chroma_w;
 801
 802                rlco_max = rlco + chroma_size / 2 - 256;
 803                encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
 804                                         cf, chroma_h, chroma_w,
 805                                         chroma_stride, frm->chroma_step,
 806                                         is_intra, next_is_intra);
 807                if (encoding & FWHT_FRAME_UNENCODED)
 808                        encoding |= FWHT_CB_UNENCODED;
 809                encoding &= ~FWHT_FRAME_UNENCODED;
 810                rlco_max = rlco + chroma_size / 2 - 256;
 811                encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
 812                                         cf, chroma_h, chroma_w,
 813                                         chroma_stride, frm->chroma_step,
 814                                         is_intra, next_is_intra);
 815                if (encoding & FWHT_FRAME_UNENCODED)
 816                        encoding |= FWHT_CR_UNENCODED;
 817                encoding &= ~FWHT_FRAME_UNENCODED;
 818        }
 819
 820        if (frm->components_num == 4) {
 821                rlco_max = rlco + size / 2 - 256;
 822                encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
 823                                         rlco_max, cf, height, width,
 824                                         stride, frm->luma_alpha_step,
 825                                         is_intra, next_is_intra);
 826                if (encoding & FWHT_FRAME_UNENCODED)
 827                        encoding |= FWHT_ALPHA_UNENCODED;
 828                encoding &= ~FWHT_FRAME_UNENCODED;
 829        }
 830
 831        cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
 832        return encoding;
 833}
 834
 835static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
 836                         u32 height, u32 width, const u8 *ref, u32 ref_stride,
 837                         unsigned int ref_step, u8 *dst,
 838                         unsigned int dst_stride, unsigned int dst_step,
 839                         bool uncompressed, const __be16 *end_of_rlco_buf)
 840{
 841        unsigned int copies = 0;
 842        s16 copy[8 * 8];
 843        u16 stat;
 844        unsigned int i, j;
 845        bool is_intra = !ref;
 846
 847        width = round_up(width, 8);
 848        height = round_up(height, 8);
 849
 850        if (uncompressed) {
 851                int i;
 852
 853                if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
 854                        return false;
 855                for (i = 0; i < height; i++) {
 856                        memcpy(dst, *rlco, width);
 857                        dst += dst_stride;
 858                        *rlco += width / 2;
 859                }
 860                return true;
 861        }
 862
 863        /*
 864         * When decoding each macroblock the rlco pointer will be increased
 865         * by 65 * 2 bytes worst-case.
 866         * To avoid overflow the buffer has to be 65/64th of the actual raw
 867         * image size, just in case someone feeds it malicious data.
 868         */
 869        for (j = 0; j < height / 8; j++) {
 870                for (i = 0; i < width / 8; i++) {
 871                        const u8 *refp = ref + j * 8 * ref_stride +
 872                                i * 8 * ref_step;
 873                        u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
 874
 875                        if (copies) {
 876                                memcpy(cf->de_fwht, copy, sizeof(copy));
 877                                if ((stat & PFRAME_BIT) && !is_intra)
 878                                        add_deltas(cf->de_fwht, refp,
 879                                                   ref_stride, ref_step);
 880                                fill_decoder_block(dstp, cf->de_fwht,
 881                                                   dst_stride, dst_step);
 882                                copies--;
 883                                continue;
 884                        }
 885
 886                        stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
 887                        if (stat & OVERFLOW_BIT)
 888                                return false;
 889                        if ((stat & PFRAME_BIT) && !is_intra)
 890                                dequantize_inter(cf->coeffs);
 891                        else
 892                                dequantize_intra(cf->coeffs);
 893
 894                        ifwht(cf->coeffs, cf->de_fwht,
 895                              ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
 896
 897                        copies = (stat & DUPS_MASK) >> 1;
 898                        if (copies)
 899                                memcpy(copy, cf->de_fwht, sizeof(copy));
 900                        if ((stat & PFRAME_BIT) && !is_intra)
 901                                add_deltas(cf->de_fwht, refp,
 902                                           ref_stride, ref_step);
 903                        fill_decoder_block(dstp, cf->de_fwht, dst_stride,
 904                                           dst_step);
 905                }
 906        }
 907        return true;
 908}
 909
 910bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
 911                       unsigned int components_num, unsigned int width,
 912                       unsigned int height, const struct fwht_raw_frame *ref,
 913                       unsigned int ref_stride, unsigned int ref_chroma_stride,
 914                       struct fwht_raw_frame *dst, unsigned int dst_stride,
 915                       unsigned int dst_chroma_stride)
 916{
 917        const __be16 *rlco = cf->rlc_data;
 918        const __be16 *end_of_rlco_buf = cf->rlc_data +
 919                        (cf->size / sizeof(*rlco)) - 1;
 920
 921        if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
 922                          ref->luma_alpha_step, dst->luma, dst_stride,
 923                          dst->luma_alpha_step,
 924                          hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
 925                          end_of_rlco_buf))
 926                return false;
 927
 928        if (components_num >= 3) {
 929                u32 h = height;
 930                u32 w = width;
 931
 932                if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
 933                        h /= 2;
 934                if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
 935                        w /= 2;
 936
 937                if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
 938                                  ref->chroma_step, dst->cb, dst_chroma_stride,
 939                                  dst->chroma_step,
 940                                  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
 941                                  end_of_rlco_buf))
 942                        return false;
 943                if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
 944                                  ref->chroma_step, dst->cr, dst_chroma_stride,
 945                                  dst->chroma_step,
 946                                  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
 947                                  end_of_rlco_buf))
 948                        return false;
 949        }
 950
 951        if (components_num == 4)
 952                if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
 953                                  ref->luma_alpha_step, dst->alpha, dst_stride,
 954                                  dst->luma_alpha_step,
 955                                  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
 956                                  end_of_rlco_buf))
 957                        return false;
 958        return true;
 959}
 960