linux/drivers/media/platform/vicodec/codec-fwht.c
<<
>>
Prefs
   1// SPDX-License-Identifier: LGPL-2.1+
   2/*
   3 * Copyright 2016 Tom aan de Wiel
   4 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
   5 *
   6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
   7 *
   8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
   9 * R.D. Brown, 1977
  10 */
  11
  12#include <linux/string.h>
  13#include "codec-fwht.h"
  14
  15/*
  16 * Note: bit 0 of the header must always be 0. Otherwise it cannot
  17 * be guaranteed that the magic 8 byte sequence (see below) can
  18 * never occur in the rlc output.
  19 */
  20#define PFRAME_BIT BIT(15)
  21#define DUPS_MASK 0x1ffe
  22
  23#define PBLOCK 0
  24#define IBLOCK 1
  25
  26#define ALL_ZEROS 15
  27
  28static const uint8_t zigzag[64] = {
  29        0,
  30        1,  8,
  31        2,  9, 16,
  32        3, 10, 17, 24,
  33        4, 11, 18, 25, 32,
  34        5, 12, 19, 26, 33, 40,
  35        6, 13, 20, 27, 34, 41, 48,
  36        7, 14, 21, 28, 35, 42, 49, 56,
  37        15, 22, 29, 36, 43, 50, 57,
  38        23, 30, 37, 44, 51, 58,
  39        31, 38, 45, 52, 59,
  40        39, 46, 53, 60,
  41        47, 54, 61,
  42        55, 62,
  43        63,
  44};
  45
  46
  47static int rlc(const s16 *in, __be16 *output, int blocktype)
  48{
  49        s16 block[8 * 8];
  50        s16 *wp = block;
  51        int i = 0;
  52        int x, y;
  53        int ret = 0;
  54
  55        /* read in block from framebuffer */
  56        int lastzero_run = 0;
  57        int to_encode;
  58
  59        for (y = 0; y < 8; y++) {
  60                for (x = 0; x < 8; x++) {
  61                        *wp = in[x + y * 8];
  62                        wp++;
  63                }
  64        }
  65
  66        /* keep track of amount of trailing zeros */
  67        for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
  68                lastzero_run++;
  69
  70        *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
  71        ret++;
  72
  73        to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
  74
  75        i = 0;
  76        while (i < to_encode) {
  77                int cnt = 0;
  78                int tmp;
  79
  80                /* count leading zeros */
  81                while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
  82                        cnt++;
  83                        i++;
  84                        if (i == to_encode) {
  85                                cnt--;
  86                                break;
  87                        }
  88                }
  89                /* 4 bits for run, 12 for coefficient (quantization by 4) */
  90                *output++ = htons((cnt | tmp << 4));
  91                i++;
  92                ret++;
  93        }
  94        if (lastzero_run > 14) {
  95                *output = htons(ALL_ZEROS | 0);
  96                ret++;
  97        }
  98
  99        return ret;
 100}
 101
 102/*
 103 * This function will worst-case increase rlc_in by 65*2 bytes:
 104 * one s16 value for the header and 8 * 8 coefficients of type s16.
 105 */
 106static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
 107{
 108        /* header */
 109        const __be16 *input = *rlc_in;
 110        s16 ret = ntohs(*input++);
 111        int dec_count = 0;
 112        s16 block[8 * 8 + 16];
 113        s16 *wp = block;
 114        int i;
 115
 116        /*
 117         * Now de-compress, it expands one byte to up to 15 bytes
 118         * (or fills the remainder of the 64 bytes with zeroes if it
 119         * is the last byte to expand).
 120         *
 121         * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
 122         * allow for overflow if the incoming data was malformed.
 123         */
 124        while (dec_count < 8 * 8) {
 125                s16 in = ntohs(*input++);
 126                int length = in & 0xf;
 127                int coeff = in >> 4;
 128
 129                /* fill remainder with zeros */
 130                if (length == 15) {
 131                        for (i = 0; i < 64 - dec_count; i++)
 132                                *wp++ = 0;
 133                        break;
 134                }
 135
 136                for (i = 0; i < length; i++)
 137                        *wp++ = 0;
 138                *wp++ = coeff;
 139                dec_count += length + 1;
 140        }
 141
 142        wp = block;
 143
 144        for (i = 0; i < 64; i++) {
 145                int pos = zigzag[i];
 146                int y = pos / 8;
 147                int x = pos % 8;
 148
 149                dwht_out[x + y * 8] = *wp++;
 150        }
 151        *rlc_in = input;
 152        return ret;
 153}
 154
 155static const int quant_table[] = {
 156        2, 2, 2, 2, 2, 2,  2,  2,
 157        2, 2, 2, 2, 2, 2,  2,  2,
 158        2, 2, 2, 2, 2, 2,  2,  3,
 159        2, 2, 2, 2, 2, 2,  3,  6,
 160        2, 2, 2, 2, 2, 3,  6,  6,
 161        2, 2, 2, 2, 3, 6,  6,  6,
 162        2, 2, 2, 3, 6, 6,  6,  6,
 163        2, 2, 3, 6, 6, 6,  6,  8,
 164};
 165
 166static const int quant_table_p[] = {
 167        3, 3, 3, 3, 3, 3,  3,  3,
 168        3, 3, 3, 3, 3, 3,  3,  3,
 169        3, 3, 3, 3, 3, 3,  3,  3,
 170        3, 3, 3, 3, 3, 3,  3,  6,
 171        3, 3, 3, 3, 3, 3,  6,  6,
 172        3, 3, 3, 3, 3, 6,  6,  9,
 173        3, 3, 3, 3, 6, 6,  9,  9,
 174        3, 3, 3, 6, 6, 9,  9,  10,
 175};
 176
 177static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
 178{
 179        const int *quant = quant_table;
 180        int i, j;
 181
 182        for (j = 0; j < 8; j++) {
 183                for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
 184                        *coeff >>= *quant;
 185                        if (*coeff >= -qp && *coeff <= qp)
 186                                *coeff = *de_coeff = 0;
 187                        else
 188                                *de_coeff = *coeff << *quant;
 189                }
 190        }
 191}
 192
 193static void dequantize_intra(s16 *coeff)
 194{
 195        const int *quant = quant_table;
 196        int i, j;
 197
 198        for (j = 0; j < 8; j++)
 199                for (i = 0; i < 8; i++, quant++, coeff++)
 200                        *coeff <<= *quant;
 201}
 202
 203static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
 204{
 205        const int *quant = quant_table_p;
 206        int i, j;
 207
 208        for (j = 0; j < 8; j++) {
 209                for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
 210                        *coeff >>= *quant;
 211                        if (*coeff >= -qp && *coeff <= qp)
 212                                *coeff = *de_coeff = 0;
 213                        else
 214                                *de_coeff = *coeff << *quant;
 215                }
 216        }
 217}
 218
 219static void dequantize_inter(s16 *coeff)
 220{
 221        const int *quant = quant_table_p;
 222        int i, j;
 223
 224        for (j = 0; j < 8; j++)
 225                for (i = 0; i < 8; i++, quant++, coeff++)
 226                        *coeff <<= *quant;
 227}
 228
 229static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
 230                 unsigned int input_step, bool intra)
 231{
 232        /* we'll need more than 8 bits for the transformed coefficients */
 233        s32 workspace1[8], workspace2[8];
 234        const u8 *tmp = block;
 235        s16 *out = output_block;
 236        int add = intra ? 256 : 0;
 237        unsigned int i;
 238
 239        /* stage 1 */
 240        stride *= input_step;
 241
 242        for (i = 0; i < 8; i++, tmp += stride, out += 8) {
 243                switch (input_step) {
 244                case 1:
 245                        workspace1[0]  = tmp[0] + tmp[1] - add;
 246                        workspace1[1]  = tmp[0] - tmp[1];
 247
 248                        workspace1[2]  = tmp[2] + tmp[3] - add;
 249                        workspace1[3]  = tmp[2] - tmp[3];
 250
 251                        workspace1[4]  = tmp[4] + tmp[5] - add;
 252                        workspace1[5]  = tmp[4] - tmp[5];
 253
 254                        workspace1[6]  = tmp[6] + tmp[7] - add;
 255                        workspace1[7]  = tmp[6] - tmp[7];
 256                        break;
 257                case 2:
 258                        workspace1[0]  = tmp[0] + tmp[2] - add;
 259                        workspace1[1]  = tmp[0] - tmp[2];
 260
 261                        workspace1[2]  = tmp[4] + tmp[6] - add;
 262                        workspace1[3]  = tmp[4] - tmp[6];
 263
 264                        workspace1[4]  = tmp[8] + tmp[10] - add;
 265                        workspace1[5]  = tmp[8] - tmp[10];
 266
 267                        workspace1[6]  = tmp[12] + tmp[14] - add;
 268                        workspace1[7]  = tmp[12] - tmp[14];
 269                        break;
 270                case 3:
 271                        workspace1[0]  = tmp[0] + tmp[3] - add;
 272                        workspace1[1]  = tmp[0] - tmp[3];
 273
 274                        workspace1[2]  = tmp[6] + tmp[9] - add;
 275                        workspace1[3]  = tmp[6] - tmp[9];
 276
 277                        workspace1[4]  = tmp[12] + tmp[15] - add;
 278                        workspace1[5]  = tmp[12] - tmp[15];
 279
 280                        workspace1[6]  = tmp[18] + tmp[21] - add;
 281                        workspace1[7]  = tmp[18] - tmp[21];
 282                        break;
 283                default:
 284                        workspace1[0]  = tmp[0] + tmp[4] - add;
 285                        workspace1[1]  = tmp[0] - tmp[4];
 286
 287                        workspace1[2]  = tmp[8] + tmp[12] - add;
 288                        workspace1[3]  = tmp[8] - tmp[12];
 289
 290                        workspace1[4]  = tmp[16] + tmp[20] - add;
 291                        workspace1[5]  = tmp[16] - tmp[20];
 292
 293                        workspace1[6]  = tmp[24] + tmp[28] - add;
 294                        workspace1[7]  = tmp[24] - tmp[28];
 295                        break;
 296                }
 297
 298                /* stage 2 */
 299                workspace2[0] = workspace1[0] + workspace1[2];
 300                workspace2[1] = workspace1[0] - workspace1[2];
 301                workspace2[2] = workspace1[1] - workspace1[3];
 302                workspace2[3] = workspace1[1] + workspace1[3];
 303
 304                workspace2[4] = workspace1[4] + workspace1[6];
 305                workspace2[5] = workspace1[4] - workspace1[6];
 306                workspace2[6] = workspace1[5] - workspace1[7];
 307                workspace2[7] = workspace1[5] + workspace1[7];
 308
 309                /* stage 3 */
 310                out[0] = workspace2[0] + workspace2[4];
 311                out[1] = workspace2[0] - workspace2[4];
 312                out[2] = workspace2[1] - workspace2[5];
 313                out[3] = workspace2[1] + workspace2[5];
 314                out[4] = workspace2[2] + workspace2[6];
 315                out[5] = workspace2[2] - workspace2[6];
 316                out[6] = workspace2[3] - workspace2[7];
 317                out[7] = workspace2[3] + workspace2[7];
 318        }
 319
 320        out = output_block;
 321
 322        for (i = 0; i < 8; i++, out++) {
 323                /* stage 1 */
 324                workspace1[0]  = out[0] + out[1 * 8];
 325                workspace1[1]  = out[0] - out[1 * 8];
 326
 327                workspace1[2]  = out[2 * 8] + out[3 * 8];
 328                workspace1[3]  = out[2 * 8] - out[3 * 8];
 329
 330                workspace1[4]  = out[4 * 8] + out[5 * 8];
 331                workspace1[5]  = out[4 * 8] - out[5 * 8];
 332
 333                workspace1[6]  = out[6 * 8] + out[7 * 8];
 334                workspace1[7]  = out[6 * 8] - out[7 * 8];
 335
 336                /* stage 2 */
 337                workspace2[0] = workspace1[0] + workspace1[2];
 338                workspace2[1] = workspace1[0] - workspace1[2];
 339                workspace2[2] = workspace1[1] - workspace1[3];
 340                workspace2[3] = workspace1[1] + workspace1[3];
 341
 342                workspace2[4] = workspace1[4] + workspace1[6];
 343                workspace2[5] = workspace1[4] - workspace1[6];
 344                workspace2[6] = workspace1[5] - workspace1[7];
 345                workspace2[7] = workspace1[5] + workspace1[7];
 346                /* stage 3 */
 347                out[0 * 8] = workspace2[0] + workspace2[4];
 348                out[1 * 8] = workspace2[0] - workspace2[4];
 349                out[2 * 8] = workspace2[1] - workspace2[5];
 350                out[3 * 8] = workspace2[1] + workspace2[5];
 351                out[4 * 8] = workspace2[2] + workspace2[6];
 352                out[5 * 8] = workspace2[2] - workspace2[6];
 353                out[6 * 8] = workspace2[3] - workspace2[7];
 354                out[7 * 8] = workspace2[3] + workspace2[7];
 355        }
 356}
 357
 358/*
 359 * Not the nicest way of doing it, but P-blocks get twice the range of
 360 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 361 * Furthermore values can be negative... This is just a version that
 362 * works with 16 signed data
 363 */
 364static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
 365{
 366        /* we'll need more than 8 bits for the transformed coefficients */
 367        s32 workspace1[8], workspace2[8];
 368        const s16 *tmp = block;
 369        s16 *out = output_block;
 370        int i;
 371
 372        for (i = 0; i < 8; i++, tmp += stride, out += 8) {
 373                /* stage 1 */
 374                workspace1[0]  = tmp[0] + tmp[1];
 375                workspace1[1]  = tmp[0] - tmp[1];
 376
 377                workspace1[2]  = tmp[2] + tmp[3];
 378                workspace1[3]  = tmp[2] - tmp[3];
 379
 380                workspace1[4]  = tmp[4] + tmp[5];
 381                workspace1[5]  = tmp[4] - tmp[5];
 382
 383                workspace1[6]  = tmp[6] + tmp[7];
 384                workspace1[7]  = tmp[6] - tmp[7];
 385
 386                /* stage 2 */
 387                workspace2[0] = workspace1[0] + workspace1[2];
 388                workspace2[1] = workspace1[0] - workspace1[2];
 389                workspace2[2] = workspace1[1] - workspace1[3];
 390                workspace2[3] = workspace1[1] + workspace1[3];
 391
 392                workspace2[4] = workspace1[4] + workspace1[6];
 393                workspace2[5] = workspace1[4] - workspace1[6];
 394                workspace2[6] = workspace1[5] - workspace1[7];
 395                workspace2[7] = workspace1[5] + workspace1[7];
 396
 397                /* stage 3 */
 398                out[0] = workspace2[0] + workspace2[4];
 399                out[1] = workspace2[0] - workspace2[4];
 400                out[2] = workspace2[1] - workspace2[5];
 401                out[3] = workspace2[1] + workspace2[5];
 402                out[4] = workspace2[2] + workspace2[6];
 403                out[5] = workspace2[2] - workspace2[6];
 404                out[6] = workspace2[3] - workspace2[7];
 405                out[7] = workspace2[3] + workspace2[7];
 406        }
 407
 408        out = output_block;
 409
 410        for (i = 0; i < 8; i++, out++) {
 411                /* stage 1 */
 412                workspace1[0]  = out[0] + out[1*8];
 413                workspace1[1]  = out[0] - out[1*8];
 414
 415                workspace1[2]  = out[2*8] + out[3*8];
 416                workspace1[3]  = out[2*8] - out[3*8];
 417
 418                workspace1[4]  = out[4*8] + out[5*8];
 419                workspace1[5]  = out[4*8] - out[5*8];
 420
 421                workspace1[6]  = out[6*8] + out[7*8];
 422                workspace1[7]  = out[6*8] - out[7*8];
 423
 424                /* stage 2 */
 425                workspace2[0] = workspace1[0] + workspace1[2];
 426                workspace2[1] = workspace1[0] - workspace1[2];
 427                workspace2[2] = workspace1[1] - workspace1[3];
 428                workspace2[3] = workspace1[1] + workspace1[3];
 429
 430                workspace2[4] = workspace1[4] + workspace1[6];
 431                workspace2[5] = workspace1[4] - workspace1[6];
 432                workspace2[6] = workspace1[5] - workspace1[7];
 433                workspace2[7] = workspace1[5] + workspace1[7];
 434
 435                /* stage 3 */
 436                out[0*8] = workspace2[0] + workspace2[4];
 437                out[1*8] = workspace2[0] - workspace2[4];
 438                out[2*8] = workspace2[1] - workspace2[5];
 439                out[3*8] = workspace2[1] + workspace2[5];
 440                out[4*8] = workspace2[2] + workspace2[6];
 441                out[5*8] = workspace2[2] - workspace2[6];
 442                out[6*8] = workspace2[3] - workspace2[7];
 443                out[7*8] = workspace2[3] + workspace2[7];
 444        }
 445}
 446
 447static void ifwht(const s16 *block, s16 *output_block, int intra)
 448{
 449        /*
 450         * we'll need more than 8 bits for the transformed coefficients
 451         * use native unit of cpu
 452         */
 453        int workspace1[8], workspace2[8];
 454        int inter = intra ? 0 : 1;
 455        const s16 *tmp = block;
 456        s16 *out = output_block;
 457        int i;
 458
 459        for (i = 0; i < 8; i++, tmp += 8, out += 8) {
 460                /* stage 1 */
 461                workspace1[0]  = tmp[0] + tmp[1];
 462                workspace1[1]  = tmp[0] - tmp[1];
 463
 464                workspace1[2]  = tmp[2] + tmp[3];
 465                workspace1[3]  = tmp[2] - tmp[3];
 466
 467                workspace1[4]  = tmp[4] + tmp[5];
 468                workspace1[5]  = tmp[4] - tmp[5];
 469
 470                workspace1[6]  = tmp[6] + tmp[7];
 471                workspace1[7]  = tmp[6] - tmp[7];
 472
 473                /* stage 2 */
 474                workspace2[0] = workspace1[0] + workspace1[2];
 475                workspace2[1] = workspace1[0] - workspace1[2];
 476                workspace2[2] = workspace1[1] - workspace1[3];
 477                workspace2[3] = workspace1[1] + workspace1[3];
 478
 479                workspace2[4] = workspace1[4] + workspace1[6];
 480                workspace2[5] = workspace1[4] - workspace1[6];
 481                workspace2[6] = workspace1[5] - workspace1[7];
 482                workspace2[7] = workspace1[5] + workspace1[7];
 483
 484                /* stage 3 */
 485                out[0] = workspace2[0] + workspace2[4];
 486                out[1] = workspace2[0] - workspace2[4];
 487                out[2] = workspace2[1] - workspace2[5];
 488                out[3] = workspace2[1] + workspace2[5];
 489                out[4] = workspace2[2] + workspace2[6];
 490                out[5] = workspace2[2] - workspace2[6];
 491                out[6] = workspace2[3] - workspace2[7];
 492                out[7] = workspace2[3] + workspace2[7];
 493        }
 494
 495        out = output_block;
 496
 497        for (i = 0; i < 8; i++, out++) {
 498                /* stage 1 */
 499                workspace1[0]  = out[0] + out[1 * 8];
 500                workspace1[1]  = out[0] - out[1 * 8];
 501
 502                workspace1[2]  = out[2 * 8] + out[3 * 8];
 503                workspace1[3]  = out[2 * 8] - out[3 * 8];
 504
 505                workspace1[4]  = out[4 * 8] + out[5 * 8];
 506                workspace1[5]  = out[4 * 8] - out[5 * 8];
 507
 508                workspace1[6]  = out[6 * 8] + out[7 * 8];
 509                workspace1[7]  = out[6 * 8] - out[7 * 8];
 510
 511                /* stage 2 */
 512                workspace2[0] = workspace1[0] + workspace1[2];
 513                workspace2[1] = workspace1[0] - workspace1[2];
 514                workspace2[2] = workspace1[1] - workspace1[3];
 515                workspace2[3] = workspace1[1] + workspace1[3];
 516
 517                workspace2[4] = workspace1[4] + workspace1[6];
 518                workspace2[5] = workspace1[4] - workspace1[6];
 519                workspace2[6] = workspace1[5] - workspace1[7];
 520                workspace2[7] = workspace1[5] + workspace1[7];
 521
 522                /* stage 3 */
 523                if (inter) {
 524                        int d;
 525
 526                        out[0 * 8] = workspace2[0] + workspace2[4];
 527                        out[1 * 8] = workspace2[0] - workspace2[4];
 528                        out[2 * 8] = workspace2[1] - workspace2[5];
 529                        out[3 * 8] = workspace2[1] + workspace2[5];
 530                        out[4 * 8] = workspace2[2] + workspace2[6];
 531                        out[5 * 8] = workspace2[2] - workspace2[6];
 532                        out[6 * 8] = workspace2[3] - workspace2[7];
 533                        out[7 * 8] = workspace2[3] + workspace2[7];
 534
 535                        for (d = 0; d < 8; d++)
 536                                out[8 * d] >>= 6;
 537                } else {
 538                        int d;
 539
 540                        out[0 * 8] = workspace2[0] + workspace2[4];
 541                        out[1 * 8] = workspace2[0] - workspace2[4];
 542                        out[2 * 8] = workspace2[1] - workspace2[5];
 543                        out[3 * 8] = workspace2[1] + workspace2[5];
 544                        out[4 * 8] = workspace2[2] + workspace2[6];
 545                        out[5 * 8] = workspace2[2] - workspace2[6];
 546                        out[6 * 8] = workspace2[3] - workspace2[7];
 547                        out[7 * 8] = workspace2[3] + workspace2[7];
 548
 549                        for (d = 0; d < 8; d++) {
 550                                out[8 * d] >>= 6;
 551                                out[8 * d] += 128;
 552                        }
 553                }
 554        }
 555}
 556
 557static void fill_encoder_block(const u8 *input, s16 *dst,
 558                               unsigned int stride, unsigned int input_step)
 559{
 560        int i, j;
 561
 562        for (i = 0; i < 8; i++) {
 563                for (j = 0; j < 8; j++, input += input_step)
 564                        *dst++ = *input;
 565                input += (stride - 8) * input_step;
 566        }
 567}
 568
 569static int var_intra(const s16 *input)
 570{
 571        int32_t mean = 0;
 572        int32_t ret = 0;
 573        const s16 *tmp = input;
 574        int i;
 575
 576        for (i = 0; i < 8 * 8; i++, tmp++)
 577                mean += *tmp;
 578        mean /= 64;
 579        tmp = input;
 580        for (i = 0; i < 8 * 8; i++, tmp++)
 581                ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
 582        return ret;
 583}
 584
 585static int var_inter(const s16 *old, const s16 *new)
 586{
 587        int32_t ret = 0;
 588        int i;
 589
 590        for (i = 0; i < 8 * 8; i++, old++, new++)
 591                ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
 592        return ret;
 593}
 594
 595static int decide_blocktype(const u8 *cur, const u8 *reference,
 596                            s16 *deltablock, unsigned int stride,
 597                            unsigned int input_step)
 598{
 599        s16 tmp[64];
 600        s16 old[64];
 601        s16 *work = tmp;
 602        unsigned int k, l;
 603        int vari;
 604        int vard;
 605
 606        fill_encoder_block(cur, tmp, stride, input_step);
 607        fill_encoder_block(reference, old, 8, 1);
 608        vari = var_intra(tmp);
 609
 610        for (k = 0; k < 8; k++) {
 611                for (l = 0; l < 8; l++) {
 612                        *deltablock = *work - *reference;
 613                        deltablock++;
 614                        work++;
 615                        reference++;
 616                }
 617        }
 618        deltablock -= 64;
 619        vard = var_inter(old, tmp);
 620        return vari <= vard ? IBLOCK : PBLOCK;
 621}
 622
 623static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
 624{
 625        int i, j;
 626
 627        for (i = 0; i < 8; i++) {
 628                for (j = 0; j < 8; j++, input++, dst++) {
 629                        if (*input < 0)
 630                                *dst = 0;
 631                        else if (*input > 255)
 632                                *dst = 255;
 633                        else
 634                                *dst = *input;
 635                }
 636                dst += stride - 8;
 637        }
 638}
 639
 640static void add_deltas(s16 *deltas, const u8 *ref, int stride)
 641{
 642        int k, l;
 643
 644        for (k = 0; k < 8; k++) {
 645                for (l = 0; l < 8; l++) {
 646                        *deltas += *ref++;
 647                        /*
 648                         * Due to quantizing, it might possible that the
 649                         * decoded coefficients are slightly out of range
 650                         */
 651                        if (*deltas < 0)
 652                                *deltas = 0;
 653                        else if (*deltas > 255)
 654                                *deltas = 255;
 655                        deltas++;
 656                }
 657                ref += stride - 8;
 658        }
 659}
 660
 661static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
 662                        struct fwht_cframe *cf, u32 height, u32 width,
 663                        unsigned int input_step,
 664                        bool is_intra, bool next_is_intra)
 665{
 666        u8 *input_start = input;
 667        __be16 *rlco_start = *rlco;
 668        s16 deltablock[64];
 669        __be16 pframe_bit = htons(PFRAME_BIT);
 670        u32 encoding = 0;
 671        unsigned int last_size = 0;
 672        unsigned int i, j;
 673
 674        for (j = 0; j < height / 8; j++) {
 675                for (i = 0; i < width / 8; i++) {
 676                        /* intra code, first frame is always intra coded. */
 677                        int blocktype = IBLOCK;
 678                        unsigned int size;
 679
 680                        if (!is_intra)
 681                                blocktype = decide_blocktype(input, refp,
 682                                        deltablock, width, input_step);
 683                        if (blocktype == IBLOCK) {
 684                                fwht(input, cf->coeffs, width, input_step, 1);
 685                                quantize_intra(cf->coeffs, cf->de_coeffs,
 686                                               cf->i_frame_qp);
 687                        } else {
 688                                /* inter code */
 689                                encoding |= FWHT_FRAME_PCODED;
 690                                fwht16(deltablock, cf->coeffs, 8, 0);
 691                                quantize_inter(cf->coeffs, cf->de_coeffs,
 692                                               cf->p_frame_qp);
 693                        }
 694                        if (!next_is_intra) {
 695                                ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
 696
 697                                if (blocktype == PBLOCK)
 698                                        add_deltas(cf->de_fwht, refp, 8);
 699                                fill_decoder_block(refp, cf->de_fwht, 8);
 700                        }
 701
 702                        input += 8 * input_step;
 703                        refp += 8 * 8;
 704
 705                        size = rlc(cf->coeffs, *rlco, blocktype);
 706                        if (last_size == size &&
 707                            !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
 708                                __be16 *last_rlco = *rlco - size;
 709                                s16 hdr = ntohs(*last_rlco);
 710
 711                                if (!((*last_rlco ^ **rlco) & pframe_bit) &&
 712                                    (hdr & DUPS_MASK) < DUPS_MASK)
 713                                        *last_rlco = htons(hdr + 2);
 714                                else
 715                                        *rlco += size;
 716                        } else {
 717                                *rlco += size;
 718                        }
 719                        if (*rlco >= rlco_max) {
 720                                encoding |= FWHT_FRAME_UNENCODED;
 721                                goto exit_loop;
 722                        }
 723                        last_size = size;
 724                }
 725                input += width * 7 * input_step;
 726        }
 727
 728exit_loop:
 729        if (encoding & FWHT_FRAME_UNENCODED) {
 730                u8 *out = (u8 *)rlco_start;
 731
 732                input = input_start;
 733                /*
 734                 * The compressed stream should never contain the magic
 735                 * header, so when we copy the YUV data we replace 0xff
 736                 * by 0xfe. Since YUV is limited range such values
 737                 * shouldn't appear anyway.
 738                 */
 739                for (i = 0; i < height * width; i++, input += input_step)
 740                        *out++ = (*input == 0xff) ? 0xfe : *input;
 741                *rlco = (__be16 *)out;
 742                encoding &= ~FWHT_FRAME_PCODED;
 743        }
 744        return encoding;
 745}
 746
 747u32 fwht_encode_frame(struct fwht_raw_frame *frm,
 748                      struct fwht_raw_frame *ref_frm,
 749                      struct fwht_cframe *cf,
 750                      bool is_intra, bool next_is_intra)
 751{
 752        unsigned int size = frm->height * frm->width;
 753        __be16 *rlco = cf->rlc_data;
 754        __be16 *rlco_max;
 755        u32 encoding;
 756        u32 chroma_h = frm->height / frm->height_div;
 757        u32 chroma_w = frm->width / frm->width_div;
 758        unsigned int chroma_size = chroma_h * chroma_w;
 759
 760        rlco_max = rlco + size / 2 - 256;
 761        encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
 762                                frm->height, frm->width,
 763                                frm->luma_step, is_intra, next_is_intra);
 764        if (encoding & FWHT_FRAME_UNENCODED)
 765                encoding |= FWHT_LUMA_UNENCODED;
 766        encoding &= ~FWHT_FRAME_UNENCODED;
 767        rlco_max = rlco + chroma_size / 2 - 256;
 768        encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
 769                                 chroma_h, chroma_w,
 770                                 frm->chroma_step, is_intra, next_is_intra);
 771        if (encoding & FWHT_FRAME_UNENCODED)
 772                encoding |= FWHT_CB_UNENCODED;
 773        encoding &= ~FWHT_FRAME_UNENCODED;
 774        rlco_max = rlco + chroma_size / 2 - 256;
 775        encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
 776                                 chroma_h, chroma_w,
 777                                 frm->chroma_step, is_intra, next_is_intra);
 778        if (encoding & FWHT_FRAME_UNENCODED)
 779                encoding |= FWHT_CR_UNENCODED;
 780        encoding &= ~FWHT_FRAME_UNENCODED;
 781        cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
 782        return encoding;
 783}
 784
 785static void decode_plane(struct fwht_cframe *cf, const __be16 **rlco, u8 *ref,
 786                         u32 height, u32 width, bool uncompressed)
 787{
 788        unsigned int copies = 0;
 789        s16 copy[8 * 8];
 790        s16 stat;
 791        unsigned int i, j;
 792
 793        if (uncompressed) {
 794                memcpy(ref, *rlco, width * height);
 795                *rlco += width * height / 2;
 796                return;
 797        }
 798
 799        /*
 800         * When decoding each macroblock the rlco pointer will be increased
 801         * by 65 * 2 bytes worst-case.
 802         * To avoid overflow the buffer has to be 65/64th of the actual raw
 803         * image size, just in case someone feeds it malicious data.
 804         */
 805        for (j = 0; j < height / 8; j++) {
 806                for (i = 0; i < width / 8; i++) {
 807                        u8 *refp = ref + j * 8 * width + i * 8;
 808
 809                        if (copies) {
 810                                memcpy(cf->de_fwht, copy, sizeof(copy));
 811                                if (stat & PFRAME_BIT)
 812                                        add_deltas(cf->de_fwht, refp, width);
 813                                fill_decoder_block(refp, cf->de_fwht, width);
 814                                copies--;
 815                                continue;
 816                        }
 817
 818                        stat = derlc(rlco, cf->coeffs);
 819
 820                        if (stat & PFRAME_BIT)
 821                                dequantize_inter(cf->coeffs);
 822                        else
 823                                dequantize_intra(cf->coeffs);
 824
 825                        ifwht(cf->coeffs, cf->de_fwht,
 826                              (stat & PFRAME_BIT) ? 0 : 1);
 827
 828                        copies = (stat & DUPS_MASK) >> 1;
 829                        if (copies)
 830                                memcpy(copy, cf->de_fwht, sizeof(copy));
 831                        if (stat & PFRAME_BIT)
 832                                add_deltas(cf->de_fwht, refp, width);
 833                        fill_decoder_block(refp, cf->de_fwht, width);
 834                }
 835        }
 836}
 837
 838void fwht_decode_frame(struct fwht_cframe *cf, struct fwht_raw_frame *ref,
 839                       u32 hdr_flags)
 840{
 841        const __be16 *rlco = cf->rlc_data;
 842        u32 h = cf->height / 2;
 843        u32 w = cf->width / 2;
 844
 845        if (hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT)
 846                h *= 2;
 847        if (hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH)
 848                w *= 2;
 849        decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
 850                     hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED);
 851        decode_plane(cf, &rlco, ref->cb, h, w,
 852                     hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED);
 853        decode_plane(cf, &rlco, ref->cr, h, w,
 854                     hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED);
 855}
 856