/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32                0x3c
#define CHACHA20_BLOCK_SIZE     64
#define STACK_SIZE              32

#define X0      $t0
#define X1      $t1
#define X2      $t2
#define X3      $t3
#define X4      $t4
#define X5      $t5
#define X6      $t6
#define X7      $t7
#define X8      $t8
#define X9      $t9
#define X10     $v1
#define X11     $s6
#define X12     $s5
#define X13     $s4
#define X14     $s3
#define X15     $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0      $s1
#define T1      $s0
#define T(n)    T ## n
#define X(n)    X ## n

/* Input arguments */
#define STATE           $a0
#define OUT             $a1
#define IN              $a2
#define BYTES           $a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0         $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X         X15
#define SAVED_CA        $s7

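/* IS_UNALIGNED shares $s7 with SAVED_CA. This is safe: the alignment test
 * is only consulted while full blocks remain, and SAVED_CA is written only
 * on the tail path, after the last alignment test has been taken.
 */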
#define IS_UNALIGNED    $s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
        wsbh    n; \
        rotr    n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
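
/* On big-endian CPUs CPU_TO_LE32 is a full 32-bit byte swap: wsbh swaps the
 * bytes within each halfword and the rotr by 16 then swaps the halfwords.
 * MSB/LSB pick the offsets for the lwl/lwr and swl/swr pairs, and ROTx/ROTR
 * arrange the keystream word so the byte-at-a-time tail emits bytes in
 * little-endian (ChaCha stream) order. On little-endian all of this
 * collapses to no-ops plus a plain rotr.
 */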

#define FOR_EACH_WORD(x) \
        x( 0); \
        x( 1); \
        x( 2); \
        x( 3); \
        x( 4); \
        x( 5); \
        x( 6); \
        x( 7); \
        x( 8); \
        x( 9); \
        x(10); \
        x(11); \
        x(12); \
        x(13); \
        x(14); \
        x(15);

#define FOR_EACH_WORD_REV(x) \
        x(15); \
        x(14); \
        x(13); \
        x(12); \
        x(11); \
        x(10); \
        x( 9); \
        x( 8); \
        x( 7); \
        x( 6); \
        x( 5); \
        x( 4); \
        x( 3); \
        x( 2); \
        x( 1); \
        x( 0);

#define PLUS_ONE_0       1
#define PLUS_ONE_1       2
#define PLUS_ONE_2       3
#define PLUS_ONE_3       4
#define PLUS_ONE_4       5
#define PLUS_ONE_5       6
#define PLUS_ONE_6       7
#define PLUS_ONE_7       8
#define PLUS_ONE_8       9
#define PLUS_ONE_9      10
#define PLUS_ONE_10     11
#define PLUS_ONE_11     12
#define PLUS_ONE_12     13
#define PLUS_ONE_13     14
#define PLUS_ONE_14     15
#define PLUS_ONE_15     16
#define PLUS_ONE(x)     PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c)  _CONCAT3(a,b,c)

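/* Each STORE_* entry finalizes one state word: add the original state word
 * (or NONCE_0 for word 12, whose in-memory copy is stale), convert the
 * keystream word to little-endian, XOR it with the input and store the
 * result. The unaligned variant uses the lwl/lwr and swl/swr pairs to
 * access words at arbitrary alignment. The labels are numbered PLUS_ONE(x)
 * so that entry n handles the case of n remaining full words; label 0 is
 * the fall-through that goes straight to the byte tail.
 */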
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
                lw      T0, (x*4)(STATE); \
        .endif; \
        lwl     T1, (x*4)+MSB ## (IN); \
        lwr     T1, (x*4)+LSB ## (IN); \
        .if (x == 12); \
                addu    X ## x, NONCE_0; \
        .else; \
                addu    X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor     X ## x, T1; \
        swl     X ## x, (x*4)+MSB ## (OUT); \
        swr     X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
                lw      T0, (x*4)(STATE); \
        .endif; \
        lw      T1, (x*4) ## (IN); \
        .if (x == 12); \
                addu    X ## x, NONCE_0; \
        .else; \
                addu    X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor     X ## x, T1; \
        sw      X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and for handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
        .set    noreorder; \
        b       .Lchacha_mips_xor_aligned_ ## x ## _b; \
        .if (x == 12); \
                addu    SAVED_X, X ## x, NONCE_0; \
        .else; \
                addu    SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set    reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
        .set    noreorder; \
        b       .Lchacha_mips_xor_unaligned_ ## x ## _b; \
        .if (x == 12); \
                addu    SAVED_X, X ## x, NONCE_0; \
        .else; \
                addu    SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set    reorder

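/* AXR is four ChaCha quarter-round steps run in parallel:
 * X[A] += X[K]; X[V] ^= X[A]; X[V] <<<= S (and likewise for the other
 * three lanes). Four AXR invocations with S = 16, 12, 8, 7 complete one
 * full round over four quarter rounds.
 */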
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
        addu    X(A), X(K); \
        addu    X(B), X(L); \
        addu    X(C), X(M); \
        addu    X(D), X(N); \
        xor     X(V), X(A); \
        xor     X(W), X(B); \
        xor     X(Y), X(C); \
        xor     X(Z), X(D); \
        rotl    X(V), S;    \
        rotl    X(W), S;    \
        rotl    X(Y), S;    \
        rotl    X(Z), S;

.text
.set    reorder
.set    noat
.globl  chacha_crypt_arch
.ent    chacha_crypt_arch
chacha_crypt_arch:
        .frame  $sp, STACK_SIZE, $ra

        /* Load number of rounds (5th argument, passed on the stack) */
        lw      $at, 16($sp)

        addiu   $sp, -STACK_SIZE

        /* Return if BYTES == 0. */
        beqz    BYTES, .Lchacha_mips_end

        lw      NONCE_0, 48(STATE)

        /* Save s0-s7 */
        sw      $s0,  0($sp)
        sw      $s1,  4($sp)
        sw      $s2,  8($sp)
        sw      $s3, 12($sp)
        sw      $s4, 16($sp)
        sw      $s5, 20($sp)
        sw      $s6, 24($sp)
        sw      $s7, 28($sp)

        /* Test whether IN or OUT is unaligned.
         * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
         */
        or      IS_UNALIGNED, IN, OUT
        andi    IS_UNALIGNED, 0x3

        b       .Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
        addiu   IN,  CHACHA20_BLOCK_SIZE
        addiu   OUT, CHACHA20_BLOCK_SIZE
        addiu   NONCE_0, 1

.Lchacha_rounds_start:
        lw      X0,  0(STATE)
        lw      X1,  4(STATE)
        lw      X2,  8(STATE)
        lw      X3,  12(STATE)

        lw      X4,  16(STATE)
        lw      X5,  20(STATE)
        lw      X6,  24(STATE)
        lw      X7,  28(STATE)
        lw      X8,  32(STATE)
        lw      X9,  36(STATE)
        lw      X10, 40(STATE)
        lw      X11, 44(STATE)

        move    X12, NONCE_0
        lw      X13, 52(STATE)
        lw      X14, 56(STATE)
        lw      X15, 60(STATE)

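        /* Each pass through this loop performs two ChaCha rounds: the
         * first four AXR lines are the column round, the last four the
         * diagonal round. $at counts the requested rounds down in steps
         * of two.
         */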
.Loop_chacha_xor_rounds:
        addiu   $at, -2
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
        bnez    $at, .Loop_chacha_xor_rounds

        addiu   BYTES, -(CHACHA20_BLOCK_SIZE)

        /* Is data src/dst unaligned? Jump */
        bnez    IS_UNALIGNED, .Loop_chacha_unaligned

        /* Set number of rounds here to fill the delay slot. */
        lw      $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0 means there is no full block left. */
        bltz    BYTES, .Lchacha_mips_no_full_block_aligned

        FOR_EACH_WORD_REV(STORE_ALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz    BYTES, .Loop_chacha_rounds

        /* Place this here to fill the delay slot. */
        addiu   NONCE_0, 1

        /* BYTES < 0? Handle the last bytes. */
        bltz    BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
        /* Restore used registers */
        lw      $s0,  0($sp)
        lw      $s1,  4($sp)
        lw      $s2,  8($sp)
        lw      $s3, 12($sp)
        lw      $s4, 16($sp)
        lw      $s5, 20($sp)
        lw      $s6, 24($sp)
        lw      $s7, 28($sp)

        /* Write NONCE_0 back to its location in the state */
        sw      NONCE_0, 48(STATE)

.Lchacha_mips_end:
        addiu   $sp, STACK_SIZE
        jr      $ra

.Lchacha_mips_no_full_block_aligned:
        /* Restore the offset on BYTES */
        addiu   BYTES, CHACHA20_BLOCK_SIZE

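        /* Dispatch into the jump table: each entry is two instructions
         * (8 bytes), and $at holds 4 * (number of full words), so
         * inserting $at at bit position 1 scales it by 2 and yields the
         * byte offset of entry n from entry 0. The entry's delay slot
         * computes SAVED_X, the keystream word covering the partial
         * trailing word.
         */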
        /* Get number of full WORDS */
        andi    $at, BYTES, MASK_U32

        /* Load upper half of jump table address */
        lui     T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

        /* Calculate lower-half jump table offset */
        ins     T0, $at, 1, 6

        /* Add offset to STATE */
        addu    T1, STATE, $at

        /* Add lower half of jump table address */
        addiu   T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

        /* Read value from STATE */
        lw      SAVED_CA, 0(T1)

        /* Store the remaining byte counter as a negative value */
        subu    BYTES, $at, BYTES

        jr      T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
        /* Set number of rounds here to fill the delay slot. */
        lw      $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0 means there is no full block left. */
        bltz    BYTES, .Lchacha_mips_no_full_block_unaligned

        FOR_EACH_WORD_REV(STORE_UNALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz    BYTES, .Loop_chacha_rounds

        /* Write NONCE_0 back to its location in the state */
        sw      NONCE_0, 48(STATE)

        .set noreorder
        /* Fall through to byte handling */
        bgez    BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
        /* Place this here to fill the delay slot. */
        addiu   NONCE_0, 1
        .set reorder

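        /* XOR the 1-3 trailing bytes. At this point $at still holds the
         * byte count of the full words already handled, SAVED_X holds
         * the keystream word covering the tail (computed in the jump
         * table's delay slot), and BYTES is the negative tail count, so
         * BYTES + n reaches zero after the n-th byte.
         */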
.Lchacha_mips_xor_bytes:
        addu    IN, $at
        addu    OUT, $at
        /* First byte */
        lbu     T1, 0(IN)
        addiu   $at, BYTES, 1
        CPU_TO_LE32(SAVED_X)
        ROTR(SAVED_X)
        xor     T1, SAVED_X
        sb      T1, 0(OUT)
        beqz    $at, .Lchacha_mips_xor_done
        /* Second byte */
        lbu     T1, 1(IN)
        addiu   $at, BYTES, 2
        ROTx    SAVED_X, 8
        xor     T1, SAVED_X
        sb      T1, 1(OUT)
        beqz    $at, .Lchacha_mips_xor_done
        /* Third byte */
        lbu     T1, 2(IN)
        ROTx    SAVED_X, 8
        xor     T1, SAVED_X
        sb      T1, 2(OUT)
        b       .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
        /* Restore the offset on BYTES */
        addiu   BYTES, CHACHA20_BLOCK_SIZE

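        /* Same jump table dispatch as the aligned path above, but into
         * the unaligned entries.
         */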
        /* Get number of full WORDS */
        andi    $at, BYTES, MASK_U32

        /* Load upper half of jump table address */
        lui     T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

        /* Calculate lower-half jump table offset */
        ins     T0, $at, 1, 6

        /* Add offset to STATE */
        addu    T1, STATE, $at

        /* Add lower half of jump table address */
        addiu   T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

        /* Read value from STATE */
        lw      SAVED_CA, 0(T1)

        /* Store the remaining byte counter as a negative value */
        subu    BYTES, $at, BYTES

        jr      T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE        $a0
 * OUT          $a1
 * NROUND       $a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12     $a3
#define X13     $at
#define X14     $v0
#define X15     STATE
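
/* X12-X15 are remapped onto scratch/argument registers so that hchacha
 * only has to save one callee-saved register (X11/$s6). X15 aliases
 * STATE ($a0), which is safe because the load of word 15 is the last
 * use of the state pointer.
 */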

.set noat
.globl  hchacha_block_arch
.ent    hchacha_block_arch
hchacha_block_arch:
        .frame  $sp, STACK_SIZE, $ra

        addiu   $sp, -STACK_SIZE

        /* Save X11 ($s6) */
        sw      X11, 0($sp)

        lw      X0,  0(STATE)
        lw      X1,  4(STATE)
        lw      X2,  8(STATE)
        lw      X3,  12(STATE)
        lw      X4,  16(STATE)
        lw      X5,  20(STATE)
        lw      X6,  24(STATE)
        lw      X7,  28(STATE)
        lw      X8,  32(STATE)
        lw      X9,  36(STATE)
        lw      X10, 40(STATE)
        lw      X11, 44(STATE)
        lw      X12, 48(STATE)
        lw      X13, 52(STATE)
        lw      X14, 56(STATE)
        lw      X15, 60(STATE)

.Loop_hchacha_xor_rounds:
        addiu   $a2, -2
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
        bnez    $a2, .Loop_hchacha_xor_rounds

        /* Restore used register */
        lw      X11, 0($sp)

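        /* HChaCha emits only the first and last rows of the final state
         * (words 0-3 and 12-15), with no feed-forward addition.
         */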
        sw      X0,  0(OUT)
        sw      X1,  4(OUT)
        sw      X2,  8(OUT)
        sw      X3,  12(OUT)
        sw      X12, 16(OUT)
        sw      X13, 20(OUT)
        sw      X14, 24(OUT)
        sw      X15, 28(OUT)

        addiu   $sp, STACK_SIZE
        jr      $ra
.end hchacha_block_arch
.set at