linux/arch/x86/crypto/sha1_ssse3_asm.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX     %rdi    // arg1
#define BUF     %rsi    // arg2
#define CNT     %rdx    // arg3

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %r12d
#define REG_E   %edx

#define REG_T1  %eax
#define REG_T2  %ebx

#define K_BASE          %r8
#define HASH_PTR        %r9
#define BUFFER_PTR      %r10
#define BUFFER_END      %r11

#define W_TMP1  %xmm0
#define W_TMP2  %xmm9

#define W0      %xmm1
#define W4      %xmm2
#define W8      %xmm3
#define W12     %xmm4
#define W16     %xmm5
#define W20     %xmm6
#define W24     %xmm7
#define W28     %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a 64-byte window of 16 w[i]+K pre-calculated values in a circular buffer */
#define WK(t)   (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16

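/*
 * Illustrative only (not part of the build): the WK() addressing above is
 * roughly equivalent to this C sketch of a 16-entry ring of w[i]+K values
 * kept on the stack; the array name "wk" is made up for the illustration.
 *
 *      u32 wk[16];                     // the 64-byte workspace at %rsp
 *      #define WK(t) wk[(t) & 15]      // each slot is reused every 16 rounds
 */
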
/*
 * This macro implements the SHA-1 function's body for one or more 64-byte
 * blocks (it loops until BUFFER_END is reached)
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
        SYM_FUNC_START(\name)

        push    %rbx
        push    %r12
        push    %rbp
        mov     %rsp, %rbp

        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %eax, %eax
        rep stosq

        mov     %rbp, %rsp              # deallocate workspace
        pop     %rbp
        pop     %r12
        pop     %rbx
        ret

        SYM_FUNC_END(\name)
.endm
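
/*
 * Illustrative only: the function emitted by this macro behaves roughly like
 * the following C sketch (all names here are made up for the illustration;
 * they are not symbols defined elsewhere):
 *
 *      void sha1_blocks_sketch(u32 state[5], const u8 *data, int blocks)
 *      {
 *              const u8 *end = data + blocks * 64;
 *
 *              do {                            // at least one block is assumed
 *                      sha1_do_80_rounds(state, data); // SHA1_PIPELINED_MAIN_BODY
 *                      data += 64;
 *              } while (data < end);
 *      }
 */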

/*
 * This macro implements 80 rounds of SHA-1 for each 64-byte block, looping
 * over all blocks up to BUFFER_END
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
        W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # K_BASE means we reached the end
        jne     1b
.endm
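
/*
 * Note on the block-advance above: during the final rounds of each block the
 * W pre-calculation already looks W_PRECALC_AHEAD rounds ahead, i.e. into the
 * next 64-byte block. Redirecting BUFFER_PTR to the K_XMM_AR constant table
 * after the last block gives that lookahead harmless, readable memory instead
 * of running past the caller's buffer; the same value doubles as the
 * loop-exit marker tested after RESTORE_RENAMED_REGS.
 */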

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm
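
/*
 * For reference only (C, not assembled): F1-F4 compute the standard SHA-1
 * round functions on 32-bit words; the function names below are made up for
 * the illustration.
 *
 *      u32 f1(u32 b, u32 c, u32 d) { return d ^ (b & (c ^ d)); }        // Ch
 *      u32 f2(u32 b, u32 c, u32 d) { return b ^ c ^ d; }                // Parity
 *      u32 f3(u32 b, u32 c, u32 d) { return (b & c) | (d & (b | c)); }  // Maj
 *      u32 f4(u32 b, u32 c, u32 d) { return f2(b, c, d); }
 */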

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 * (<<= and >>= below denote rotates, i.e. rol/ror; w(i) is the pre-calculated
 * w[i]+K value loaded from the WK() buffer)
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F   \b, \c, \d         # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F   \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # ((a <<r 5) >>r 7) => a <<r 30

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
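
/*
 * Reference only (C, not assembled) for one RR invocation, i.e. two SHA-1
 * rounds; rol() is a hypothetical 32-bit rotate-left helper and wk[] is the
 * pre-calculated w[i]+K buffer:
 *
 *      e += rol(a, 5) + F(b, c, d) + wk[i];            b = rol(b, 30);
 *      d += rol(e, 5) + F(a, b, c) + wk[i + 1];        a = rol(a, 30);
 *
 * after which the register roles rotate as described in the comment above.
 */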

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)     # pre-compute for the next iteration
    .if (i == 0)
        W_PRECALC_RESET
    .endif
        W_PRECALC_00_15
  .elseif (i < 32)
        W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
        W_PRECALC_32_79
  .endif
.endm
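
/*
 * The dispatcher above keeps the schedule W_PRECALC_AHEAD (16) rounds ahead
 * of the rounds being computed: for r >= 80 the index wraps modulo 80, so the
 * tail of one block already pre-computes w[0..15] of the next block (or reads
 * the dummy K_XMM_AR data after the last block).
 */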

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
        movdqu  (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        pshufb  XMM_SHUFB_BSWAP, W_TMP1
        movdqa  W_TMP1, W
  .elseif ((i & 3) == 2)
        paddd   (K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm
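
/*
 * Scalar reference for the above, illustrative only (get_unaligned_be32() is
 * the kernel's unaligned big-endian load helper; wk[] is the circular buffer
 * and K1 the rounds 0-19 constant):
 *
 *      w[i] = get_unaligned_be32((const u8 *)data + 4 * i);
 *      wk[i & 15] = w[i] + K1;
 */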

/* message scheduling pre-compute for rounds 16-31
 *
 * - keep the most recent 32 w[i] values in 8 XMM registers
 * - pre-calculate the w[i]+K values and store them to memory, to be loaded
 *   later by a scalar ALU add instruction
 *
 * the vectorization of rounds 16-31 needs some "heavy lifting" because of the
 * w[i] -> w[i-3] dependency, but it pays off for rounds 32-79
 */
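
/*
 * Scalar reference for rounds 16-31, illustrative only (rol() is a
 * hypothetical 32-bit rotate-left helper; K is the constant selected by
 * K_XMM):
 *
 *      w[i] = rol(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *      wk[i & 15] = w[i] + K;
 */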
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
        movdqa  W_minus_12, W
        palignr $8, W_minus_16, W       # w[i-14]
        movdqa  W_minus_04, W_TMP1
        psrldq  $4, W_TMP1              # w[i-3]
        pxor    W_minus_08, W
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W_TMP1
        pxor    W_TMP1, W
        movdqa  W, W_TMP2
        movdqa  W, W_TMP1
        pslldq  $12, W_TMP2
  .elseif ((i & 3) == 2)
        psrld   $31, W
        pslld   $1, W_TMP1
        por     W, W_TMP1
        movdqa  W_TMP2, W
        psrld   $30, W_TMP2
        pslld   $2, W
  .elseif ((i & 3) == 3)
        pxor    W, W_TMP1
        pxor    W_TMP2, W_TMP1
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] -> w[i-3]
 * dependency is broken
 */
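
/*
 * Scalar reference for rounds 32-79, illustrative only (rol() as above):
 *
 *      w[i] = rol(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *      wk[i & 15] = w[i] + K;
 */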
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
        movdqa  W_minus_04, W_TMP1
        pxor    W_minus_28, W           # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W
        pxor    W_TMP1, W
        movdqa  W, W_TMP1
  .elseif ((i & 3) == 2)
        psrld   $30, W
        pslld   $2, W_TMP1
        por     W, W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm           // W_PRECALC_SSSE3


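/* SHA-1 round constants, one per group of 20 rounds */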
#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

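/* pshufb control mask that byte-swaps each 32-bit word (big-endian loads) */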
BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *                                      const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3
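
/*
 * A minimal, hypothetical C call sequence (the real glue code lives in
 * sha1_ssse3_glue.c; SIMD use in the kernel must be bracketed by
 * kernel_fpu_begin()/kernel_fpu_end()):
 *
 *      kernel_fpu_begin();
 *      sha1_transform_ssse3(state, data, blocks);
 *      kernel_fpu_end();
 */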

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

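/*
 * The AVX variants below use VEX-encoded, non-destructive three-operand
 * instructions, which removes most of the movdqa register-copy instructions
 * needed in the SSSE3 versions above.
 */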
.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
        vpaddd  (K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
        vpxor   W_minus_08, W, W
        vpxor   W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
        vpxor   W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld  $1, W, W_TMP1
  .elseif ((i & 3) == 2)
        vpsrld  $31, W, W
        vpor    W, W_TMP1, W_TMP1
        vpslld  $2, W_TMP2, W
        vpsrld  $30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
        vpxor   W, W_TMP1, W_TMP1
        vpxor   W_TMP2, W_TMP1, W
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor   W_minus_28, W, W                # W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
        vpxor   W_minus_16, W_TMP1, W_TMP1
        vpxor   W_TMP1, W, W
  .elseif ((i & 3) == 2)
        vpslld  $2, W, W_TMP1
        vpsrld  $30, W, W
        vpor    W, W_TMP1, W
  .elseif ((i & 3) == 3)
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *                                     const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx