linux/arch/x86/crypto/sha1_ssse3_asm.S
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

#define CTX     %rdi    // arg1
#define BUF     %rsi    // arg2
#define CNT     %rdx    // arg3

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %ebp
#define REG_E   %edx

#define REG_T1  %eax
#define REG_T2  %ebx

#define K_BASE          %r8
#define HASH_PTR        %r9
#define BUFFER_PTR      %r10
#define BUFFER_END      %r11

#define W_TMP1  %xmm0
#define W_TMP2  %xmm9

#define W0      %xmm1
#define W4      %xmm2
#define W8      %xmm3
#define W12     %xmm4
#define W16     %xmm5
#define W20     %xmm6
#define W24     %xmm7
#define W28     %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a 64 byte (16 entry) window of pre-calculated w[i]+K values in a circular buffer */
#define WK(t)   (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16
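
/*
 * example: WK(18) = ((18 & 15) * 4)(%rsp) = 8(%rsp), i.e. rounds t and t+16
 * share a slot; since the value for round t is consumed before the
 * pre-calculation running W_PRECALC_AHEAD rounds in advance stores the value
 * for round t+16, the slot can safely be reused
 */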

/*
 * This macro implements the body of the SHA-1 transform function; it
 * processes the input one 64-byte block at a time.
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
        ENTRY(\name)

        push    %rbx
        push    %rbp
        push    %r12

        mov     %rsp, %r12
        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %rax, %rax
        rep stosq

        mov     %r12, %rsp              # deallocate workspace

        pop     %r12
        pop     %rbp
        pop     %rbx
        ret

        ENDPROC(\name)
.endm
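
/*
 * notes on the frame set up above: %r12 preserves the caller's %rsp so the
 * 16 byte aligned, 64 byte workspace (the 16 WK() slots) can be dropped with
 * a single mov; the "rep stosq" stores 8 zero quadwords, wiping the whole
 * 64 byte workspace before returning so that no message schedule data is
 * left behind on the stack
 */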

/*
 * This macro implements the 80 SHA-1 rounds for each 64-byte block in the
 * buffer, looping until the whole buffer has been processed
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
        W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # K_BASE means we reached the end
        jne     1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm
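
/*
 * the swap happens purely at assembly time: only the .set bindings of the
 * two symbolic names are exchanged, no instruction is emitted. After
 * "mov \c, T1; SWAP_REG_NAMES \c, T1" the fresh copy is referred to as \c
 * again, while T1 now names the register holding the original value, which
 * the following logic ops may clobber; register renaming without data moves
 */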

.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm
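
/*
 * for reference, the boolean functions computed above in C-like notation
 * (the standard SHA-1 round functions):
 *   F1 (rounds  0-19): t1 = (b & (c ^ d)) ^ d        ("choose")
 *   F2 (rounds 20-39): t1 = b ^ c ^ d                ("parity")
 *   F3 (rounds 40-59): t1 = (b & c) | ((b | c) & d)  ("majority")
 *   F4 (rounds 60-79): same as F2
 */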

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w[i]
 *   e += t1;           b <<= 30;   d  += w[i+1];
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F   \b, \c, \d         # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F   \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # (a <<r 5) >>r 7  =>  a <<r 30

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
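
/*
 * since RR retires two rounds at a time, the five working variables shift by
 * two positions per invocation; this is why the main loop never moves data
 * between A..E but rotates the argument order instead:
 *   RR F,A,B,C,D,E,i  ->  RR F,D,E,A,B,C,i+2  ->  RR F,B,C,D,E,A,i+4  ...
 * after five invocations (ten rounds) the original order recurs
 */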

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)     # pre-compute for the next iteration
    .if (i == 0)
        W_PRECALC_RESET
    .endif
        W_PRECALC_00_15
  .elseif (i < 32)
        W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
        W_PRECALC_32_79
  .endif
.endm
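
/*
 * dispatch logic: K_XMM selects the 16 byte block of K_XMM_AR holding the
 * round constant (K1..K4) for round i. Rounds 0-15 only need the byte-swapped
 * message words; rounds 16-31 and 32-79 use the vectorized recurrences below.
 * For i >= 80 the index wraps (i % 80), i.e. the first 16 w[i]+K values of the
 * *next* 64-byte block are pre-computed while the last rounds of the current
 * block are still executing (BUFFER_PTR has already been advanced by then)
 */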

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
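/*
 * the four sub-steps below are interleaved with four scalar rounds, one
 * sub-step per round: load 16 message bytes, byte-swap them to big-endian
 * (keeping a copy in W), add the round constant, then store the four
 * finished w[i]+K values into their WK() stack slots
 */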
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
        movdqu  (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        pshufb  XMM_SHUFB_BSWAP, W_TMP1
        movdqa  W_TMP1, W
  .elseif ((i & 3) == 2)
        paddd   (K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - keeps a sliding window of the last 32 w[i] values in 8 XMM registers
 * - pre-calculates the w[i]+K values and stores them to memory, where they
 *   are later picked up by the scalar ALU add instruction
 *
 * the w[i] -> w[i-3] dependency requires some "heavy-lifting" vectorization
 * for rounds 16-31, but the situation improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
        movdqa  W_minus_12, W
        palignr $8, W_minus_16, W       # w[i-14]
        movdqa  W_minus_04, W_TMP1
        psrldq  $4, W_TMP1              # w[i-3]
        pxor    W_minus_08, W
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W_TMP1
        pxor    W_TMP1, W
        movdqa  W, W_TMP2
        movdqa  W, W_TMP1
        pslldq  $12, W_TMP2
  .elseif ((i & 3) == 2)
        psrld   $31, W
        pslld   $1, W_TMP1
        por     W, W_TMP1
        movdqa  W_TMP2, W
        psrld   $30, W_TMP2
        pslld   $2, W
  .elseif ((i & 3) == 3)
        pxor    W, W_TMP1
        pxor    W_TMP2, W_TMP1
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification defines: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent form:      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] => w[i-3] dependency
 * is broken (see the derivation below)
 */
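/*
 * derivation of the equivalent form (valid for i >= 32): rotating the
 * original recurrence right by one and applying it to each of its four
 * terms gives
 *   w[i-3]  ror 1 = w[i-6]  ^ w[i-11] ^ w[i-17] ^ w[i-19]
 *   w[i-8]  ror 1 = w[i-11] ^ w[i-16] ^ w[i-22] ^ w[i-24]
 *   w[i-14] ror 1 = w[i-17] ^ w[i-22] ^ w[i-28] ^ w[i-30]
 *   w[i-16] ror 1 = w[i-19] ^ w[i-24] ^ w[i-30] ^ w[i-32]
 * xor-ing the four lines makes every term except w[i-6], w[i-16], w[i-28]
 * and w[i-32] appear twice and cancel, leaving
 *   w[i] ror 2 = w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]
 * with the nearest input now six positions back, the four consecutive w
 * values held in one XMM register have no intra-register dependency
 */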
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
        movdqa  W_minus_04, W_TMP1
        pxor    W_minus_28, W           # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W
        pxor    W_TMP1, W
        movdqa  W, W_TMP1
  .elseif ((i & 3) == 2)
        psrld   $30, W
        pslld   $2, W_TMP1
        por     W, W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm           // W_PRECALC_SSSE3


#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
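
/*
 * stored little-endian, BSWAP_SHUFB_CTL is the byte sequence
 * 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c, i.e. a pshufb/vpshufb
 * control mask that reverses the bytes within each 32-bit word, converting
 * the little-endian loads into the big-endian message words SHA-1 operates on
 */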


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm
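
/*
 * the W_PRECALC_SSSE3 invocation above instantiates W_PRECALC_00_15/16_31/
 * 32_79 with their SSSE3 bodies; the AVX section below purges and redefines
 * them before expanding the template a second time. xmm_mov exists for the
 * same reason: it hides the movdqu vs. vmovdqu difference so a single
 * SHA1_VECTOR_ASM definition can emit both functions
 */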

/* SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                       unsigned int rounds);
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
        vpaddd  (K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
        vpxor   W_minus_08, W, W
        vpxor   W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
        vpxor   W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld  $1, W, W_TMP1
  .elseif ((i & 3) == 2)
        vpsrld  $31, W, W
        vpor    W, W_TMP1, W_TMP1
        vpslld  $2, W_TMP2, W
        vpsrld  $30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
        vpxor   W, W_TMP1, W_TMP1
        vpxor   W_TMP2, W_TMP1, W
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor   W_minus_28, W, W                # W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
        vpxor   W_minus_16, W_TMP1, W_TMP1
        vpxor   W_TMP1, W, W
  .elseif ((i & 3) == 2)
        vpslld  $2, W, W_TMP1
        vpsrld  $30, W, W
        vpor    W, W_TMP1, W
  .elseif ((i & 3) == 3)
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX
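
/*
 * the AVX bodies compute exactly the same message schedule as the SSSE3
 * ones; the non-destructive three-operand VEX forms merely avoid the extra
 * movdqa register copies the SSSE3 path needs
 */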

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                     unsigned int rounds);
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif