linux/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 *  Copyright(c) 2014 Intel Corporation.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of version 2 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  Contact Information:
 *      James Guilford <james.guilford@intel.com>
 *      Tim Chen <tim.c.chen@linux.intel.com>
 *
 *  BSD LICENSE
 *
 *  Copyright(c) 2014 Intel Corporation.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute eight-lane (oct) SHA1 using AVX2 (256-bit ymm registers)
## outer calling routine takes care of save and restore of the vector registers

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
## Linux preserves:                       rdi rbp r8
##
## clobbers ymm0-15

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
        # process top half (r0..r3) {a...d}
        vshufps  $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
        vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
        vshufps  $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
        vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
        vshufps  $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
        vshufps  $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
        vshufps  $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
        vshufps  $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

        # use r2 in place of t0
        # process bottom half (r4..r7) {e...h}
        vshufps  $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
        vshufps  $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
        vshufps  $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
        vshufps  $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
        vshufps  $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
        vshufps  $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
        vshufps  $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
        vshufps  $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

        vperm2f128      $0x13, \r1, \r5, \r6  # h6...a6
        vperm2f128      $0x02, \r1, \r5, \r2  # h2...a2
        vperm2f128      $0x13, \r3, \r7, \r5  # h5...a5
        vperm2f128      $0x02, \r3, \r7, \r1  # h1...a1
        vperm2f128      $0x13, \r0, \r4, \r7  # h7...a7
        vperm2f128      $0x02, \r0, \r4, \r3  # h3...a3
        vperm2f128      $0x13, \t0, \t1, \r4  # h4...a4
        vperm2f128      $0x02, \t0, \t1, \r0  # h0...a0

.endm
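
# For intuition, the macro above behaves like a plain 8x8 matrix transpose
# of 32-bit elements (a rough C sketch, illustrative names only):
#
#     uint32_t in[8][8], out[8][8];
#     for (int i = 0; i < 8; i++)
#             for (int j = 0; j < 8; j++)
#                     out[j][i] = in[i][j];  /* element j of row i -> row j */
#
# implemented with vshufps/vperm2f128 so that each output ymm register
# gathers the same-index dword from all eight input registers.
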
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
    vpxor \regD, \regC, \regF
    vpand \regB, \regF, \regF
    vpxor \regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
    vpxor  \regC, \regD, \regF
    vpxor  \regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
    vpor  \regC, \regB, \regF
    vpand \regC, \regB, \regT
    vpand \regD, \regF, \regF
    vpor  \regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
    MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
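
# Note: MAGIC_F2 is the SHA-1 majority function MAJ(B,C,D); it relies on the
# identity (B & C) | (B & D) | (C & D) == ((B | C) & D) | (B & C), i.e.
#     F = B | C;  T = B & C;  F = F & D;  F = F | T;
# which needs only the single temporary \regT.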

# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
        vpsrld  $(32-\imm), \reg, \tmp
        vpslld  $\imm, \reg, \reg
        vpor    \tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
        vpsrld  $(32-\imm), \src, \tmp
        vpslld  $\imm, \src, \reg
        vpor    \tmp, \reg, \reg
.endm
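
# Both macros are 8-lane 32-bit rotate-lefts, per lane:
#     reg = (x << imm) | (x >> (32 - imm))
# PROLD rotates \reg in place; PROLD_nd is the non-destructive form that
# reads \src and writes the result to \reg.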

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE
        vpaddd  \memW*32(%rsp), \regE, \regE
        PROLD_nd \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC  \regF, \regB, \regC, \regD, \regT
        PROLD   \regB, 30, \regT
        vpaddd  \regF, \regE, \regE
.endm
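
# Per lane, SHA1_STEP_00_15 is one SHA-1 round of the form (sketch):
#     E += K + W[i] + rol32(A, 5) + MAGIC(B, C, D);
#     B  = rol32(B, 30);
# with W[i] read from the transposed message block on the stack; the caller
# then renames the working registers with ROTATE_ARGS.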

.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE
        offset = ((\memW - 14) & 15) * 32
        vmovdqu offset(%rsp), W14
        vpxor   W14, W16, W16
        offset = ((\memW -  8) & 15) * 32
        vpxor   offset(%rsp), W16, W16
        offset = ((\memW -  3) & 15) * 32
        vpxor   offset(%rsp), W16, W16
        vpsrld  $(32-1), W16, \regF
        vpslld  $1, W16, W16
        vpor    W16, \regF, \regF

        ROTATE_W

        offset = ((\memW - 0) & 15) * 32
        vmovdqu \regF, offset(%rsp)
        vpaddd  \regF, \regE, \regE
        PROLD_nd \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC \regF,\regB,\regC,\regD,\regT      ## FUN  = MAGIC_Fi(B,C,D)
        PROLD   \regB,30, \regT
        vpaddd  \regF, \regE, \regE
.endm
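
# SHA1_STEP_16_79 first extends the message schedule in place on the stack,
# per lane (sketch):
#     W[i & 15] = rol32(W[(i-3) & 15] ^ W[(i-8) & 15] ^
#                       W[(i-14) & 15] ^ W[(i-16) & 15], 1);
# (the W[i-16] term is kept in the W16 register rather than reloaded), and
# then performs the same round update as SHA1_STEP_00_15 with the new word.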

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM  =   FRAMESZ - YMM_SAVE

#define VMOVPS   vmovups

IDX  = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# ymm0 A
# ymm1 B
# ymm2 C
# ymm3 D
# ymm4 E
# ymm5         F       AA
# ymm6         T0      BB
# ymm7         T1      CC
# ymm8         T2      DD
# ymm9         T3      EE
# ymm10                T4      TMP
# ymm11                T5      FUN
# ymm12                T6      K
# ymm13                T7      W14
# ymm14                T8      W15
# ymm15                T9      W16


A  =     %ymm0
B  =     %ymm1
C  =     %ymm2
D  =     %ymm3
E  =     %ymm4
F  =     %ymm5
T0 =     %ymm6
T1 =     %ymm7
T2 =     %ymm8
T3 =     %ymm9
T4 =     %ymm10
T5 =     %ymm11
T6 =     %ymm12
T7 =     %ymm13
T8 =     %ymm14
T9 =     %ymm15

AA  =     %ymm5
BB  =     %ymm6
CC  =     %ymm7
DD  =     %ymm8
EE  =     %ymm9
TMP =     %ymm10
FUN =     %ymm11
K   =     %ymm12
W14 =     %ymm13
W15 =     %ymm14
W16 =     %ymm15

.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16  = W15
W15  = W14
W14  = TMP_
.endm
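
# ROTATE_ARGS and ROTATE_W move no data: they rebind the assembler symbols
# A..E (and the W14/W15/W16 schedule window) to different ymm registers for
# the next unrolled round, so no register-to-register copies are needed.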

# 8 streams x 5 32-bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void *args, UINT32 size)
# arg 1 : pointer to the multi-buffer args area holding the eight transposed
#         digests and the array of eight input data pointers
#         (layout defined in sha1_mb_mgr_datastruct.S)
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
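# Concretely, the loads below expect digest word w of lane l at byte offset
# w*32 + l*4 from arg1 (one 32-byte row per state word), and the input
# pointer for lane l at _data_ptr + l*8.  An illustrative C view (field
# names hypothetical, the real layout comes from sha1_mb_mgr_datastruct.S):
#
#     uint32_t digest[5][8];   /* digest[w][lane] */
#     ...
#     uint8_t  *data_ptr[8];   /* advanced by the bytes consumed, on return */
#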
ENTRY(sha1_x8_avx2)

        push    RSP_SAVE

        #save rsp
        mov     %rsp, RSP_SAVE
        sub     $FRAMESZ, %rsp

        #align rsp to 32 Bytes
        and     $~0x1F, %rsp

        ## Initialize digests
        vmovdqu  0*32(arg1), A
        vmovdqu  1*32(arg1), B
        vmovdqu  2*32(arg1), C
        vmovdqu  3*32(arg1), D
        vmovdqu  4*32(arg1), E

        ## transpose input onto stack
        mov     _data_ptr+0*8(arg1),inp0
        mov     _data_ptr+1*8(arg1),inp1
        mov     _data_ptr+2*8(arg1),inp2
        mov     _data_ptr+3*8(arg1),inp3
        mov     _data_ptr+4*8(arg1),inp4
        mov     _data_ptr+5*8(arg1),inp5
        mov     _data_ptr+6*8(arg1),inp6
        mov     _data_ptr+7*8(arg1),inp7

        xor     IDX, IDX
lloop:
        vmovdqu  PSHUFFLE_BYTE_FLIP_MASK(%rip), F
        I=0
.rep 2
        VMOVPS   (inp0, IDX), T0
        VMOVPS   (inp1, IDX), T1
        VMOVPS   (inp2, IDX), T2
        VMOVPS   (inp3, IDX), T3
        VMOVPS   (inp4, IDX), T4
        VMOVPS   (inp5, IDX), T5
        VMOVPS   (inp6, IDX), T6
        VMOVPS   (inp7, IDX), T7

        TRANSPOSE8       T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
        vpshufb  F, T0, T0
        vmovdqu  T0, (I*8)*32(%rsp)
        vpshufb  F, T1, T1
        vmovdqu  T1, (I*8+1)*32(%rsp)
        vpshufb  F, T2, T2
        vmovdqu  T2, (I*8+2)*32(%rsp)
        vpshufb  F, T3, T3
        vmovdqu  T3, (I*8+3)*32(%rsp)
        vpshufb  F, T4, T4
        vmovdqu  T4, (I*8+4)*32(%rsp)
        vpshufb  F, T5, T5
        vmovdqu  T5, (I*8+5)*32(%rsp)
        vpshufb  F, T6, T6
        vmovdqu  T6, (I*8+6)*32(%rsp)
        vpshufb  F, T7, T7
        vmovdqu  T7, (I*8+7)*32(%rsp)
        add     $32, IDX
        I = (I+1)
.endr
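
        # At this point the stack frame holds the first 16 message words of
        # the block for all eight lanes, byte-swapped to host order: word w
        # of lane l lives at offset w*32 + l*4 (one 32-byte row per word).
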
        # save old digests
        vmovdqu  A,AA
        vmovdqu  B,BB
        vmovdqu  C,CC
        vmovdqu  D,DD
        vmovdqu  E,EE

##
## perform 0-79 steps
##
        vmovdqu  K00_19(%rip), K
## do rounds 0...15
        I = 0
.rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 16...19
        vmovdqu  ((16 - 16) & 15) * 32 (%rsp), W16
        vmovdqu  ((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 20...39
        vmovdqu  K20_39(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 40...59
        vmovdqu  K40_59(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 60...79
        vmovdqu  K60_79(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
        I = (I+1)
.endr

        vpaddd   AA,A,A
        vpaddd   BB,B,B
        vpaddd   CC,C,C
        vpaddd   DD,D,D
        vpaddd   EE,E,E

        sub     $1, arg2
        jne     lloop

        # write out digests
        vmovdqu  A, 0*32(arg1)
        vmovdqu  B, 1*32(arg1)
        vmovdqu  C, 2*32(arg1)
        vmovdqu  D, 3*32(arg1)
        vmovdqu  E, 4*32(arg1)

        # update input pointers
        add     IDX, inp0
        add     IDX, inp1
        add     IDX, inp2
        add     IDX, inp3
        add     IDX, inp4
        add     IDX, inp5
        add     IDX, inp6
        add     IDX, inp7
        mov     inp0, _data_ptr (arg1)
        mov     inp1, _data_ptr + 1*8(arg1)
        mov     inp2, _data_ptr + 2*8(arg1)
        mov     inp3, _data_ptr + 3*8(arg1)
        mov     inp4, _data_ptr + 4*8(arg1)
        mov     inp5, _data_ptr + 5*8(arg1)
        mov     inp6, _data_ptr + 6*8(arg1)
        mov     inp7, _data_ptr + 7*8(arg1)

        ################
        ## Postamble

        mov     RSP_SAVE, %rsp
        pop     RSP_SAVE

        ret
ENDPROC(sha1_x8_avx2)


.data

.align 32
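# SHA-1 round constants (K values from FIPS 180-1), each broadcast to all
# eight 32-bit lanes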
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
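
# vpshufb mask that reverses the byte order within each 32-bit word
# (big-endian message words -> little-endian lane values)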
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203