linux/arch/x86/crypto/aesni-intel_avx-x86_64.S
<<
>>
Prefs
   1########################################################################
   2# Copyright (c) 2013, Intel Corporation
   3#
   4# This software is available to you under a choice of one of two
   5# licenses.  You may choose to be licensed under the terms of the GNU
   6# General Public License (GPL) Version 2, available from the file
   7# COPYING in the main directory of this source tree, or the
   8# OpenIB.org BSD license below:
   9#
  10# Redistribution and use in source and binary forms, with or without
  11# modification, are permitted provided that the following conditions are
  12# met:
  13#
  14# * Redistributions of source code must retain the above copyright
  15#   notice, this list of conditions and the following disclaimer.
  16#
  17# * Redistributions in binary form must reproduce the above copyright
  18#   notice, this list of conditions and the following disclaimer in the
  19#   documentation and/or other materials provided with the
  20#   distribution.
  21#
  22# * Neither the name of the Intel Corporation nor the names of its
  23#   contributors may be used to endorse or promote products derived from
  24#   this software without specific prior written permission.
  25#
  26#
  27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38########################################################################
  39##
  40## Authors:
  41##      Erdinc Ozturk <erdinc.ozturk@intel.com>
  42##      Vinodh Gopal <vinodh.gopal@intel.com>
  43##      James Guilford <james.guilford@intel.com>
  44##      Tim Chen <tim.c.chen@linux.intel.com>
  45##
  46## References:
  47##       This code was derived and highly optimized from the code described in paper:
  48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
  49##                      on Intel Architecture Processors. August, 2010
  50##       The details of the implementation is explained in:
  51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
  52##                      on Intel Architecture Processors. October, 2012.
  53##
  54## Assumptions:
  55##
  56##
  57##
  58## iv:
  59##       0                   1                   2                   3
  60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62##       |                             Salt  (From the SA)               |
  63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64##       |                     Initialization Vector                     |
  65##       |         (This is the sequence number from IPSec header)       |
  66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67##       |                              0x1                              |
  68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69##
  70##
  71##
  72## AAD:
  73##       AAD padded to 128 bits with 0
  74##       for example, assume AAD is a u32 vector
  75##
  76##       if AAD is 8 bytes:
  77##       AAD[3] = {A0, A1}#
  78##       padded AAD in xmm register = {A1 A0 0 0}
  79##
  80##       0                   1                   2                   3
  81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83##       |                               SPI (A1)                        |
  84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85##       |                     32-bit Sequence Number (A0)               |
  86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87##       |                              0x0                              |
  88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89##
  90##                                       AAD Format with 32-bit Sequence Number
  91##
  92##       if AAD is 12 bytes:
  93##       AAD[3] = {A0, A1, A2}#
  94##       padded AAD in xmm register = {A2 A1 A0 0}
  95##
  96##       0                   1                   2                   3
  97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99##       |                               SPI (A2)                        |
 100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 101##       |                 64-bit Extended Sequence Number {A1,A0}       |
 102##       |                                                               |
 103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 104##       |                              0x0                              |
 105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 106##
 107##        AAD Format with 64-bit Extended Sequence Number
 108##
 109##
 110## aadLen:
 111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
 112##       The code additionally supports aadLen of length 16 bytes.
 113##
 114## TLen:
 115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 116##
 117## poly = x^128 + x^127 + x^126 + x^121 + 1
 118## throughout the code, one tab and two tab indentations are used. one tab is
 119## for GHASH part, two tabs is for AES part.
 120##
 121
 122#include <linux/linkage.h>
 123#include <asm/inst.h>
 124
 125.data
 126.align 16
 127
 128POLY:            .octa     0xC2000000000000000000000000000001
 129POLY2:           .octa     0xC20000000000000000000001C2000000
 130TWOONE:          .octa     0x00000001000000000000000000000001
 131
 132# order of these constants should not change.
 133# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
 134
 135SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
 136SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
 137ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
 138ZERO:            .octa     0x00000000000000000000000000000000
 139ONE:             .octa     0x00000000000000000000000000000001
 140ONEf:            .octa     0x01000000000000000000000000000000
 141
 142.text
 143
 144
 145##define the fields of the gcm aes context
 146#{
 147#        u8 expanded_keys[16*11] store expanded keys
 148#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
 149#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
 150#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
 151#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
 152#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
 153#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
 154#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
 155#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
 156#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
 157#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 158#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 159#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 160#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 161#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 162#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 163#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 164#} gcm_ctx#
 165
 166HashKey        = 16*11   # store HashKey <<1 mod poly here
 167HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
 168HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
 169HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
 170HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
 171HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
 172HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
 173HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
 174HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
 175HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 176HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 177HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 178HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 179HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 180HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 181HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 182
 183#define arg1 %rdi
 184#define arg2 %rsi
 185#define arg3 %rdx
 186#define arg4 %rcx
 187#define arg5 %r8
 188#define arg6 %r9
 189#define arg7 STACK_OFFSET+8*1(%r14)
 190#define arg8 STACK_OFFSET+8*2(%r14)
 191#define arg9 STACK_OFFSET+8*3(%r14)
 192
 193i = 0
 194j = 0
 195
 196out_order = 0
 197in_order = 1
 198DEC = 0
 199ENC = 1
 200
 201.macro define_reg r n
 202reg_\r = %xmm\n
 203.endm
 204
 205.macro setreg
 206.altmacro
 207define_reg i %i
 208define_reg j %j
 209.noaltmacro
 210.endm
 211
 212# need to push 4 registers into stack to maintain
 213STACK_OFFSET = 8*4
 214
 215TMP1 =   16*0    # Temporary storage for AAD
 216TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
 217TMP3 =   16*2    # Temporary storage for AES State 3
 218TMP4 =   16*3    # Temporary storage for AES State 4
 219TMP5 =   16*4    # Temporary storage for AES State 5
 220TMP6 =   16*5    # Temporary storage for AES State 6
 221TMP7 =   16*6    # Temporary storage for AES State 7
 222TMP8 =   16*7    # Temporary storage for AES State 8
 223
 224VARIABLE_OFFSET = 16*8
 225
 226################################
 227# Utility Macros
 228################################
 229
 230# Encryption of a single block
 231.macro ENCRYPT_SINGLE_BLOCK XMM0
 232                vpxor    (arg1), \XMM0, \XMM0
 233                i = 1
 234                setreg
 235.rep 9
 236                vaesenc  16*i(arg1), \XMM0, \XMM0
 237                i = (i+1)
 238                setreg
 239.endr
 240                vaesenclast 16*10(arg1), \XMM0, \XMM0
 241.endm
 242
 243#ifdef CONFIG_AS_AVX
 244###############################################################################
 245# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 246# Input: A and B (128-bits each, bit-reflected)
 247# Output: C = A*B*x mod poly, (i.e. >>1 )
 248# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 249# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 250###############################################################################
 251.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 252
 253        vpshufd         $0b01001110, \GH, \T2
 254        vpshufd         $0b01001110, \HK, \T3
 255        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
 256        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 257
 258        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
 259        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
 260        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
 261        vpxor           \GH, \T2,\T2
 262        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 263
 264        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
 265        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
 266        vpxor           \T3, \GH, \GH
 267        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 268
 269        #first phase of the reduction
 270        vpslld  $31, \GH, \T2                   # packed right shifting << 31
 271        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
 272        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
 273
 274        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 275        vpxor   \T4, \T2, \T2
 276
 277        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 278
 279        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
 280        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 281
 282        #second phase of the reduction
 283
 284        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
 285        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
 286        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
 287        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 288        vpxor   \T4, \T2, \T2
 289
 290        vpxor   \T5, \T2, \T2
 291        vpxor   \T2, \GH, \GH
 292        vpxor   \T1, \GH, \GH                   # the result is in GH
 293
 294
 295.endm
 296
 297.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 298
 299        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 300        vmovdqa  \HK, \T5
 301
 302        vpshufd  $0b01001110, \T5, \T1
 303        vpxor    \T5, \T1, \T1
 304        vmovdqa  \T1, HashKey_k(arg1)
 305
 306        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
 307        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
 308        vpshufd  $0b01001110, \T5, \T1
 309        vpxor    \T5, \T1, \T1
 310        vmovdqa  \T1, HashKey_2_k(arg1)
 311
 312        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
 313        vmovdqa  \T5, HashKey_3(arg1)
 314        vpshufd  $0b01001110, \T5, \T1
 315        vpxor    \T5, \T1, \T1
 316        vmovdqa  \T1, HashKey_3_k(arg1)
 317
 318        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
 319        vmovdqa  \T5, HashKey_4(arg1)
 320        vpshufd  $0b01001110, \T5, \T1
 321        vpxor    \T5, \T1, \T1
 322        vmovdqa  \T1, HashKey_4_k(arg1)
 323
 324        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
 325        vmovdqa  \T5, HashKey_5(arg1)
 326        vpshufd  $0b01001110, \T5, \T1
 327        vpxor    \T5, \T1, \T1
 328        vmovdqa  \T1, HashKey_5_k(arg1)
 329
 330        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
 331        vmovdqa  \T5, HashKey_6(arg1)
 332        vpshufd  $0b01001110, \T5, \T1
 333        vpxor    \T5, \T1, \T1
 334        vmovdqa  \T1, HashKey_6_k(arg1)
 335
 336        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
 337        vmovdqa  \T5, HashKey_7(arg1)
 338        vpshufd  $0b01001110, \T5, \T1
 339        vpxor    \T5, \T1, \T1
 340        vmovdqa  \T1, HashKey_7_k(arg1)
 341
 342        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
 343        vmovdqa  \T5, HashKey_8(arg1)
 344        vpshufd  $0b01001110, \T5, \T1
 345        vpxor    \T5, \T1, \T1
 346        vmovdqa  \T1, HashKey_8_k(arg1)
 347
 348.endm
 349
 350## if a = number of total plaintext bytes
 351## b = floor(a/16)
 352## num_initial_blocks = b mod 4#
 353## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 354## r10, r11, r12, rax are clobbered
 355## arg1, arg2, arg3, r14 are used as a pointer only, not modified
 356
 357.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 358        i = (8-\num_initial_blocks)
 359        setreg
 360
 361        mov     arg6, %r10                      # r10 = AAD
 362        mov     arg7, %r12                      # r12 = aadLen
 363
 364
 365        mov     %r12, %r11
 366
 367        vpxor   reg_i, reg_i, reg_i
 368_get_AAD_loop\@:
 369        vmovd   (%r10), \T1
 370        vpslldq $12, \T1, \T1
 371        vpsrldq $4, reg_i, reg_i
 372        vpxor   \T1, reg_i, reg_i
 373
 374        add     $4, %r10
 375        sub     $4, %r12
 376        jg      _get_AAD_loop\@
 377
 378
 379        cmp     $16, %r11
 380        je      _get_AAD_loop2_done\@
 381        mov     $16, %r12
 382
 383_get_AAD_loop2\@:
 384        vpsrldq $4, reg_i, reg_i
 385        sub     $4, %r12
 386        cmp     %r11, %r12
 387        jg      _get_AAD_loop2\@
 388
 389_get_AAD_loop2_done\@:
 390
 391        #byte-reflect the AAD data
 392        vpshufb SHUF_MASK(%rip), reg_i, reg_i
 393
 394        # initialize the data pointer offset as zero
 395        xor     %r11, %r11
 396
 397        # start AES for num_initial_blocks blocks
 398        mov     arg5, %rax                     # rax = *Y0
 399        vmovdqu (%rax), \CTR                   # CTR = Y0
 400        vpshufb SHUF_MASK(%rip), \CTR, \CTR
 401
 402
 403        i = (9-\num_initial_blocks)
 404        setreg
 405.rep \num_initial_blocks
 406                vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
 407                vmovdqa \CTR, reg_i
 408                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
 409        i = (i+1)
 410        setreg
 411.endr
 412
 413        vmovdqa  (arg1), \T_key
 414        i = (9-\num_initial_blocks)
 415        setreg
 416.rep \num_initial_blocks
 417                vpxor   \T_key, reg_i, reg_i
 418        i = (i+1)
 419        setreg
 420.endr
 421
 422        j = 1
 423        setreg
 424.rep 9
 425        vmovdqa  16*j(arg1), \T_key
 426        i = (9-\num_initial_blocks)
 427        setreg
 428.rep \num_initial_blocks
 429        vaesenc \T_key, reg_i, reg_i
 430        i = (i+1)
 431        setreg
 432.endr
 433
 434        j = (j+1)
 435        setreg
 436.endr
 437
 438
 439        vmovdqa  16*10(arg1), \T_key
 440        i = (9-\num_initial_blocks)
 441        setreg
 442.rep \num_initial_blocks
 443        vaesenclast      \T_key, reg_i, reg_i
 444        i = (i+1)
 445        setreg
 446.endr
 447
 448        i = (9-\num_initial_blocks)
 449        setreg
 450.rep \num_initial_blocks
 451                vmovdqu (arg3, %r11), \T1
 452                vpxor   \T1, reg_i, reg_i
 453                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
 454                add     $16, %r11
 455.if  \ENC_DEC == DEC
 456                vmovdqa \T1, reg_i
 457.endif
 458                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
 459        i = (i+1)
 460        setreg
 461.endr
 462
 463
 464        i = (8-\num_initial_blocks)
 465        j = (9-\num_initial_blocks)
 466        setreg
 467        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 468
 469.rep \num_initial_blocks
 470        vpxor    reg_i, reg_j, reg_j
 471        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
 472        i = (i+1)
 473        j = (j+1)
 474        setreg
 475.endr
 476        # XMM8 has the combined result here
 477
 478        vmovdqa  \XMM8, TMP1(%rsp)
 479        vmovdqa  \XMM8, \T3
 480
 481        cmp     $128, %r13
 482        jl      _initial_blocks_done\@                  # no need for precomputed constants
 483
 484###############################################################################
 485# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 486                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 487                vmovdqa  \CTR, \XMM1
 488                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 489
 490                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 491                vmovdqa  \CTR, \XMM2
 492                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
 493
 494                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 495                vmovdqa  \CTR, \XMM3
 496                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
 497
 498                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 499                vmovdqa  \CTR, \XMM4
 500                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
 501
 502                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 503                vmovdqa  \CTR, \XMM5
 504                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
 505
 506                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 507                vmovdqa  \CTR, \XMM6
 508                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
 509
 510                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 511                vmovdqa  \CTR, \XMM7
 512                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
 513
 514                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 515                vmovdqa  \CTR, \XMM8
 516                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 517
 518                vmovdqa  (arg1), \T_key
 519                vpxor    \T_key, \XMM1, \XMM1
 520                vpxor    \T_key, \XMM2, \XMM2
 521                vpxor    \T_key, \XMM3, \XMM3
 522                vpxor    \T_key, \XMM4, \XMM4
 523                vpxor    \T_key, \XMM5, \XMM5
 524                vpxor    \T_key, \XMM6, \XMM6
 525                vpxor    \T_key, \XMM7, \XMM7
 526                vpxor    \T_key, \XMM8, \XMM8
 527
 528                i = 1
 529                setreg
 530.rep    9       # do 9 rounds
 531                vmovdqa  16*i(arg1), \T_key
 532                vaesenc  \T_key, \XMM1, \XMM1
 533                vaesenc  \T_key, \XMM2, \XMM2
 534                vaesenc  \T_key, \XMM3, \XMM3
 535                vaesenc  \T_key, \XMM4, \XMM4
 536                vaesenc  \T_key, \XMM5, \XMM5
 537                vaesenc  \T_key, \XMM6, \XMM6
 538                vaesenc  \T_key, \XMM7, \XMM7
 539                vaesenc  \T_key, \XMM8, \XMM8
 540                i = (i+1)
 541                setreg
 542.endr
 543
 544
 545                vmovdqa  16*i(arg1), \T_key
 546                vaesenclast  \T_key, \XMM1, \XMM1
 547                vaesenclast  \T_key, \XMM2, \XMM2
 548                vaesenclast  \T_key, \XMM3, \XMM3
 549                vaesenclast  \T_key, \XMM4, \XMM4
 550                vaesenclast  \T_key, \XMM5, \XMM5
 551                vaesenclast  \T_key, \XMM6, \XMM6
 552                vaesenclast  \T_key, \XMM7, \XMM7
 553                vaesenclast  \T_key, \XMM8, \XMM8
 554
 555                vmovdqu  (arg3, %r11), \T1
 556                vpxor    \T1, \XMM1, \XMM1
 557                vmovdqu  \XMM1, (arg2 , %r11)
 558                .if   \ENC_DEC == DEC
 559                vmovdqa  \T1, \XMM1
 560                .endif
 561
 562                vmovdqu  16*1(arg3, %r11), \T1
 563                vpxor    \T1, \XMM2, \XMM2
 564                vmovdqu  \XMM2, 16*1(arg2 , %r11)
 565                .if   \ENC_DEC == DEC
 566                vmovdqa  \T1, \XMM2
 567                .endif
 568
 569                vmovdqu  16*2(arg3, %r11), \T1
 570                vpxor    \T1, \XMM3, \XMM3
 571                vmovdqu  \XMM3, 16*2(arg2 , %r11)
 572                .if   \ENC_DEC == DEC
 573                vmovdqa  \T1, \XMM3
 574                .endif
 575
 576                vmovdqu  16*3(arg3, %r11), \T1
 577                vpxor    \T1, \XMM4, \XMM4
 578                vmovdqu  \XMM4, 16*3(arg2 , %r11)
 579                .if   \ENC_DEC == DEC
 580                vmovdqa  \T1, \XMM4
 581                .endif
 582
 583                vmovdqu  16*4(arg3, %r11), \T1
 584                vpxor    \T1, \XMM5, \XMM5
 585                vmovdqu  \XMM5, 16*4(arg2 , %r11)
 586                .if   \ENC_DEC == DEC
 587                vmovdqa  \T1, \XMM5
 588                .endif
 589
 590                vmovdqu  16*5(arg3, %r11), \T1
 591                vpxor    \T1, \XMM6, \XMM6
 592                vmovdqu  \XMM6, 16*5(arg2 , %r11)
 593                .if   \ENC_DEC == DEC
 594                vmovdqa  \T1, \XMM6
 595                .endif
 596
 597                vmovdqu  16*6(arg3, %r11), \T1
 598                vpxor    \T1, \XMM7, \XMM7
 599                vmovdqu  \XMM7, 16*6(arg2 , %r11)
 600                .if   \ENC_DEC == DEC
 601                vmovdqa  \T1, \XMM7
 602                .endif
 603
 604                vmovdqu  16*7(arg3, %r11), \T1
 605                vpxor    \T1, \XMM8, \XMM8
 606                vmovdqu  \XMM8, 16*7(arg2 , %r11)
 607                .if   \ENC_DEC == DEC
 608                vmovdqa  \T1, \XMM8
 609                .endif
 610
 611                add     $128, %r11
 612
 613                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
 614                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
 615                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
 616                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
 617                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
 618                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
 619                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
 620                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
 621                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
 622
 623###############################################################################
 624
 625_initial_blocks_done\@:
 626
 627.endm
 628
 629# encrypt 8 blocks at a time
 630# ghash the 8 previously encrypted ciphertext blocks
 631# arg1, arg2, arg3 are used as pointers only, not modified
 632# r11 is the data offset value
 633.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 634
 635        vmovdqa \XMM1, \T2
 636        vmovdqa \XMM2, TMP2(%rsp)
 637        vmovdqa \XMM3, TMP3(%rsp)
 638        vmovdqa \XMM4, TMP4(%rsp)
 639        vmovdqa \XMM5, TMP5(%rsp)
 640        vmovdqa \XMM6, TMP6(%rsp)
 641        vmovdqa \XMM7, TMP7(%rsp)
 642        vmovdqa \XMM8, TMP8(%rsp)
 643
 644.if \loop_idx == in_order
 645                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
 646                vpaddd  ONE(%rip), \XMM1, \XMM2
 647                vpaddd  ONE(%rip), \XMM2, \XMM3
 648                vpaddd  ONE(%rip), \XMM3, \XMM4
 649                vpaddd  ONE(%rip), \XMM4, \XMM5
 650                vpaddd  ONE(%rip), \XMM5, \XMM6
 651                vpaddd  ONE(%rip), \XMM6, \XMM7
 652                vpaddd  ONE(%rip), \XMM7, \XMM8
 653                vmovdqa \XMM8, \CTR
 654
 655                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
 656                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
 657                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
 658                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
 659                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
 660                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
 661                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
 662                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
 663.else
 664                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
 665                vpaddd  ONEf(%rip), \XMM1, \XMM2
 666                vpaddd  ONEf(%rip), \XMM2, \XMM3
 667                vpaddd  ONEf(%rip), \XMM3, \XMM4
 668                vpaddd  ONEf(%rip), \XMM4, \XMM5
 669                vpaddd  ONEf(%rip), \XMM5, \XMM6
 670                vpaddd  ONEf(%rip), \XMM6, \XMM7
 671                vpaddd  ONEf(%rip), \XMM7, \XMM8
 672                vmovdqa \XMM8, \CTR
 673.endif
 674
 675
 676        #######################################################################
 677
 678                vmovdqu (arg1), \T1
 679                vpxor   \T1, \XMM1, \XMM1
 680                vpxor   \T1, \XMM2, \XMM2
 681                vpxor   \T1, \XMM3, \XMM3
 682                vpxor   \T1, \XMM4, \XMM4
 683                vpxor   \T1, \XMM5, \XMM5
 684                vpxor   \T1, \XMM6, \XMM6
 685                vpxor   \T1, \XMM7, \XMM7
 686                vpxor   \T1, \XMM8, \XMM8
 687
 688        #######################################################################
 689
 690
 691
 692
 693
 694                vmovdqu 16*1(arg1), \T1
 695                vaesenc \T1, \XMM1, \XMM1
 696                vaesenc \T1, \XMM2, \XMM2
 697                vaesenc \T1, \XMM3, \XMM3
 698                vaesenc \T1, \XMM4, \XMM4
 699                vaesenc \T1, \XMM5, \XMM5
 700                vaesenc \T1, \XMM6, \XMM6
 701                vaesenc \T1, \XMM7, \XMM7
 702                vaesenc \T1, \XMM8, \XMM8
 703
 704                vmovdqu 16*2(arg1), \T1
 705                vaesenc \T1, \XMM1, \XMM1
 706                vaesenc \T1, \XMM2, \XMM2
 707                vaesenc \T1, \XMM3, \XMM3
 708                vaesenc \T1, \XMM4, \XMM4
 709                vaesenc \T1, \XMM5, \XMM5
 710                vaesenc \T1, \XMM6, \XMM6
 711                vaesenc \T1, \XMM7, \XMM7
 712                vaesenc \T1, \XMM8, \XMM8
 713
 714
 715        #######################################################################
 716
 717        vmovdqa         HashKey_8(arg1), \T5
 718        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
 719        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 720
 721        vpshufd         $0b01001110, \T2, \T6
 722        vpxor           \T2, \T6, \T6
 723
 724        vmovdqa         HashKey_8_k(arg1), \T5
 725        vpclmulqdq      $0x00, \T5, \T6, \T6
 726
 727                vmovdqu 16*3(arg1), \T1
 728                vaesenc \T1, \XMM1, \XMM1
 729                vaesenc \T1, \XMM2, \XMM2
 730                vaesenc \T1, \XMM3, \XMM3
 731                vaesenc \T1, \XMM4, \XMM4
 732                vaesenc \T1, \XMM5, \XMM5
 733                vaesenc \T1, \XMM6, \XMM6
 734                vaesenc \T1, \XMM7, \XMM7
 735                vaesenc \T1, \XMM8, \XMM8
 736
 737        vmovdqa         TMP2(%rsp), \T1
 738        vmovdqa         HashKey_7(arg1), \T5
 739        vpclmulqdq      $0x11, \T5, \T1, \T3
 740        vpxor           \T3, \T4, \T4
 741        vpclmulqdq      $0x00, \T5, \T1, \T3
 742        vpxor           \T3, \T7, \T7
 743
 744        vpshufd         $0b01001110, \T1, \T3
 745        vpxor           \T1, \T3, \T3
 746        vmovdqa         HashKey_7_k(arg1), \T5
 747        vpclmulqdq      $0x10, \T5, \T3, \T3
 748        vpxor           \T3, \T6, \T6
 749
 750                vmovdqu 16*4(arg1), \T1
 751                vaesenc \T1, \XMM1, \XMM1
 752                vaesenc \T1, \XMM2, \XMM2
 753                vaesenc \T1, \XMM3, \XMM3
 754                vaesenc \T1, \XMM4, \XMM4
 755                vaesenc \T1, \XMM5, \XMM5
 756                vaesenc \T1, \XMM6, \XMM6
 757                vaesenc \T1, \XMM7, \XMM7
 758                vaesenc \T1, \XMM8, \XMM8
 759
 760        #######################################################################
 761
 762        vmovdqa         TMP3(%rsp), \T1
 763        vmovdqa         HashKey_6(arg1), \T5
 764        vpclmulqdq      $0x11, \T5, \T1, \T3
 765        vpxor           \T3, \T4, \T4
 766        vpclmulqdq      $0x00, \T5, \T1, \T3
 767        vpxor           \T3, \T7, \T7
 768
 769        vpshufd         $0b01001110, \T1, \T3
 770        vpxor           \T1, \T3, \T3
 771        vmovdqa         HashKey_6_k(arg1), \T5
 772        vpclmulqdq      $0x10, \T5, \T3, \T3
 773        vpxor           \T3, \T6, \T6
 774
 775                vmovdqu 16*5(arg1), \T1
 776                vaesenc \T1, \XMM1, \XMM1
 777                vaesenc \T1, \XMM2, \XMM2
 778                vaesenc \T1, \XMM3, \XMM3
 779                vaesenc \T1, \XMM4, \XMM4
 780                vaesenc \T1, \XMM5, \XMM5
 781                vaesenc \T1, \XMM6, \XMM6
 782                vaesenc \T1, \XMM7, \XMM7
 783                vaesenc \T1, \XMM8, \XMM8
 784
 785        vmovdqa         TMP4(%rsp), \T1
 786        vmovdqa         HashKey_5(arg1), \T5
 787        vpclmulqdq      $0x11, \T5, \T1, \T3
 788        vpxor           \T3, \T4, \T4
 789        vpclmulqdq      $0x00, \T5, \T1, \T3
 790        vpxor           \T3, \T7, \T7
 791
 792        vpshufd         $0b01001110, \T1, \T3
 793        vpxor           \T1, \T3, \T3
 794        vmovdqa         HashKey_5_k(arg1), \T5
 795        vpclmulqdq      $0x10, \T5, \T3, \T3
 796        vpxor           \T3, \T6, \T6
 797
 798                vmovdqu 16*6(arg1), \T1
 799                vaesenc \T1, \XMM1, \XMM1
 800                vaesenc \T1, \XMM2, \XMM2
 801                vaesenc \T1, \XMM3, \XMM3
 802                vaesenc \T1, \XMM4, \XMM4
 803                vaesenc \T1, \XMM5, \XMM5
 804                vaesenc \T1, \XMM6, \XMM6
 805                vaesenc \T1, \XMM7, \XMM7
 806                vaesenc \T1, \XMM8, \XMM8
 807
 808
 809        vmovdqa         TMP5(%rsp), \T1
 810        vmovdqa         HashKey_4(arg1), \T5
 811        vpclmulqdq      $0x11, \T5, \T1, \T3
 812        vpxor           \T3, \T4, \T4
 813        vpclmulqdq      $0x00, \T5, \T1, \T3
 814        vpxor           \T3, \T7, \T7
 815
 816        vpshufd         $0b01001110, \T1, \T3
 817        vpxor           \T1, \T3, \T3
 818        vmovdqa         HashKey_4_k(arg1), \T5
 819        vpclmulqdq      $0x10, \T5, \T3, \T3
 820        vpxor           \T3, \T6, \T6
 821
 822                vmovdqu 16*7(arg1), \T1
 823                vaesenc \T1, \XMM1, \XMM1
 824                vaesenc \T1, \XMM2, \XMM2
 825                vaesenc \T1, \XMM3, \XMM3
 826                vaesenc \T1, \XMM4, \XMM4
 827                vaesenc \T1, \XMM5, \XMM5
 828                vaesenc \T1, \XMM6, \XMM6
 829                vaesenc \T1, \XMM7, \XMM7
 830                vaesenc \T1, \XMM8, \XMM8
 831
 832        vmovdqa         TMP6(%rsp), \T1
 833        vmovdqa         HashKey_3(arg1), \T5
 834        vpclmulqdq      $0x11, \T5, \T1, \T3
 835        vpxor           \T3, \T4, \T4
 836        vpclmulqdq      $0x00, \T5, \T1, \T3
 837        vpxor           \T3, \T7, \T7
 838
 839        vpshufd         $0b01001110, \T1, \T3
 840        vpxor           \T1, \T3, \T3
 841        vmovdqa         HashKey_3_k(arg1), \T5
 842        vpclmulqdq      $0x10, \T5, \T3, \T3
 843        vpxor           \T3, \T6, \T6
 844
 845
 846                vmovdqu 16*8(arg1), \T1
 847                vaesenc \T1, \XMM1, \XMM1
 848                vaesenc \T1, \XMM2, \XMM2
 849                vaesenc \T1, \XMM3, \XMM3
 850                vaesenc \T1, \XMM4, \XMM4
 851                vaesenc \T1, \XMM5, \XMM5
 852                vaesenc \T1, \XMM6, \XMM6
 853                vaesenc \T1, \XMM7, \XMM7
 854                vaesenc \T1, \XMM8, \XMM8
 855
 856        vmovdqa         TMP7(%rsp), \T1
 857        vmovdqa         HashKey_2(arg1), \T5
 858        vpclmulqdq      $0x11, \T5, \T1, \T3
 859        vpxor           \T3, \T4, \T4
 860        vpclmulqdq      $0x00, \T5, \T1, \T3
 861        vpxor           \T3, \T7, \T7
 862
 863        vpshufd         $0b01001110, \T1, \T3
 864        vpxor           \T1, \T3, \T3
 865        vmovdqa         HashKey_2_k(arg1), \T5
 866        vpclmulqdq      $0x10, \T5, \T3, \T3
 867        vpxor           \T3, \T6, \T6
 868
 869        #######################################################################
 870
 871                vmovdqu 16*9(arg1), \T5
 872                vaesenc \T5, \XMM1, \XMM1
 873                vaesenc \T5, \XMM2, \XMM2
 874                vaesenc \T5, \XMM3, \XMM3
 875                vaesenc \T5, \XMM4, \XMM4
 876                vaesenc \T5, \XMM5, \XMM5
 877                vaesenc \T5, \XMM6, \XMM6
 878                vaesenc \T5, \XMM7, \XMM7
 879                vaesenc \T5, \XMM8, \XMM8
 880
 881        vmovdqa         TMP8(%rsp), \T1
 882        vmovdqa         HashKey(arg1), \T5
 883        vpclmulqdq      $0x11, \T5, \T1, \T3
 884        vpxor           \T3, \T4, \T4
 885        vpclmulqdq      $0x00, \T5, \T1, \T3
 886        vpxor           \T3, \T7, \T7
 887
 888        vpshufd         $0b01001110, \T1, \T3
 889        vpxor           \T1, \T3, \T3
 890        vmovdqa         HashKey_k(arg1), \T5
 891        vpclmulqdq      $0x10, \T5, \T3, \T3
 892        vpxor           \T3, \T6, \T6
 893
 894        vpxor           \T4, \T6, \T6
 895        vpxor           \T7, \T6, \T6
 896
 897                vmovdqu 16*10(arg1), \T5
 898
 899        i = 0
 900        j = 1
 901        setreg
 902.rep 8
 903                vpxor   16*i(arg3, %r11), \T5, \T2
 904                .if \ENC_DEC == ENC
 905                vaesenclast     \T2, reg_j, reg_j
 906                .else
 907                vaesenclast     \T2, reg_j, \T3
 908                vmovdqu 16*i(arg3, %r11), reg_j
 909                vmovdqu \T3, 16*i(arg2, %r11)
 910                .endif
 911        i = (i+1)
 912        j = (j+1)
 913        setreg
 914.endr
 915        #######################################################################
 916
 917
 918        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
 919        vpsrldq $8, \T6, \T6                            # shift-R T2 2 DWs
 920        vpxor   \T3, \T7, \T7
 921        vpxor   \T4, \T6, \T6                           # accumulate the results in T6:T7
 922
 923
 924
 925        #######################################################################
 926        #first phase of the reduction
 927        #######################################################################
 928        vpslld  $31, \T7, \T2                           # packed right shifting << 31
 929        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
 930        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
 931
 932        vpxor   \T3, \T2, \T2                           # xor the shifted versions
 933        vpxor   \T4, \T2, \T2
 934
 935        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
 936
 937        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
 938        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
 939        #######################################################################
 940                .if \ENC_DEC == ENC
 941                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
 942                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
 943                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
 944                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
 945                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
 946                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
 947                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
 948                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
 949                .endif
 950
 951        #######################################################################
 952        #second phase of the reduction
 953        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
 954        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
 955        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
 956        vpxor   \T3, \T2, \T2                           # xor the shifted versions
 957        vpxor   \T4, \T2, \T2
 958
 959        vpxor   \T1, \T2, \T2
 960        vpxor   \T2, \T7, \T7
 961        vpxor   \T7, \T6, \T6                           # the result is in T6
 962        #######################################################################
 963
 964                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
 965                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
 966                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
 967                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
 968                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
 969                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
 970                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
 971                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
 972
 973
 974        vpxor   \T6, \XMM1, \XMM1
 975
 976
 977
 978.endm
 979
 980
 981# GHASH the last 4 ciphertext blocks.
 982.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
 983
 984        ## Karatsuba Method
 985
 986
 987        vpshufd         $0b01001110, \XMM1, \T2
 988        vpxor           \XMM1, \T2, \T2
 989        vmovdqa         HashKey_8(arg1), \T5
 990        vpclmulqdq      $0x11, \T5, \XMM1, \T6
 991        vpclmulqdq      $0x00, \T5, \XMM1, \T7
 992
 993        vmovdqa         HashKey_8_k(arg1), \T3
 994        vpclmulqdq      $0x00, \T3, \T2, \XMM1
 995
 996        ######################
 997
 998        vpshufd         $0b01001110, \XMM2, \T2
 999        vpxor           \XMM2, \T2, \T2
1000        vmovdqa         HashKey_7(arg1), \T5
1001        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1002        vpxor           \T4, \T6, \T6
1003
1004        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1005        vpxor           \T4, \T7, \T7
1006
1007        vmovdqa         HashKey_7_k(arg1), \T3
1008        vpclmulqdq      $0x00, \T3, \T2, \T2
1009        vpxor           \T2, \XMM1, \XMM1
1010
1011        ######################
1012
1013        vpshufd         $0b01001110, \XMM3, \T2
1014        vpxor           \XMM3, \T2, \T2
1015        vmovdqa         HashKey_6(arg1), \T5
1016        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1017        vpxor           \T4, \T6, \T6
1018
1019        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1020        vpxor           \T4, \T7, \T7
1021
1022        vmovdqa         HashKey_6_k(arg1), \T3
1023        vpclmulqdq      $0x00, \T3, \T2, \T2
1024        vpxor           \T2, \XMM1, \XMM1
1025
1026        ######################
1027
1028        vpshufd         $0b01001110, \XMM4, \T2
1029        vpxor           \XMM4, \T2, \T2
1030        vmovdqa         HashKey_5(arg1), \T5
1031        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1032        vpxor           \T4, \T6, \T6
1033
1034        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1035        vpxor           \T4, \T7, \T7
1036
1037        vmovdqa         HashKey_5_k(arg1), \T3
1038        vpclmulqdq      $0x00, \T3, \T2, \T2
1039        vpxor           \T2, \XMM1, \XMM1
1040
1041        ######################
1042
1043        vpshufd         $0b01001110, \XMM5, \T2
1044        vpxor           \XMM5, \T2, \T2
1045        vmovdqa         HashKey_4(arg1), \T5
1046        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1047        vpxor           \T4, \T6, \T6
1048
1049        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1050        vpxor           \T4, \T7, \T7
1051
1052        vmovdqa         HashKey_4_k(arg1), \T3
1053        vpclmulqdq      $0x00, \T3, \T2, \T2
1054        vpxor           \T2, \XMM1, \XMM1
1055
1056        ######################
1057
1058        vpshufd         $0b01001110, \XMM6, \T2
1059        vpxor           \XMM6, \T2, \T2
1060        vmovdqa         HashKey_3(arg1), \T5
1061        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1062        vpxor           \T4, \T6, \T6
1063
1064        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1065        vpxor           \T4, \T7, \T7
1066
1067        vmovdqa         HashKey_3_k(arg1), \T3
1068        vpclmulqdq      $0x00, \T3, \T2, \T2
1069        vpxor           \T2, \XMM1, \XMM1
1070
1071        ######################
1072
1073        vpshufd         $0b01001110, \XMM7, \T2
1074        vpxor           \XMM7, \T2, \T2
1075        vmovdqa         HashKey_2(arg1), \T5
1076        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1077        vpxor           \T4, \T6, \T6
1078
1079        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1080        vpxor           \T4, \T7, \T7
1081
1082        vmovdqa         HashKey_2_k(arg1), \T3
1083        vpclmulqdq      $0x00, \T3, \T2, \T2
1084        vpxor           \T2, \XMM1, \XMM1
1085
1086        ######################
1087
1088        vpshufd         $0b01001110, \XMM8, \T2
1089        vpxor           \XMM8, \T2, \T2
1090        vmovdqa         HashKey(arg1), \T5
1091        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1092        vpxor           \T4, \T6, \T6
1093
1094        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1095        vpxor           \T4, \T7, \T7
1096
1097        vmovdqa         HashKey_k(arg1), \T3
1098        vpclmulqdq      $0x00, \T3, \T2, \T2
1099
1100        vpxor           \T2, \XMM1, \XMM1
1101        vpxor           \T6, \XMM1, \XMM1
1102        vpxor           \T7, \XMM1, \T2
1103
1104
1105
1106
1107        vpslldq $8, \T2, \T4
1108        vpsrldq $8, \T2, \T2
1109
1110        vpxor   \T4, \T7, \T7
1111        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1112                                # the accumulated carry-less multiplications
1113
1114        #######################################################################
1115        #first phase of the reduction
1116        vpslld  $31, \T7, \T2   # packed right shifting << 31
1117        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
1118        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
1119
1120        vpxor   \T3, \T2, \T2   # xor the shifted versions
1121        vpxor   \T4, \T2, \T2
1122
1123        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1124
1125        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1126        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1127        #######################################################################
1128
1129
1130        #second phase of the reduction
1131        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1132        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1133        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1134        vpxor   \T3, \T2, \T2   # xor the shifted versions
1135        vpxor   \T4, \T2, \T2
1136
1137        vpxor   \T1, \T2, \T2
1138        vpxor   \T2, \T7, \T7
1139        vpxor   \T7, \T6, \T6   # the result is in T6
1140
1141.endm
1142
1143
1144# combined for GCM encrypt and decrypt functions
1145# clobbering all xmm registers
1146# clobbering r10, r11, r12, r13, r14, r15
1147.macro  GCM_ENC_DEC_AVX     ENC_DEC
1148
1149        #the number of pushes must equal STACK_OFFSET
1150        push    %r12
1151        push    %r13
1152        push    %r14
1153        push    %r15
1154
1155        mov     %rsp, %r14
1156
1157
1158
1159
1160        sub     $VARIABLE_OFFSET, %rsp
1161        and     $~63, %rsp                  # align rsp to 64 bytes
1162
1163
1164        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1165
1166        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1167        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1168
1169        mov     %r13, %r12
1170        shr     $4, %r12
1171        and     $7, %r12
1172        jz      _initial_num_blocks_is_0\@
1173
1174        cmp     $7, %r12
1175        je      _initial_num_blocks_is_7\@
1176        cmp     $6, %r12
1177        je      _initial_num_blocks_is_6\@
1178        cmp     $5, %r12
1179        je      _initial_num_blocks_is_5\@
1180        cmp     $4, %r12
1181        je      _initial_num_blocks_is_4\@
1182        cmp     $3, %r12
1183        je      _initial_num_blocks_is_3\@
1184        cmp     $2, %r12
1185        je      _initial_num_blocks_is_2\@
1186
1187        jmp     _initial_num_blocks_is_1\@
1188
1189_initial_num_blocks_is_7\@:
1190        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1191        sub     $16*7, %r13
1192        jmp     _initial_blocks_encrypted\@
1193
1194_initial_num_blocks_is_6\@:
1195        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1196        sub     $16*6, %r13
1197        jmp     _initial_blocks_encrypted\@
1198
1199_initial_num_blocks_is_5\@:
1200        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1201        sub     $16*5, %r13
1202        jmp     _initial_blocks_encrypted\@
1203
1204_initial_num_blocks_is_4\@:
1205        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1206        sub     $16*4, %r13
1207        jmp     _initial_blocks_encrypted\@
1208
1209_initial_num_blocks_is_3\@:
1210        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1211        sub     $16*3, %r13
1212        jmp     _initial_blocks_encrypted\@
1213
1214_initial_num_blocks_is_2\@:
1215        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1216        sub     $16*2, %r13
1217        jmp     _initial_blocks_encrypted\@
1218
1219_initial_num_blocks_is_1\@:
1220        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1221        sub     $16*1, %r13
1222        jmp     _initial_blocks_encrypted\@
1223
1224_initial_num_blocks_is_0\@:
1225        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1226
1227
1228_initial_blocks_encrypted\@:
1229        cmp     $0, %r13
1230        je      _zero_cipher_left\@
1231
1232        sub     $128, %r13
1233        je      _eight_cipher_left\@
1234
1235
1236
1237
1238        vmovd   %xmm9, %r15d
1239        and     $255, %r15d
1240        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1241
1242
1243_encrypt_by_8_new\@:
1244        cmp     $(255-8), %r15d
1245        jg      _encrypt_by_8\@
1246
1247
1248
1249        add     $8, %r15b
1250        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1251        add     $128, %r11
1252        sub     $128, %r13
1253        jne     _encrypt_by_8_new\@
1254
1255        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1256        jmp     _eight_cipher_left\@
1257
1258_encrypt_by_8\@:
1259        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1260        add     $8, %r15b
1261        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1262        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1263        add     $128, %r11
1264        sub     $128, %r13
1265        jne     _encrypt_by_8_new\@
1266
1267        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1268
1269
1270
1271
1272_eight_cipher_left\@:
1273        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1274
1275
1276_zero_cipher_left\@:
1277        cmp     $16, arg4
1278        jl      _only_less_than_16\@
1279
1280        mov     arg4, %r13
1281        and     $15, %r13                            # r13 = (arg4 mod 16)
1282
1283        je      _multiple_of_16_bytes\@
1284
1285        # handle the last <16 Byte block seperately
1286
1287
1288        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1289        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1290        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1291
1292        sub     $16, %r11
1293        add     %r13, %r11
1294        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1295
1296        lea     SHIFT_MASK+16(%rip), %r12
1297        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1298                                                     # able to shift 16-r13 bytes (r13 is the
1299                                                     # number of bytes in plaintext mod 16)
1300        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1301        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1302        jmp     _final_ghash_mul\@
1303
1304_only_less_than_16\@:
1305        # check for 0 length
1306        mov     arg4, %r13
1307        and     $15, %r13                            # r13 = (arg4 mod 16)
1308
1309        je      _multiple_of_16_bytes\@
1310
1311        # handle the last <16 Byte block seperately
1312
1313
1314        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1315        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1316        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1317
1318
1319        lea     SHIFT_MASK+16(%rip), %r12
1320        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1321                                                     # able to shift 16-r13 bytes (r13 is the
1322                                                     # number of bytes in plaintext mod 16)
1323
1324_get_last_16_byte_loop\@:
1325        movb    (arg3, %r11),  %al
1326        movb    %al,  TMP1 (%rsp , %r11)
1327        add     $1, %r11
1328        cmp     %r13,  %r11
1329        jne     _get_last_16_byte_loop\@
1330
1331        vmovdqu  TMP1(%rsp), %xmm1
1332
1333        sub     $16, %r11
1334
1335_final_ghash_mul\@:
1336        .if  \ENC_DEC ==  DEC
1337        vmovdqa %xmm1, %xmm2
1338        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1339        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1340                                                     # mask out top 16-r13 bytes of xmm9
1341        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1342        vpand   %xmm1, %xmm2, %xmm2
1343        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1344        vpxor   %xmm2, %xmm14, %xmm14
1345        #GHASH computation for the last <16 Byte block
1346        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1347        sub     %r13, %r11
1348        add     $16, %r11
1349        .else
1350        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1351        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1352                                                     # mask out top 16-r13 bytes of xmm9
1353        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1354        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1355        vpxor   %xmm9, %xmm14, %xmm14
1356        #GHASH computation for the last <16 Byte block
1357        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1358        sub     %r13, %r11
1359        add     $16, %r11
1360        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1361        .endif
1362
1363
1364        #############################
1365        # output r13 Bytes
1366        vmovq   %xmm9, %rax
1367        cmp     $8, %r13
1368        jle     _less_than_8_bytes_left\@
1369
1370        mov     %rax, (arg2 , %r11)
1371        add     $8, %r11
1372        vpsrldq $8, %xmm9, %xmm9
1373        vmovq   %xmm9, %rax
1374        sub     $8, %r13
1375
1376_less_than_8_bytes_left\@:
1377        movb    %al, (arg2 , %r11)
1378        add     $1, %r11
1379        shr     $8, %rax
1380        sub     $1, %r13
1381        jne     _less_than_8_bytes_left\@
1382        #############################
1383
1384_multiple_of_16_bytes\@:
1385        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1386        shl     $3, %r12                             # convert into number of bits
1387        vmovd   %r12d, %xmm15                        # len(A) in xmm15
1388
1389        shl     $3, arg4                             # len(C) in bits  (*128)
1390        vmovq   arg4, %xmm1
1391        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1392        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
1393
1394        vpxor   %xmm15, %xmm14, %xmm14
1395        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1396        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1397
1398        mov     arg5, %rax                           # rax = *Y0
1399        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1400
1401        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1402
1403        vpxor   %xmm14, %xmm9, %xmm9
1404
1405
1406
1407_return_T\@:
1408        mov     arg8, %r10              # r10 = authTag
1409        mov     arg9, %r11              # r11 = auth_tag_len
1410
1411        cmp     $16, %r11
1412        je      _T_16\@
1413
1414        cmp     $12, %r11
1415        je      _T_12\@
1416
1417_T_8\@:
1418        vmovq   %xmm9, %rax
1419        mov     %rax, (%r10)
1420        jmp     _return_T_done\@
1421_T_12\@:
1422        vmovq   %xmm9, %rax
1423        mov     %rax, (%r10)
1424        vpsrldq $8, %xmm9, %xmm9
1425        vmovd   %xmm9, %eax
1426        mov     %eax, 8(%r10)
1427        jmp     _return_T_done\@
1428
1429_T_16\@:
1430        vmovdqu %xmm9, (%r10)
1431
1432_return_T_done\@:
1433        mov     %r14, %rsp
1434
1435        pop     %r15
1436        pop     %r14
1437        pop     %r13
1438        pop     %r12
1439.endm
1440
1441
1442#############################################################
1443#void   aesni_gcm_precomp_avx_gen2
1444#        (gcm_data     *my_ctx_data,
1445#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1446#############################################################
1447ENTRY(aesni_gcm_precomp_avx_gen2)
1448        #the number of pushes must equal STACK_OFFSET
1449        push    %r12
1450        push    %r13
1451        push    %r14
1452        push    %r15
1453
1454        mov     %rsp, %r14
1455
1456
1457
1458        sub     $VARIABLE_OFFSET, %rsp
1459        and     $~63, %rsp                  # align rsp to 64 bytes
1460
1461        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1462
1463        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1464        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1465        vmovdqa  %xmm6, %xmm2
1466        vpsllq   $1, %xmm6, %xmm6
1467        vpsrlq   $63, %xmm2, %xmm2
1468        vmovdqa  %xmm2, %xmm1
1469        vpslldq  $8, %xmm2, %xmm2
1470        vpsrldq  $8, %xmm1, %xmm1
1471        vpor     %xmm2, %xmm6, %xmm6
1472        #reduction
1473        vpshufd  $0b00100100, %xmm1, %xmm2
1474        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1475        vpand    POLY(%rip), %xmm2, %xmm2
1476        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1477        #######################################################################
1478        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1479
1480
1481        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1482
1483        mov     %r14, %rsp
1484
1485        pop     %r15
1486        pop     %r14
1487        pop     %r13
1488        pop     %r12
1489        ret
1490ENDPROC(aesni_gcm_precomp_avx_gen2)
1491
1492###############################################################################
1493#void   aesni_gcm_enc_avx_gen2(
1494#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1495#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1496#        const   u8 *in, /* Plaintext input */
1497#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1498#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1499#                       (from Security Association) concatenated with 8 byte
1500#                       Initialisation Vector (from IPSec ESP Payload)
1501#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1502#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1503#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1504#        u8      *auth_tag, /* Authenticated Tag output. */
1505#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1506#                               Valid values are 16 (most likely), 12 or 8. */
1507###############################################################################
1508ENTRY(aesni_gcm_enc_avx_gen2)
1509        GCM_ENC_DEC_AVX     ENC
1510        ret
1511ENDPROC(aesni_gcm_enc_avx_gen2)
1512
1513###############################################################################
1514#void   aesni_gcm_dec_avx_gen2(
1515#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1516#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1517#        const   u8 *in, /* Ciphertext input */
1518#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1519#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1520#                       (from Security Association) concatenated with 8 byte
1521#                       Initialisation Vector (from IPSec ESP Payload)
1522#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1523#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1524#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1525#        u8      *auth_tag, /* Authenticated Tag output. */
1526#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1527#                               Valid values are 16 (most likely), 12 or 8. */
1528###############################################################################
1529ENTRY(aesni_gcm_dec_avx_gen2)
1530        GCM_ENC_DEC_AVX     DEC
1531        ret
1532ENDPROC(aesni_gcm_dec_avx_gen2)
1533#endif /* CONFIG_AS_AVX */
1534
1535#ifdef CONFIG_AS_AVX2
1536###############################################################################
1537# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1538# Input: A and B (128-bits each, bit-reflected)
1539# Output: C = A*B*x mod poly, (i.e. >>1 )
1540# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1541# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1542###############################################################################
1543.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1544
1545        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1546        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1547        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1548        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1549        vpxor           \T3, \GH, \GH
1550
1551
1552        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1553        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1554
1555        vpxor           \T3, \T1, \T1
1556        vpxor           \T2, \GH, \GH
1557
1558        #######################################################################
1559        #first phase of the reduction
1560        vmovdqa         POLY2(%rip), \T3
1561
1562        vpclmulqdq      $0x01, \GH, \T3, \T2
1563        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1564
1565        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1566        #######################################################################
1567        #second phase of the reduction
1568        vpclmulqdq      $0x00, \GH, \T3, \T2
1569        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1570
1571        vpclmulqdq      $0x10, \GH, \T3, \GH
1572        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1573
1574        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1575        #######################################################################
1576        vpxor           \T1, \GH, \GH          # the result is in GH
1577
1578
1579.endm
1580
1581.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1582
1583        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1584        vmovdqa  \HK, \T5
1585        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1586        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1587
1588        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1589        vmovdqa  \T5, HashKey_3(arg1)
1590
1591        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1592        vmovdqa  \T5, HashKey_4(arg1)
1593
1594        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1595        vmovdqa  \T5, HashKey_5(arg1)
1596
1597        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1598        vmovdqa  \T5, HashKey_6(arg1)
1599
1600        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1601        vmovdqa  \T5, HashKey_7(arg1)
1602
1603        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1604        vmovdqa  \T5, HashKey_8(arg1)
1605
1606.endm
1607
1608
1609## if a = number of total plaintext bytes
1610## b = floor(a/16)
1611## num_initial_blocks = b mod 4#
1612## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1613## r10, r11, r12, rax are clobbered
1614## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1615
1616.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1617        i = (8-\num_initial_blocks)
1618        setreg
1619
1620        mov     arg6, %r10                       # r10 = AAD
1621        mov     arg7, %r12                       # r12 = aadLen
1622
1623
1624        mov     %r12, %r11
1625
1626        vpxor   reg_i, reg_i, reg_i
1627_get_AAD_loop\@:
1628        vmovd   (%r10), \T1
1629        vpslldq $12, \T1, \T1
1630        vpsrldq $4, reg_i, reg_i
1631        vpxor   \T1, reg_i, reg_i
1632
1633        add     $4, %r10
1634        sub     $4, %r12
1635        jg      _get_AAD_loop\@
1636
1637
1638        cmp     $16, %r11
1639        je      _get_AAD_loop2_done\@
1640        mov     $16, %r12
1641
1642_get_AAD_loop2\@:
1643        vpsrldq $4, reg_i, reg_i
1644        sub     $4, %r12
1645        cmp     %r11, %r12
1646        jg      _get_AAD_loop2\@
1647
1648_get_AAD_loop2_done\@:
1649
1650        #byte-reflect the AAD data
1651        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1652
1653        # initialize the data pointer offset as zero
1654        xor     %r11, %r11
1655
1656        # start AES for num_initial_blocks blocks
1657        mov     arg5, %rax                     # rax = *Y0
1658        vmovdqu (%rax), \CTR                   # CTR = Y0
1659        vpshufb SHUF_MASK(%rip), \CTR, \CTR
1660
1661
1662        i = (9-\num_initial_blocks)
1663        setreg
1664.rep \num_initial_blocks
1665                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1666                vmovdqa \CTR, reg_i
1667                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1668        i = (i+1)
1669        setreg
1670.endr
1671
1672        vmovdqa  (arg1), \T_key
1673        i = (9-\num_initial_blocks)
1674        setreg
1675.rep \num_initial_blocks
1676                vpxor   \T_key, reg_i, reg_i
1677        i = (i+1)
1678        setreg
1679.endr
1680
1681        j = 1
1682        setreg
1683.rep 9
1684        vmovdqa  16*j(arg1), \T_key
1685        i = (9-\num_initial_blocks)
1686        setreg
1687.rep \num_initial_blocks
1688        vaesenc \T_key, reg_i, reg_i
1689        i = (i+1)
1690        setreg
1691.endr
1692
1693        j = (j+1)
1694        setreg
1695.endr
1696
1697
1698        vmovdqa  16*10(arg1), \T_key
1699        i = (9-\num_initial_blocks)
1700        setreg
1701.rep \num_initial_blocks
1702        vaesenclast      \T_key, reg_i, reg_i
1703        i = (i+1)
1704        setreg
1705.endr
1706
1707        i = (9-\num_initial_blocks)
1708        setreg
1709.rep \num_initial_blocks
1710                vmovdqu (arg3, %r11), \T1
1711                vpxor   \T1, reg_i, reg_i
1712                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1713                                                       # num_initial_blocks blocks
1714                add     $16, %r11
1715.if  \ENC_DEC == DEC
1716                vmovdqa \T1, reg_i
1717.endif
1718                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1719        i = (i+1)
1720        setreg
1721.endr
1722
1723
1724        i = (8-\num_initial_blocks)
1725        j = (9-\num_initial_blocks)
1726        setreg
1727        GHASH_MUL_AVX2       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1728
1729.rep \num_initial_blocks
1730        vpxor    reg_i, reg_j, reg_j
1731        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1732        i = (i+1)
1733        j = (j+1)
1734        setreg
1735.endr
1736        # XMM8 has the combined result here
1737
1738        vmovdqa  \XMM8, TMP1(%rsp)
1739        vmovdqa  \XMM8, \T3
1740
1741        cmp     $128, %r13
1742        jl      _initial_blocks_done\@                  # no need for precomputed constants
1743
1744###############################################################################
1745# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1746                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1747                vmovdqa  \CTR, \XMM1
1748                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1749
1750                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1751                vmovdqa  \CTR, \XMM2
1752                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1753
1754                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1755                vmovdqa  \CTR, \XMM3
1756                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1757
1758                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1759                vmovdqa  \CTR, \XMM4
1760                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1761
1762                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1763                vmovdqa  \CTR, \XMM5
1764                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1765
1766                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1767                vmovdqa  \CTR, \XMM6
1768                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1769
1770                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1771                vmovdqa  \CTR, \XMM7
1772                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1773
1774                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1775                vmovdqa  \CTR, \XMM8
1776                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1777
1778                vmovdqa  (arg1), \T_key
1779                vpxor    \T_key, \XMM1, \XMM1
1780                vpxor    \T_key, \XMM2, \XMM2
1781                vpxor    \T_key, \XMM3, \XMM3
1782                vpxor    \T_key, \XMM4, \XMM4
1783                vpxor    \T_key, \XMM5, \XMM5
1784                vpxor    \T_key, \XMM6, \XMM6
1785                vpxor    \T_key, \XMM7, \XMM7
1786                vpxor    \T_key, \XMM8, \XMM8
1787
1788                i = 1
1789                setreg
1790.rep    9       # do 9 rounds
1791                vmovdqa  16*i(arg1), \T_key
1792                vaesenc  \T_key, \XMM1, \XMM1
1793                vaesenc  \T_key, \XMM2, \XMM2
1794                vaesenc  \T_key, \XMM3, \XMM3
1795                vaesenc  \T_key, \XMM4, \XMM4
1796                vaesenc  \T_key, \XMM5, \XMM5
1797                vaesenc  \T_key, \XMM6, \XMM6
1798                vaesenc  \T_key, \XMM7, \XMM7
1799                vaesenc  \T_key, \XMM8, \XMM8
1800                i = (i+1)
1801                setreg
1802.endr
1803
1804
1805                vmovdqa  16*i(arg1), \T_key
1806                vaesenclast  \T_key, \XMM1, \XMM1
1807                vaesenclast  \T_key, \XMM2, \XMM2
1808                vaesenclast  \T_key, \XMM3, \XMM3
1809                vaesenclast  \T_key, \XMM4, \XMM4
1810                vaesenclast  \T_key, \XMM5, \XMM5
1811                vaesenclast  \T_key, \XMM6, \XMM6
1812                vaesenclast  \T_key, \XMM7, \XMM7
1813                vaesenclast  \T_key, \XMM8, \XMM8
1814
1815                vmovdqu  (arg3, %r11), \T1
1816                vpxor    \T1, \XMM1, \XMM1
1817                vmovdqu  \XMM1, (arg2 , %r11)
1818                .if   \ENC_DEC == DEC
1819                vmovdqa  \T1, \XMM1
1820                .endif
1821
1822                vmovdqu  16*1(arg3, %r11), \T1
1823                vpxor    \T1, \XMM2, \XMM2
1824                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1825                .if   \ENC_DEC == DEC
1826                vmovdqa  \T1, \XMM2
1827                .endif
1828
1829                vmovdqu  16*2(arg3, %r11), \T1
1830                vpxor    \T1, \XMM3, \XMM3
1831                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1832                .if   \ENC_DEC == DEC
1833                vmovdqa  \T1, \XMM3
1834                .endif
1835
1836                vmovdqu  16*3(arg3, %r11), \T1
1837                vpxor    \T1, \XMM4, \XMM4
1838                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1839                .if   \ENC_DEC == DEC
1840                vmovdqa  \T1, \XMM4
1841                .endif
1842
1843                vmovdqu  16*4(arg3, %r11), \T1
1844                vpxor    \T1, \XMM5, \XMM5
1845                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1846                .if   \ENC_DEC == DEC
1847                vmovdqa  \T1, \XMM5
1848                .endif
1849
1850                vmovdqu  16*5(arg3, %r11), \T1
1851                vpxor    \T1, \XMM6, \XMM6
1852                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1853                .if   \ENC_DEC == DEC
1854                vmovdqa  \T1, \XMM6
1855                .endif
1856
1857                vmovdqu  16*6(arg3, %r11), \T1
1858                vpxor    \T1, \XMM7, \XMM7
1859                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1860                .if   \ENC_DEC == DEC
1861                vmovdqa  \T1, \XMM7
1862                .endif
1863
1864                vmovdqu  16*7(arg3, %r11), \T1
1865                vpxor    \T1, \XMM8, \XMM8
1866                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1867                .if   \ENC_DEC == DEC
1868                vmovdqa  \T1, \XMM8
1869                .endif
1870
1871                add     $128, %r11
1872
1873                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1874                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1875                                                           # the corresponding ciphertext
1876                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1877                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1878                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1879                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1880                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1881                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1882                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1883
1884###############################################################################
1885
1886_initial_blocks_done\@:
1887
1888
1889.endm
1890
1891
1892
1893# encrypt 8 blocks at a time
1894# ghash the 8 previously encrypted ciphertext blocks
1895# arg1, arg2, arg3 are used as pointers only, not modified
1896# r11 is the data offset value
1897.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1898
1899        vmovdqa \XMM1, \T2
1900        vmovdqa \XMM2, TMP2(%rsp)
1901        vmovdqa \XMM3, TMP3(%rsp)
1902        vmovdqa \XMM4, TMP4(%rsp)
1903        vmovdqa \XMM5, TMP5(%rsp)
1904        vmovdqa \XMM6, TMP6(%rsp)
1905        vmovdqa \XMM7, TMP7(%rsp)
1906        vmovdqa \XMM8, TMP8(%rsp)
1907
1908.if \loop_idx == in_order
1909                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
1910                vpaddd  ONE(%rip), \XMM1, \XMM2
1911                vpaddd  ONE(%rip), \XMM2, \XMM3
1912                vpaddd  ONE(%rip), \XMM3, \XMM4
1913                vpaddd  ONE(%rip), \XMM4, \XMM5
1914                vpaddd  ONE(%rip), \XMM5, \XMM6
1915                vpaddd  ONE(%rip), \XMM6, \XMM7
1916                vpaddd  ONE(%rip), \XMM7, \XMM8
1917                vmovdqa \XMM8, \CTR
1918
1919                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1920                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1921                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1922                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1923                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1924                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1925                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1926                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1927.else
1928                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
1929                vpaddd  ONEf(%rip), \XMM1, \XMM2
1930                vpaddd  ONEf(%rip), \XMM2, \XMM3
1931                vpaddd  ONEf(%rip), \XMM3, \XMM4
1932                vpaddd  ONEf(%rip), \XMM4, \XMM5
1933                vpaddd  ONEf(%rip), \XMM5, \XMM6
1934                vpaddd  ONEf(%rip), \XMM6, \XMM7
1935                vpaddd  ONEf(%rip), \XMM7, \XMM8
1936                vmovdqa \XMM8, \CTR
1937.endif
1938
1939
1940        #######################################################################
1941
1942                vmovdqu (arg1), \T1
1943                vpxor   \T1, \XMM1, \XMM1
1944                vpxor   \T1, \XMM2, \XMM2
1945                vpxor   \T1, \XMM3, \XMM3
1946                vpxor   \T1, \XMM4, \XMM4
1947                vpxor   \T1, \XMM5, \XMM5
1948                vpxor   \T1, \XMM6, \XMM6
1949                vpxor   \T1, \XMM7, \XMM7
1950                vpxor   \T1, \XMM8, \XMM8
1951
1952        #######################################################################
1953
1954
1955
1956
1957
1958                vmovdqu 16*1(arg1), \T1
1959                vaesenc \T1, \XMM1, \XMM1
1960                vaesenc \T1, \XMM2, \XMM2
1961                vaesenc \T1, \XMM3, \XMM3
1962                vaesenc \T1, \XMM4, \XMM4
1963                vaesenc \T1, \XMM5, \XMM5
1964                vaesenc \T1, \XMM6, \XMM6
1965                vaesenc \T1, \XMM7, \XMM7
1966                vaesenc \T1, \XMM8, \XMM8
1967
1968                vmovdqu 16*2(arg1), \T1
1969                vaesenc \T1, \XMM1, \XMM1
1970                vaesenc \T1, \XMM2, \XMM2
1971                vaesenc \T1, \XMM3, \XMM3
1972                vaesenc \T1, \XMM4, \XMM4
1973                vaesenc \T1, \XMM5, \XMM5
1974                vaesenc \T1, \XMM6, \XMM6
1975                vaesenc \T1, \XMM7, \XMM7
1976                vaesenc \T1, \XMM8, \XMM8
1977
1978
1979        #######################################################################
1980
1981        vmovdqa         HashKey_8(arg1), \T5
1982        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
1983        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
1984        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
1985        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
1986        vpxor           \T5, \T6, \T6
1987
1988                vmovdqu 16*3(arg1), \T1
1989                vaesenc \T1, \XMM1, \XMM1
1990                vaesenc \T1, \XMM2, \XMM2
1991                vaesenc \T1, \XMM3, \XMM3
1992                vaesenc \T1, \XMM4, \XMM4
1993                vaesenc \T1, \XMM5, \XMM5
1994                vaesenc \T1, \XMM6, \XMM6
1995                vaesenc \T1, \XMM7, \XMM7
1996                vaesenc \T1, \XMM8, \XMM8
1997
1998        vmovdqa         TMP2(%rsp), \T1
1999        vmovdqa         HashKey_7(arg1), \T5
2000        vpclmulqdq      $0x11, \T5, \T1, \T3
2001        vpxor           \T3, \T4, \T4
2002
2003        vpclmulqdq      $0x00, \T5, \T1, \T3
2004        vpxor           \T3, \T7, \T7
2005
2006        vpclmulqdq      $0x01, \T5, \T1, \T3
2007        vpxor           \T3, \T6, \T6
2008
2009        vpclmulqdq      $0x10, \T5, \T1, \T3
2010        vpxor           \T3, \T6, \T6
2011
2012                vmovdqu 16*4(arg1), \T1
2013                vaesenc \T1, \XMM1, \XMM1
2014                vaesenc \T1, \XMM2, \XMM2
2015                vaesenc \T1, \XMM3, \XMM3
2016                vaesenc \T1, \XMM4, \XMM4
2017                vaesenc \T1, \XMM5, \XMM5
2018                vaesenc \T1, \XMM6, \XMM6
2019                vaesenc \T1, \XMM7, \XMM7
2020                vaesenc \T1, \XMM8, \XMM8
2021
2022        #######################################################################
2023
2024        vmovdqa         TMP3(%rsp), \T1
2025        vmovdqa         HashKey_6(arg1), \T5
2026        vpclmulqdq      $0x11, \T5, \T1, \T3
2027        vpxor           \T3, \T4, \T4
2028
2029        vpclmulqdq      $0x00, \T5, \T1, \T3
2030        vpxor           \T3, \T7, \T7
2031
2032        vpclmulqdq      $0x01, \T5, \T1, \T3
2033        vpxor           \T3, \T6, \T6
2034
2035        vpclmulqdq      $0x10, \T5, \T1, \T3
2036        vpxor           \T3, \T6, \T6
2037
2038                vmovdqu 16*5(arg1), \T1
2039                vaesenc \T1, \XMM1, \XMM1
2040                vaesenc \T1, \XMM2, \XMM2
2041                vaesenc \T1, \XMM3, \XMM3
2042                vaesenc \T1, \XMM4, \XMM4
2043                vaesenc \T1, \XMM5, \XMM5
2044                vaesenc \T1, \XMM6, \XMM6
2045                vaesenc \T1, \XMM7, \XMM7
2046                vaesenc \T1, \XMM8, \XMM8
2047
2048        vmovdqa         TMP4(%rsp), \T1
2049        vmovdqa         HashKey_5(arg1), \T5
2050        vpclmulqdq      $0x11, \T5, \T1, \T3
2051        vpxor           \T3, \T4, \T4
2052
2053        vpclmulqdq      $0x00, \T5, \T1, \T3
2054        vpxor           \T3, \T7, \T7
2055
2056        vpclmulqdq      $0x01, \T5, \T1, \T3
2057        vpxor           \T3, \T6, \T6
2058
2059        vpclmulqdq      $0x10, \T5, \T1, \T3
2060        vpxor           \T3, \T6, \T6
2061
2062                vmovdqu 16*6(arg1), \T1
2063                vaesenc \T1, \XMM1, \XMM1
2064                vaesenc \T1, \XMM2, \XMM2
2065                vaesenc \T1, \XMM3, \XMM3
2066                vaesenc \T1, \XMM4, \XMM4
2067                vaesenc \T1, \XMM5, \XMM5
2068                vaesenc \T1, \XMM6, \XMM6
2069                vaesenc \T1, \XMM7, \XMM7
2070                vaesenc \T1, \XMM8, \XMM8
2071
2072
2073        vmovdqa         TMP5(%rsp), \T1
2074        vmovdqa         HashKey_4(arg1), \T5
2075        vpclmulqdq      $0x11, \T5, \T1, \T3
2076        vpxor           \T3, \T4, \T4
2077
2078        vpclmulqdq      $0x00, \T5, \T1, \T3
2079        vpxor           \T3, \T7, \T7
2080
2081        vpclmulqdq      $0x01, \T5, \T1, \T3
2082        vpxor           \T3, \T6, \T6
2083
2084        vpclmulqdq      $0x10, \T5, \T1, \T3
2085        vpxor           \T3, \T6, \T6
2086
2087                vmovdqu 16*7(arg1), \T1
2088                vaesenc \T1, \XMM1, \XMM1
2089                vaesenc \T1, \XMM2, \XMM2
2090                vaesenc \T1, \XMM3, \XMM3
2091                vaesenc \T1, \XMM4, \XMM4
2092                vaesenc \T1, \XMM5, \XMM5
2093                vaesenc \T1, \XMM6, \XMM6
2094                vaesenc \T1, \XMM7, \XMM7
2095                vaesenc \T1, \XMM8, \XMM8
2096
2097        vmovdqa         TMP6(%rsp), \T1
2098        vmovdqa         HashKey_3(arg1), \T5
2099        vpclmulqdq      $0x11, \T5, \T1, \T3
2100        vpxor           \T3, \T4, \T4
2101
2102        vpclmulqdq      $0x00, \T5, \T1, \T3
2103        vpxor           \T3, \T7, \T7
2104
2105        vpclmulqdq      $0x01, \T5, \T1, \T3
2106        vpxor           \T3, \T6, \T6
2107
2108        vpclmulqdq      $0x10, \T5, \T1, \T3
2109        vpxor           \T3, \T6, \T6
2110
2111                vmovdqu 16*8(arg1), \T1
2112                vaesenc \T1, \XMM1, \XMM1
2113                vaesenc \T1, \XMM2, \XMM2
2114                vaesenc \T1, \XMM3, \XMM3
2115                vaesenc \T1, \XMM4, \XMM4
2116                vaesenc \T1, \XMM5, \XMM5
2117                vaesenc \T1, \XMM6, \XMM6
2118                vaesenc \T1, \XMM7, \XMM7
2119                vaesenc \T1, \XMM8, \XMM8
2120
2121        vmovdqa         TMP7(%rsp), \T1
2122        vmovdqa         HashKey_2(arg1), \T5
2123        vpclmulqdq      $0x11, \T5, \T1, \T3
2124        vpxor           \T3, \T4, \T4
2125
2126        vpclmulqdq      $0x00, \T5, \T1, \T3
2127        vpxor           \T3, \T7, \T7
2128
2129        vpclmulqdq      $0x01, \T5, \T1, \T3
2130        vpxor           \T3, \T6, \T6
2131
2132        vpclmulqdq      $0x10, \T5, \T1, \T3
2133        vpxor           \T3, \T6, \T6
2134
2135
2136        #######################################################################
2137
2138                vmovdqu 16*9(arg1), \T5
2139                vaesenc \T5, \XMM1, \XMM1
2140                vaesenc \T5, \XMM2, \XMM2
2141                vaesenc \T5, \XMM3, \XMM3
2142                vaesenc \T5, \XMM4, \XMM4
2143                vaesenc \T5, \XMM5, \XMM5
2144                vaesenc \T5, \XMM6, \XMM6
2145                vaesenc \T5, \XMM7, \XMM7
2146                vaesenc \T5, \XMM8, \XMM8
2147
2148        vmovdqa         TMP8(%rsp), \T1
2149        vmovdqa         HashKey(arg1), \T5
2150
2151        vpclmulqdq      $0x00, \T5, \T1, \T3
2152        vpxor           \T3, \T7, \T7
2153
2154        vpclmulqdq      $0x01, \T5, \T1, \T3
2155        vpxor           \T3, \T6, \T6
2156
2157        vpclmulqdq      $0x10, \T5, \T1, \T3
2158        vpxor           \T3, \T6, \T6
2159
2160        vpclmulqdq      $0x11, \T5, \T1, \T3
2161        vpxor           \T3, \T4, \T1
2162
2163
2164                vmovdqu 16*10(arg1), \T5
2165
2166        i = 0
2167        j = 1
2168        setreg
2169.rep 8
2170                vpxor   16*i(arg3, %r11), \T5, \T2
2171                .if \ENC_DEC == ENC
2172                vaesenclast     \T2, reg_j, reg_j
2173                .else
2174                vaesenclast     \T2, reg_j, \T3
2175                vmovdqu 16*i(arg3, %r11), reg_j
2176                vmovdqu \T3, 16*i(arg2, %r11)
2177                .endif
2178        i = (i+1)
2179        j = (j+1)
2180        setreg
2181.endr
2182        #######################################################################
2183
2184
2185        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
2186        vpsrldq $8, \T6, \T6                            # shift-R T2 2 DWs
2187        vpxor   \T3, \T7, \T7
2188        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7
2189
2190
2191
2192        #######################################################################
2193        #first phase of the reduction
2194        vmovdqa         POLY2(%rip), \T3
2195
2196        vpclmulqdq      $0x01, \T7, \T3, \T2
2197        vpslldq         $8, \T2, \T2                    # shift-L xmm2 2 DWs
2198
2199        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
2200        #######################################################################
2201                .if \ENC_DEC == ENC
2202                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
2203                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
2204                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
2205                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
2206                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
2207                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
2208                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
2209                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
2210                .endif
2211
2212        #######################################################################
2213        #second phase of the reduction
2214        vpclmulqdq      $0x00, \T7, \T3, \T2
2215        vpsrldq         $4, \T2, \T2                    # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2216
2217        vpclmulqdq      $0x10, \T7, \T3, \T4
2218        vpslldq         $4, \T4, \T4                    # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2219
2220        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
2221        #######################################################################
2222        vpxor           \T4, \T1, \T1                   # the result is in T1
2223
2224                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
2225                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
2226                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
2227                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
2228                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
2229                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
2230                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
2231                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
2232
2233
2234        vpxor   \T1, \XMM1, \XMM1
2235
2236
2237
2238.endm
2239
2240
2241# GHASH the last 4 ciphertext blocks.
2242.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2243
2244        ## Karatsuba Method
2245
2246        vmovdqa         HashKey_8(arg1), \T5
2247
2248        vpshufd         $0b01001110, \XMM1, \T2
2249        vpshufd         $0b01001110, \T5, \T3
2250        vpxor           \XMM1, \T2, \T2
2251        vpxor           \T5, \T3, \T3
2252
2253        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2254        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2255
2256        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2257
2258        ######################
2259
2260        vmovdqa         HashKey_7(arg1), \T5
2261        vpshufd         $0b01001110, \XMM2, \T2
2262        vpshufd         $0b01001110, \T5, \T3
2263        vpxor           \XMM2, \T2, \T2
2264        vpxor           \T5, \T3, \T3
2265
2266        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2267        vpxor           \T4, \T6, \T6
2268
2269        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2270        vpxor           \T4, \T7, \T7
2271
2272        vpclmulqdq      $0x00, \T3, \T2, \T2
2273
2274        vpxor           \T2, \XMM1, \XMM1
2275
2276        ######################
2277
2278        vmovdqa         HashKey_6(arg1), \T5
2279        vpshufd         $0b01001110, \XMM3, \T2
2280        vpshufd         $0b01001110, \T5, \T3
2281        vpxor           \XMM3, \T2, \T2
2282        vpxor           \T5, \T3, \T3
2283
2284        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2285        vpxor           \T4, \T6, \T6
2286
2287        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2288        vpxor           \T4, \T7, \T7
2289
2290        vpclmulqdq      $0x00, \T3, \T2, \T2
2291
2292        vpxor           \T2, \XMM1, \XMM1
2293
2294        ######################
2295
2296        vmovdqa         HashKey_5(arg1), \T5
2297        vpshufd         $0b01001110, \XMM4, \T2
2298        vpshufd         $0b01001110, \T5, \T3
2299        vpxor           \XMM4, \T2, \T2
2300        vpxor           \T5, \T3, \T3
2301
2302        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2303        vpxor           \T4, \T6, \T6
2304
2305        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2306        vpxor           \T4, \T7, \T7
2307
2308        vpclmulqdq      $0x00, \T3, \T2, \T2
2309
2310        vpxor           \T2, \XMM1, \XMM1
2311
2312        ######################
2313
2314        vmovdqa         HashKey_4(arg1), \T5
2315        vpshufd         $0b01001110, \XMM5, \T2
2316        vpshufd         $0b01001110, \T5, \T3
2317        vpxor           \XMM5, \T2, \T2
2318        vpxor           \T5, \T3, \T3
2319
2320        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2321        vpxor           \T4, \T6, \T6
2322
2323        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2324        vpxor           \T4, \T7, \T7
2325
2326        vpclmulqdq      $0x00, \T3, \T2, \T2
2327
2328        vpxor           \T2, \XMM1, \XMM1
2329
2330        ######################
2331
2332        vmovdqa         HashKey_3(arg1), \T5
2333        vpshufd         $0b01001110, \XMM6, \T2
2334        vpshufd         $0b01001110, \T5, \T3
2335        vpxor           \XMM6, \T2, \T2
2336        vpxor           \T5, \T3, \T3
2337
2338        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2339        vpxor           \T4, \T6, \T6
2340
2341        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2342        vpxor           \T4, \T7, \T7
2343
2344        vpclmulqdq      $0x00, \T3, \T2, \T2
2345
2346        vpxor           \T2, \XMM1, \XMM1
2347
2348        ######################
2349
2350        vmovdqa         HashKey_2(arg1), \T5
2351        vpshufd         $0b01001110, \XMM7, \T2
2352        vpshufd         $0b01001110, \T5, \T3
2353        vpxor           \XMM7, \T2, \T2
2354        vpxor           \T5, \T3, \T3
2355
2356        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2357        vpxor           \T4, \T6, \T6
2358
2359        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2360        vpxor           \T4, \T7, \T7
2361
2362        vpclmulqdq      $0x00, \T3, \T2, \T2
2363
2364        vpxor           \T2, \XMM1, \XMM1
2365
2366        ######################
2367
2368        vmovdqa         HashKey(arg1), \T5
2369        vpshufd         $0b01001110, \XMM8, \T2
2370        vpshufd         $0b01001110, \T5, \T3
2371        vpxor           \XMM8, \T2, \T2
2372        vpxor           \T5, \T3, \T3
2373
2374        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2375        vpxor           \T4, \T6, \T6
2376
2377        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2378        vpxor           \T4, \T7, \T7
2379
2380        vpclmulqdq      $0x00, \T3, \T2, \T2
2381
2382        vpxor           \T2, \XMM1, \XMM1
2383        vpxor           \T6, \XMM1, \XMM1
2384        vpxor           \T7, \XMM1, \T2
2385
2386
2387
2388
2389        vpslldq $8, \T2, \T4
2390        vpsrldq $8, \T2, \T2
2391
2392        vpxor   \T4, \T7, \T7
2393        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2394                                                   # accumulated carry-less multiplications
2395
2396        #######################################################################
2397        #first phase of the reduction
2398        vmovdqa         POLY2(%rip), \T3
2399
2400        vpclmulqdq      $0x01, \T7, \T3, \T2
2401        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2402
2403        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2404        #######################################################################
2405
2406
2407        #second phase of the reduction
2408        vpclmulqdq      $0x00, \T7, \T3, \T2
2409        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2410
2411        vpclmulqdq      $0x10, \T7, \T3, \T4
2412        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2413
2414        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2415        #######################################################################
2416        vpxor           \T4, \T6, \T6              # the result is in T6
2417.endm
2418
2419
2420
2421# combined for GCM encrypt and decrypt functions
2422# clobbering all xmm registers
2423# clobbering r10, r11, r12, r13, r14, r15
2424.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2425
2426        #the number of pushes must equal STACK_OFFSET
2427        push    %r12
2428        push    %r13
2429        push    %r14
2430        push    %r15
2431
2432        mov     %rsp, %r14
2433
2434
2435
2436
2437        sub     $VARIABLE_OFFSET, %rsp
2438        and     $~63, %rsp                         # align rsp to 64 bytes
2439
2440
2441        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2442
2443        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2444        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2445
2446        mov     %r13, %r12
2447        shr     $4, %r12
2448        and     $7, %r12
2449        jz      _initial_num_blocks_is_0\@
2450
2451        cmp     $7, %r12
2452        je      _initial_num_blocks_is_7\@
2453        cmp     $6, %r12
2454        je      _initial_num_blocks_is_6\@
2455        cmp     $5, %r12
2456        je      _initial_num_blocks_is_5\@
2457        cmp     $4, %r12
2458        je      _initial_num_blocks_is_4\@
2459        cmp     $3, %r12
2460        je      _initial_num_blocks_is_3\@
2461        cmp     $2, %r12
2462        je      _initial_num_blocks_is_2\@
2463
2464        jmp     _initial_num_blocks_is_1\@
2465
2466_initial_num_blocks_is_7\@:
2467        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2468        sub     $16*7, %r13
2469        jmp     _initial_blocks_encrypted\@
2470
2471_initial_num_blocks_is_6\@:
2472        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2473        sub     $16*6, %r13
2474        jmp     _initial_blocks_encrypted\@
2475
2476_initial_num_blocks_is_5\@:
2477        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2478        sub     $16*5, %r13
2479        jmp     _initial_blocks_encrypted\@
2480
2481_initial_num_blocks_is_4\@:
2482        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2483        sub     $16*4, %r13
2484        jmp     _initial_blocks_encrypted\@
2485
2486_initial_num_blocks_is_3\@:
2487        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2488        sub     $16*3, %r13
2489        jmp     _initial_blocks_encrypted\@
2490
2491_initial_num_blocks_is_2\@:
2492        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2493        sub     $16*2, %r13
2494        jmp     _initial_blocks_encrypted\@
2495
2496_initial_num_blocks_is_1\@:
2497        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2498        sub     $16*1, %r13
2499        jmp     _initial_blocks_encrypted\@
2500
2501_initial_num_blocks_is_0\@:
2502        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2503
2504
2505_initial_blocks_encrypted\@:
2506        cmp     $0, %r13
2507        je      _zero_cipher_left\@
2508
2509        sub     $128, %r13
2510        je      _eight_cipher_left\@
2511
2512
2513
2514
2515        vmovd   %xmm9, %r15d
2516        and     $255, %r15d
2517        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2518
2519
2520_encrypt_by_8_new\@:
2521        cmp     $(255-8), %r15d
2522        jg      _encrypt_by_8\@
2523
2524
2525
2526        add     $8, %r15b
2527        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2528        add     $128, %r11
2529        sub     $128, %r13
2530        jne     _encrypt_by_8_new\@
2531
2532        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2533        jmp     _eight_cipher_left\@
2534
2535_encrypt_by_8\@:
2536        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2537        add     $8, %r15b
2538        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2539        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2540        add     $128, %r11
2541        sub     $128, %r13
2542        jne     _encrypt_by_8_new\@
2543
2544        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2545
2546
2547
2548
2549_eight_cipher_left\@:
2550        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2551
2552
2553_zero_cipher_left\@:
2554        cmp     $16, arg4
2555        jl      _only_less_than_16\@
2556
2557        mov     arg4, %r13
2558        and     $15, %r13                            # r13 = (arg4 mod 16)
2559
2560        je      _multiple_of_16_bytes\@
2561
2562        # handle the last <16 Byte block seperately
2563
2564
2565        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2566        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2567        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2568
2569        sub     $16, %r11
2570        add     %r13, %r11
2571        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2572
2573        lea     SHIFT_MASK+16(%rip), %r12
2574        sub     %r13, %r12                           # adjust the shuffle mask pointer
2575                                                     # to be able to shift 16-r13 bytes
2576                                                     # (r13 is the number of bytes in plaintext mod 16)
2577        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2578        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2579        jmp     _final_ghash_mul\@
2580
2581_only_less_than_16\@:
2582        # check for 0 length
2583        mov     arg4, %r13
2584        and     $15, %r13                            # r13 = (arg4 mod 16)
2585
2586        je      _multiple_of_16_bytes\@
2587
2588        # handle the last <16 Byte block seperately
2589
2590
2591        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2592        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2593        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2594
2595
2596        lea     SHIFT_MASK+16(%rip), %r12
2597        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2598                                                     # able to shift 16-r13 bytes (r13 is the
2599                                                     # number of bytes in plaintext mod 16)
2600
2601_get_last_16_byte_loop\@:
2602        movb    (arg3, %r11),  %al
2603        movb    %al,  TMP1 (%rsp , %r11)
2604        add     $1, %r11
2605        cmp     %r13,  %r11
2606        jne     _get_last_16_byte_loop\@
2607
2608        vmovdqu  TMP1(%rsp), %xmm1
2609
2610        sub     $16, %r11
2611
2612_final_ghash_mul\@:
2613        .if  \ENC_DEC ==  DEC
2614        vmovdqa %xmm1, %xmm2
2615        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2616        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2617        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2618        vpand   %xmm1, %xmm2, %xmm2
2619        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2620        vpxor   %xmm2, %xmm14, %xmm14
2621        #GHASH computation for the last <16 Byte block
2622        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2623        sub     %r13, %r11
2624        add     $16, %r11
2625        .else
2626        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2627        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2628        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2629        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2630        vpxor   %xmm9, %xmm14, %xmm14
2631        #GHASH computation for the last <16 Byte block
2632        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2633        sub     %r13, %r11
2634        add     $16, %r11
2635        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2636        .endif
2637
2638
2639        #############################
2640        # output r13 Bytes
2641        vmovq   %xmm9, %rax
2642        cmp     $8, %r13
2643        jle     _less_than_8_bytes_left\@
2644
2645        mov     %rax, (arg2 , %r11)
2646        add     $8, %r11
2647        vpsrldq $8, %xmm9, %xmm9
2648        vmovq   %xmm9, %rax
2649        sub     $8, %r13
2650
2651_less_than_8_bytes_left\@:
2652        movb    %al, (arg2 , %r11)
2653        add     $1, %r11
2654        shr     $8, %rax
2655        sub     $1, %r13
2656        jne     _less_than_8_bytes_left\@
2657        #############################
2658
2659_multiple_of_16_bytes\@:
2660        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2661        shl     $3, %r12                             # convert into number of bits
2662        vmovd   %r12d, %xmm15                        # len(A) in xmm15
2663
2664        shl     $3, arg4                             # len(C) in bits  (*128)
2665        vmovq   arg4, %xmm1
2666        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2667        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
2668
2669        vpxor   %xmm15, %xmm14, %xmm14
2670        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2671        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2672
2673        mov     arg5, %rax                           # rax = *Y0
2674        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2675
2676        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2677
2678        vpxor   %xmm14, %xmm9, %xmm9
2679
2680
2681
2682_return_T\@:
2683        mov     arg8, %r10              # r10 = authTag
2684        mov     arg9, %r11              # r11 = auth_tag_len
2685
2686        cmp     $16, %r11
2687        je      _T_16\@
2688
2689        cmp     $12, %r11
2690        je      _T_12\@
2691
2692_T_8\@:
2693        vmovq   %xmm9, %rax
2694        mov     %rax, (%r10)
2695        jmp     _return_T_done\@
2696_T_12\@:
2697        vmovq   %xmm9, %rax
2698        mov     %rax, (%r10)
2699        vpsrldq $8, %xmm9, %xmm9
2700        vmovd   %xmm9, %eax
2701        mov     %eax, 8(%r10)
2702        jmp     _return_T_done\@
2703
2704_T_16\@:
2705        vmovdqu %xmm9, (%r10)
2706
2707_return_T_done\@:
2708        mov     %r14, %rsp
2709
2710        pop     %r15
2711        pop     %r14
2712        pop     %r13
2713        pop     %r12
2714.endm
2715
2716
2717#############################################################
2718#void   aesni_gcm_precomp_avx_gen4
2719#        (gcm_data     *my_ctx_data,
2720#        u8     *hash_subkey)# /* H, the Hash sub key input.
2721#                               Data starts on a 16-byte boundary. */
2722#############################################################
2723ENTRY(aesni_gcm_precomp_avx_gen4)
2724        #the number of pushes must equal STACK_OFFSET
2725        push    %r12
2726        push    %r13
2727        push    %r14
2728        push    %r15
2729
2730        mov     %rsp, %r14
2731
2732
2733
2734        sub     $VARIABLE_OFFSET, %rsp
2735        and     $~63, %rsp                    # align rsp to 64 bytes
2736
2737        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
2738
2739        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
2740        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2741        vmovdqa  %xmm6, %xmm2
2742        vpsllq   $1, %xmm6, %xmm6
2743        vpsrlq   $63, %xmm2, %xmm2
2744        vmovdqa  %xmm2, %xmm1
2745        vpslldq  $8, %xmm2, %xmm2
2746        vpsrldq  $8, %xmm1, %xmm1
2747        vpor     %xmm2, %xmm6, %xmm6
2748        #reduction
2749        vpshufd  $0b00100100, %xmm1, %xmm2
2750        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2751        vpand    POLY(%rip), %xmm2, %xmm2
2752        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
2753        #######################################################################
2754        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
2755
2756
2757        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2758
2759        mov     %r14, %rsp
2760
2761        pop     %r15
2762        pop     %r14
2763        pop     %r13
2764        pop     %r12
2765        ret
2766ENDPROC(aesni_gcm_precomp_avx_gen4)
2767
2768
2769###############################################################################
2770#void   aesni_gcm_enc_avx_gen4(
2771#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2772#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2773#        const   u8 *in, /* Plaintext input */
2774#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2775#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2776#                       (from Security Association) concatenated with 8 byte
2777#                        Initialisation Vector (from IPSec ESP Payload)
2778#                        concatenated with 0x00000001. 16-byte aligned pointer. */
2779#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2780#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2781#        u8      *auth_tag, /* Authenticated Tag output. */
2782#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2783#                               Valid values are 16 (most likely), 12 or 8. */
2784###############################################################################
2785ENTRY(aesni_gcm_enc_avx_gen4)
2786        GCM_ENC_DEC_AVX2     ENC
2787        ret
2788ENDPROC(aesni_gcm_enc_avx_gen4)
2789
2790###############################################################################
2791#void   aesni_gcm_dec_avx_gen4(
2792#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2793#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2794#        const   u8 *in, /* Ciphertext input */
2795#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2796#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2797#                       (from Security Association) concatenated with 8 byte
2798#                       Initialisation Vector (from IPSec ESP Payload)
2799#                       concatenated with 0x00000001. 16-byte aligned pointer. */
2800#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2801#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2802#        u8      *auth_tag, /* Authenticated Tag output. */
2803#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2804#                               Valid values are 16 (most likely), 12 or 8. */
2805###############################################################################
2806ENTRY(aesni_gcm_dec_avx_gen4)
2807        GCM_ENC_DEC_AVX2     DEC
2808        ret
2809ENDPROC(aesni_gcm_dec_avx_gen4)
2810
2811#endif /* CONFIG_AS_AVX2 */
2812