linux/arch/x86/crypto/aesni-intel_avx-x86_64.S
   1########################################################################
   2# Copyright (c) 2013, Intel Corporation
   3#
   4# This software is available to you under a choice of one of two
   5# licenses.  You may choose to be licensed under the terms of the GNU
   6# General Public License (GPL) Version 2, available from the file
   7# COPYING in the main directory of this source tree, or the
   8# OpenIB.org BSD license below:
   9#
  10# Redistribution and use in source and binary forms, with or without
  11# modification, are permitted provided that the following conditions are
  12# met:
  13#
  14# * Redistributions of source code must retain the above copyright
  15#   notice, this list of conditions and the following disclaimer.
  16#
  17# * Redistributions in binary form must reproduce the above copyright
  18#   notice, this list of conditions and the following disclaimer in the
  19#   documentation and/or other materials provided with the
  20#   distribution.
  21#
  22# * Neither the name of the Intel Corporation nor the names of its
  23#   contributors may be used to endorse or promote products derived from
  24#   this software without specific prior written permission.
  25#
  26#
  27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38########################################################################
  39##
  40## Authors:
  41##      Erdinc Ozturk <erdinc.ozturk@intel.com>
  42##      Vinodh Gopal <vinodh.gopal@intel.com>
  43##      James Guilford <james.guilford@intel.com>
  44##      Tim Chen <tim.c.chen@linux.intel.com>
  45##
  46## References:
   47##       This code was derived and highly optimized from the code described in the paper:
  48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
  49##                      on Intel Architecture Processors. August, 2010
   50##       The details of the implementation are explained in:
  51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
  52##                      on Intel Architecture Processors. October, 2012.
  53##
  54## Assumptions:
  55##
  56##
  57##
  58## iv:
  59##       0                   1                   2                   3
  60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62##       |                             Salt  (From the SA)               |
  63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64##       |                     Initialization Vector                     |
  65##       |         (This is the sequence number from IPSec header)       |
  66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67##       |                              0x1                              |
  68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69##
  70##
  71##
  72## AAD:
  73##       AAD padded to 128 bits with 0
  74##       for example, assume AAD is a u32 vector
  75##
  76##       if AAD is 8 bytes:
   77##       AAD[2] = {A0, A1}#
  78##       padded AAD in xmm register = {A1 A0 0 0}
  79##
  80##       0                   1                   2                   3
  81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83##       |                               SPI (A1)                        |
  84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85##       |                     32-bit Sequence Number (A0)               |
  86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87##       |                              0x0                              |
  88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89##
  90##                                       AAD Format with 32-bit Sequence Number
  91##
  92##       if AAD is 12 bytes:
  93##       AAD[3] = {A0, A1, A2}#
  94##       padded AAD in xmm register = {A2 A1 A0 0}
  95##
  96##       0                   1                   2                   3
  97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99##       |                               SPI (A2)                        |
 100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 101##       |                 64-bit Extended Sequence Number {A1,A0}       |
 102##       |                                                               |
 103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 104##       |                              0x0                              |
 105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 106##
 107##        AAD Format with 64-bit Extended Sequence Number
 108##
 109##
 110## aadLen:
 111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
 112##       The code additionally supports aadLen of length 16 bytes.
 113##
 114## TLen:
 115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 116##
 117## poly = x^128 + x^127 + x^126 + x^121 + 1
  118## throughout the code, one-tab and two-tab indentations are used. one tab is
  119## used for the GHASH part, two tabs for the AES part.
 120##
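     ##
     ## GHASH recap (an informal sketch, not a normative statement): with hash key
     ## H = E(K, 0^128), the authentication state is updated per 128-bit block B_i as
     ##       X_0 = 0,   X_i = (X_{i-1} xor B_i) * H    (carry-less, mod the poly above)
     ## first over the padded AAD blocks, then over the ciphertext blocks, and finally
     ## over the block encoding len(A) || len(C).
     ##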
 121
 122#include <linux/linkage.h>
 123#include <asm/inst.h>
 124
 125.data
 126.align 16
 127
 128POLY:            .octa     0xC2000000000000000000000000000001
 129POLY2:           .octa     0xC20000000000000000000001C2000000
 130TWOONE:          .octa     0x00000001000000000000000000000001
 131
 132# order of these constants should not change.
 133# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
 134
 135SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
 136SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
 137ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
 138ZERO:            .octa     0x00000000000000000000000000000000
 139ONE:             .octa     0x00000000000000000000000000000001
 140ONEf:            .octa     0x01000000000000000000000000000000
 141
 142.section .rodata
 143.align 16
 144.type aad_shift_arr, @object
 145.size aad_shift_arr, 272
 146aad_shift_arr:
 147        .octa     0xffffffffffffffffffffffffffffffff
 148        .octa     0xffffffffffffffffffffffffffffff0C
 149        .octa     0xffffffffffffffffffffffffffff0D0C
 150        .octa     0xffffffffffffffffffffffffff0E0D0C
 151        .octa     0xffffffffffffffffffffffff0F0E0D0C
 152        .octa     0xffffffffffffffffffffff0C0B0A0908
 153        .octa     0xffffffffffffffffffff0D0C0B0A0908
 154        .octa     0xffffffffffffffffff0E0D0C0B0A0908
 155        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
 156        .octa     0xffffffffffffff0C0B0A090807060504
 157        .octa     0xffffffffffff0D0C0B0A090807060504
 158        .octa     0xffffffffff0E0D0C0B0A090807060504
 159        .octa     0xffffffff0F0E0D0C0B0A090807060504
 160        .octa     0xffffff0C0B0A09080706050403020100
 161        .octa     0xffff0D0C0B0A09080706050403020100
 162        .octa     0xff0E0D0C0B0A09080706050403020100
 163        .octa     0x0F0E0D0C0B0A09080706050403020100
 164
 165
 166.text
 167
 168
 169##define the fields of the gcm aes context
 170#{
 171#        u8 expanded_keys[16*11] store expanded keys
 172#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
 173#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
 174#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
 175#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
 176#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
 177#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
 178#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
 179#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
 180#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
 181#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 182#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 183#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 184#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 185#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 186#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 187#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 188#} gcm_ctx#
 189
 190HashKey        = 16*11   # store HashKey <<1 mod poly here
 191HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
 192HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
 193HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
 194HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
 195HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
 196HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
 197HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
 198HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
 199HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 200HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 201HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 202HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 203HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 204HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 205HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 206
 207#define arg1 %rdi
 208#define arg2 %rsi
 209#define arg3 %rdx
 210#define arg4 %rcx
 211#define arg5 %r8
 212#define arg6 %r9
 213#define arg7 STACK_OFFSET+8*1(%r14)
 214#define arg8 STACK_OFFSET+8*2(%r14)
 215#define arg9 STACK_OFFSET+8*3(%r14)
 216
 217i = 0
 218j = 0
 219
 220out_order = 0
 221in_order = 1
 222DEC = 0
 223ENC = 1
 224
 225.macro define_reg r n
 226reg_\r = %xmm\n
 227.endm
 228
 229.macro setreg
 230.altmacro
 231define_reg i %i
 232define_reg j %j
 233.noaltmacro
 234.endm
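     # example (a sketch of how the two macros above expand): with i = 3 and j = 4,
     # "setreg" evaluates, via .altmacro, to
     #       define_reg i 3          ->      reg_i = %xmm3
     #       define_reg j 4          ->      reg_j = %xmm4
     # so reg_i/reg_j act as rotating xmm register names inside the .rep loops below.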
 235
  236# need to push 4 registers onto the stack; STACK_OFFSET accounts for them
 237STACK_OFFSET = 8*4
 238
 239TMP1 =   16*0    # Temporary storage for AAD
 240TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
 241TMP3 =   16*2    # Temporary storage for AES State 3
 242TMP4 =   16*3    # Temporary storage for AES State 4
 243TMP5 =   16*4    # Temporary storage for AES State 5
 244TMP6 =   16*5    # Temporary storage for AES State 6
 245TMP7 =   16*6    # Temporary storage for AES State 7
 246TMP8 =   16*7    # Temporary storage for AES State 8
 247
 248VARIABLE_OFFSET = 16*8
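     # resulting scratch layout on the 64-byte-aligned stack frame set up in
     # GCM_ENC_DEC_AVX (a sketch):
     #       TMP1 .. TMP8 = eight 16-byte slots at 16*0(%rsp) .. 16*7(%rsp)
     #       VARIABLE_OFFSET = 16*8 bytes reserved in total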
 249
 250################################
 251# Utility Macros
 252################################
 253
 254# Encryption of a single block
 255.macro ENCRYPT_SINGLE_BLOCK XMM0
 256                vpxor    (arg1), \XMM0, \XMM0
 257                i = 1
 258                setreg
 259.rep 9
 260                vaesenc  16*i(arg1), \XMM0, \XMM0
 261                i = (i+1)
 262                setreg
 263.endr
 264                vaesenclast 16*10(arg1), \XMM0, \XMM0
 265.endm
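     # Roughly equivalent C-style sketch of the macro above (assumes an AES-128 key
     # schedule of 11 round keys stored contiguously at arg1; aes_round/aes_last_round
     # are illustrative names for the AESENC/AESENCLAST transforms, not real helpers):
     #
     #       block ^= round_key[0];                  /* initial whitening */
     #       for (r = 1; r <= 9; r++)
     #               block = aes_round(block, round_key[r]);
     #       block = aes_last_round(block, round_key[10]);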
 266
 267#ifdef CONFIG_AS_AVX
 268###############################################################################
 269# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 270# Input: A and B (128-bits each, bit-reflected)
 271# Output: C = A*B*x mod poly, (i.e. >>1 )
 272# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 273# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 274###############################################################################
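     # The body below uses one level of Karatsuba (an informal sketch): writing
     # GH = a1:a0 and HK = b1:b0 as 64-bit halves, three VPCLMULQDQs produce
     #       a1*b1,  a0*b0,  (a1 xor a0)*(b1 xor b0)
     # and the middle 128-bit term is recovered as
     #       (a1 xor a0)*(b1 xor b0) xor a1*b1 xor a0*b0  =  a1*b0 xor a0*b1
     # The 256-bit product <T1:GH> is then folded back to 128 bits in the two
     # "phase of the reduction" steps, using shifts and xors instead of a fourth
     # carry-less multiply.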
 275.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 276
 277        vpshufd         $0b01001110, \GH, \T2
 278        vpshufd         $0b01001110, \HK, \T3
 279        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
 280        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 281
 282        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
 283        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
 284        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
 285        vpxor           \GH, \T2,\T2
 286        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 287
 288        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
 289        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
 290        vpxor           \T3, \GH, \GH
 291        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 292
 293        #first phase of the reduction
 294        vpslld  $31, \GH, \T2                   # packed right shifting << 31
  295        vpslld  $30, \GH, \T3                   # packed right shifting << 30
  296        vpslld  $25, \GH, \T4                   # packed right shifting << 25
 297
 298        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 299        vpxor   \T4, \T2, \T2
 300
 301        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 302
 303        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
 304        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 305
 306        #second phase of the reduction
 307
 308        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
 309        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
 310        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
 311        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 312        vpxor   \T4, \T2, \T2
 313
 314        vpxor   \T5, \T2, \T2
 315        vpxor   \T2, \GH, \GH
 316        vpxor   \T1, \GH, \GH                   # the result is in GH
 317
 318
 319.endm
 320
 321.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 322
  323        # HashKey_i_k holds the XOR of the low and high parts of HashKey_i
 324        vmovdqa  \HK, \T5
 325
 326        vpshufd  $0b01001110, \T5, \T1
 327        vpxor    \T5, \T1, \T1
 328        vmovdqa  \T1, HashKey_k(arg1)
 329
 330        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
 331        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
 332        vpshufd  $0b01001110, \T5, \T1
 333        vpxor    \T5, \T1, \T1
 334        vmovdqa  \T1, HashKey_2_k(arg1)
 335
 336        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
 337        vmovdqa  \T5, HashKey_3(arg1)
 338        vpshufd  $0b01001110, \T5, \T1
 339        vpxor    \T5, \T1, \T1
 340        vmovdqa  \T1, HashKey_3_k(arg1)
 341
 342        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
 343        vmovdqa  \T5, HashKey_4(arg1)
 344        vpshufd  $0b01001110, \T5, \T1
 345        vpxor    \T5, \T1, \T1
 346        vmovdqa  \T1, HashKey_4_k(arg1)
 347
 348        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
 349        vmovdqa  \T5, HashKey_5(arg1)
 350        vpshufd  $0b01001110, \T5, \T1
 351        vpxor    \T5, \T1, \T1
 352        vmovdqa  \T1, HashKey_5_k(arg1)
 353
 354        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
 355        vmovdqa  \T5, HashKey_6(arg1)
 356        vpshufd  $0b01001110, \T5, \T1
 357        vpxor    \T5, \T1, \T1
 358        vmovdqa  \T1, HashKey_6_k(arg1)
 359
 360        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
 361        vmovdqa  \T5, HashKey_7(arg1)
 362        vpshufd  $0b01001110, \T5, \T1
 363        vpxor    \T5, \T1, \T1
 364        vmovdqa  \T1, HashKey_7_k(arg1)
 365
 366        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
 367        vmovdqa  \T5, HashKey_8(arg1)
 368        vpshufd  $0b01001110, \T5, \T1
 369        vpxor    \T5, \T1, \T1
 370        vmovdqa  \T1, HashKey_8_k(arg1)
 371
 372.endm
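     # After PRECOMPUTE_AVX the context at arg1 holds, as a sketch:
     #       HashKey_i   = HashKey^i << 1 mod poly                         for i = 1..8
     #       HashKey_i_k = high 64 bits of HashKey_i xor its low 64 bits   (Karatsuba helper)
     # which is what lets GHASH_8_ENCRYPT_8_PARALLEL_AVX below hash eight blocks per pass.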
 373
 374## if a = number of total plaintext bytes
 375## b = floor(a/16)
  376## num_initial_blocks = b mod 8
 377## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 378## r10, r11, r12, rax are clobbered
  379## arg1, arg2, arg3, r14 are used as pointers only, not modified
 380
 381.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 382        i = (8-\num_initial_blocks)
 383        j = 0
 384        setreg
 385
 386        mov     arg6, %r10                      # r10 = AAD
 387        mov     arg7, %r12                      # r12 = aadLen
 388
 389
 390        mov     %r12, %r11
 391
 392        vpxor   reg_j, reg_j, reg_j
 393        vpxor   reg_i, reg_i, reg_i
 394        cmp     $16, %r11
 395        jl      _get_AAD_rest8\@
 396_get_AAD_blocks\@:
 397        vmovdqu (%r10), reg_i
 398        vpshufb SHUF_MASK(%rip), reg_i, reg_i
 399        vpxor   reg_i, reg_j, reg_j
 400        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
 401        add     $16, %r10
 402        sub     $16, %r12
 403        sub     $16, %r11
 404        cmp     $16, %r11
 405        jge     _get_AAD_blocks\@
 406        vmovdqu reg_j, reg_i
 407        cmp     $0, %r11
 408        je      _get_AAD_done\@
 409
 410        vpxor   reg_i, reg_i, reg_i
 411
 412        /* read the last <16B of AAD. since we have at least 4B of
 413        data right after the AAD (the ICV, and maybe some CT), we can
 414        read 4B/8B blocks safely, and then get rid of the extra stuff */
 415_get_AAD_rest8\@:
 416        cmp     $4, %r11
 417        jle     _get_AAD_rest4\@
 418        movq    (%r10), \T1
 419        add     $8, %r10
 420        sub     $8, %r11
 421        vpslldq $8, \T1, \T1
 422        vpsrldq $8, reg_i, reg_i
 423        vpxor   \T1, reg_i, reg_i
 424        jmp     _get_AAD_rest8\@
 425_get_AAD_rest4\@:
 426        cmp     $0, %r11
 427        jle      _get_AAD_rest0\@
 428        mov     (%r10), %eax
 429        movq    %rax, \T1
 430        add     $4, %r10
 431        sub     $4, %r11
 432        vpslldq $12, \T1, \T1
 433        vpsrldq $4, reg_i, reg_i
 434        vpxor   \T1, reg_i, reg_i
 435_get_AAD_rest0\@:
 436        /* finalize: shift out the extra bytes we read, and align
 437        left. since pslldq can only shift by an immediate, we use
 438        vpshufb and an array of shuffle masks */
 439        movq    %r12, %r11
 440        salq    $4, %r11
 441        movdqu  aad_shift_arr(%r11), \T1
 442        vpshufb \T1, reg_i, reg_i
 443_get_AAD_rest_final\@:
 444        vpshufb SHUF_MASK(%rip), reg_i, reg_i
 445        vpxor   reg_j, reg_i, reg_i
 446        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 447
 448_get_AAD_done\@:
 449        # initialize the data pointer offset as zero
 450        xor     %r11, %r11
 451
 452        # start AES for num_initial_blocks blocks
 453        mov     arg5, %rax                     # rax = *Y0
 454        vmovdqu (%rax), \CTR                   # CTR = Y0
 455        vpshufb SHUF_MASK(%rip), \CTR, \CTR
 456
 457
 458        i = (9-\num_initial_blocks)
 459        setreg
 460.rep \num_initial_blocks
 461                vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
 462                vmovdqa \CTR, reg_i
 463                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
 464        i = (i+1)
 465        setreg
 466.endr
 467
 468        vmovdqa  (arg1), \T_key
 469        i = (9-\num_initial_blocks)
 470        setreg
 471.rep \num_initial_blocks
 472                vpxor   \T_key, reg_i, reg_i
 473        i = (i+1)
 474        setreg
 475.endr
 476
 477        j = 1
 478        setreg
 479.rep 9
 480        vmovdqa  16*j(arg1), \T_key
 481        i = (9-\num_initial_blocks)
 482        setreg
 483.rep \num_initial_blocks
 484        vaesenc \T_key, reg_i, reg_i
 485        i = (i+1)
 486        setreg
 487.endr
 488
 489        j = (j+1)
 490        setreg
 491.endr
 492
 493
 494        vmovdqa  16*10(arg1), \T_key
 495        i = (9-\num_initial_blocks)
 496        setreg
 497.rep \num_initial_blocks
 498        vaesenclast      \T_key, reg_i, reg_i
 499        i = (i+1)
 500        setreg
 501.endr
 502
 503        i = (9-\num_initial_blocks)
 504        setreg
 505.rep \num_initial_blocks
 506                vmovdqu (arg3, %r11), \T1
 507                vpxor   \T1, reg_i, reg_i
 508                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
 509                add     $16, %r11
 510.if  \ENC_DEC == DEC
 511                vmovdqa \T1, reg_i
 512.endif
 513                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
 514        i = (i+1)
 515        setreg
 516.endr
 517
 518
 519        i = (8-\num_initial_blocks)
 520        j = (9-\num_initial_blocks)
 521        setreg
 522
 523.rep \num_initial_blocks
 524        vpxor    reg_i, reg_j, reg_j
 525        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
 526        i = (i+1)
 527        j = (j+1)
 528        setreg
 529.endr
 530        # XMM8 has the combined result here
 531
 532        vmovdqa  \XMM8, TMP1(%rsp)
 533        vmovdqa  \XMM8, \T3
 534
 535        cmp     $128, %r13
 536        jl      _initial_blocks_done\@                  # no need for precomputed constants
 537
 538###############################################################################
  539# HashKey_i_k holds the XOR of the low and high parts of HashKey_i
 540                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 541                vmovdqa  \CTR, \XMM1
 542                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 543
 544                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 545                vmovdqa  \CTR, \XMM2
 546                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
 547
 548                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 549                vmovdqa  \CTR, \XMM3
 550                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
 551
 552                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 553                vmovdqa  \CTR, \XMM4
 554                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
 555
 556                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 557                vmovdqa  \CTR, \XMM5
 558                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
 559
 560                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 561                vmovdqa  \CTR, \XMM6
 562                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
 563
 564                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 565                vmovdqa  \CTR, \XMM7
 566                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
 567
 568                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 569                vmovdqa  \CTR, \XMM8
 570                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 571
 572                vmovdqa  (arg1), \T_key
 573                vpxor    \T_key, \XMM1, \XMM1
 574                vpxor    \T_key, \XMM2, \XMM2
 575                vpxor    \T_key, \XMM3, \XMM3
 576                vpxor    \T_key, \XMM4, \XMM4
 577                vpxor    \T_key, \XMM5, \XMM5
 578                vpxor    \T_key, \XMM6, \XMM6
 579                vpxor    \T_key, \XMM7, \XMM7
 580                vpxor    \T_key, \XMM8, \XMM8
 581
 582                i = 1
 583                setreg
 584.rep    9       # do 9 rounds
 585                vmovdqa  16*i(arg1), \T_key
 586                vaesenc  \T_key, \XMM1, \XMM1
 587                vaesenc  \T_key, \XMM2, \XMM2
 588                vaesenc  \T_key, \XMM3, \XMM3
 589                vaesenc  \T_key, \XMM4, \XMM4
 590                vaesenc  \T_key, \XMM5, \XMM5
 591                vaesenc  \T_key, \XMM6, \XMM6
 592                vaesenc  \T_key, \XMM7, \XMM7
 593                vaesenc  \T_key, \XMM8, \XMM8
 594                i = (i+1)
 595                setreg
 596.endr
 597
 598
 599                vmovdqa  16*i(arg1), \T_key
 600                vaesenclast  \T_key, \XMM1, \XMM1
 601                vaesenclast  \T_key, \XMM2, \XMM2
 602                vaesenclast  \T_key, \XMM3, \XMM3
 603                vaesenclast  \T_key, \XMM4, \XMM4
 604                vaesenclast  \T_key, \XMM5, \XMM5
 605                vaesenclast  \T_key, \XMM6, \XMM6
 606                vaesenclast  \T_key, \XMM7, \XMM7
 607                vaesenclast  \T_key, \XMM8, \XMM8
 608
 609                vmovdqu  (arg3, %r11), \T1
 610                vpxor    \T1, \XMM1, \XMM1
 611                vmovdqu  \XMM1, (arg2 , %r11)
 612                .if   \ENC_DEC == DEC
 613                vmovdqa  \T1, \XMM1
 614                .endif
 615
 616                vmovdqu  16*1(arg3, %r11), \T1
 617                vpxor    \T1, \XMM2, \XMM2
 618                vmovdqu  \XMM2, 16*1(arg2 , %r11)
 619                .if   \ENC_DEC == DEC
 620                vmovdqa  \T1, \XMM2
 621                .endif
 622
 623                vmovdqu  16*2(arg3, %r11), \T1
 624                vpxor    \T1, \XMM3, \XMM3
 625                vmovdqu  \XMM3, 16*2(arg2 , %r11)
 626                .if   \ENC_DEC == DEC
 627                vmovdqa  \T1, \XMM3
 628                .endif
 629
 630                vmovdqu  16*3(arg3, %r11), \T1
 631                vpxor    \T1, \XMM4, \XMM4
 632                vmovdqu  \XMM4, 16*3(arg2 , %r11)
 633                .if   \ENC_DEC == DEC
 634                vmovdqa  \T1, \XMM4
 635                .endif
 636
 637                vmovdqu  16*4(arg3, %r11), \T1
 638                vpxor    \T1, \XMM5, \XMM5
 639                vmovdqu  \XMM5, 16*4(arg2 , %r11)
 640                .if   \ENC_DEC == DEC
 641                vmovdqa  \T1, \XMM5
 642                .endif
 643
 644                vmovdqu  16*5(arg3, %r11), \T1
 645                vpxor    \T1, \XMM6, \XMM6
 646                vmovdqu  \XMM6, 16*5(arg2 , %r11)
 647                .if   \ENC_DEC == DEC
 648                vmovdqa  \T1, \XMM6
 649                .endif
 650
 651                vmovdqu  16*6(arg3, %r11), \T1
 652                vpxor    \T1, \XMM7, \XMM7
 653                vmovdqu  \XMM7, 16*6(arg2 , %r11)
 654                .if   \ENC_DEC == DEC
 655                vmovdqa  \T1, \XMM7
 656                .endif
 657
 658                vmovdqu  16*7(arg3, %r11), \T1
 659                vpxor    \T1, \XMM8, \XMM8
 660                vmovdqu  \XMM8, 16*7(arg2 , %r11)
 661                .if   \ENC_DEC == DEC
 662                vmovdqa  \T1, \XMM8
 663                .endif
 664
 665                add     $128, %r11
 666
 667                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
 668                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
 669                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
 670                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
 671                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
 672                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
 673                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
 674                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
 675                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
 676
 677###############################################################################
 678
 679_initial_blocks_done\@:
 680
 681.endm
 682
 683# encrypt 8 blocks at a time
 684# ghash the 8 previously encrypted ciphertext blocks
 685# arg1, arg2, arg3 are used as pointers only, not modified
 686# r11 is the data offset value
 687.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 688
 689        vmovdqa \XMM1, \T2
 690        vmovdqa \XMM2, TMP2(%rsp)
 691        vmovdqa \XMM3, TMP3(%rsp)
 692        vmovdqa \XMM4, TMP4(%rsp)
 693        vmovdqa \XMM5, TMP5(%rsp)
 694        vmovdqa \XMM6, TMP6(%rsp)
 695        vmovdqa \XMM7, TMP7(%rsp)
 696        vmovdqa \XMM8, TMP8(%rsp)
 697
 698.if \loop_idx == in_order
 699                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
 700                vpaddd  ONE(%rip), \XMM1, \XMM2
 701                vpaddd  ONE(%rip), \XMM2, \XMM3
 702                vpaddd  ONE(%rip), \XMM3, \XMM4
 703                vpaddd  ONE(%rip), \XMM4, \XMM5
 704                vpaddd  ONE(%rip), \XMM5, \XMM6
 705                vpaddd  ONE(%rip), \XMM6, \XMM7
 706                vpaddd  ONE(%rip), \XMM7, \XMM8
 707                vmovdqa \XMM8, \CTR
 708
 709                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
 710                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
 711                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
 712                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
 713                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
 714                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
 715                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
 716                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
 717.else
 718                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
 719                vpaddd  ONEf(%rip), \XMM1, \XMM2
 720                vpaddd  ONEf(%rip), \XMM2, \XMM3
 721                vpaddd  ONEf(%rip), \XMM3, \XMM4
 722                vpaddd  ONEf(%rip), \XMM4, \XMM5
 723                vpaddd  ONEf(%rip), \XMM5, \XMM6
 724                vpaddd  ONEf(%rip), \XMM6, \XMM7
 725                vpaddd  ONEf(%rip), \XMM7, \XMM8
 726                vmovdqa \XMM8, \CTR
 727.endif
 728
 729
 730        #######################################################################
 731
 732                vmovdqu (arg1), \T1
 733                vpxor   \T1, \XMM1, \XMM1
 734                vpxor   \T1, \XMM2, \XMM2
 735                vpxor   \T1, \XMM3, \XMM3
 736                vpxor   \T1, \XMM4, \XMM4
 737                vpxor   \T1, \XMM5, \XMM5
 738                vpxor   \T1, \XMM6, \XMM6
 739                vpxor   \T1, \XMM7, \XMM7
 740                vpxor   \T1, \XMM8, \XMM8
 741
 742        #######################################################################
 743
 744
 745
 746
 747
 748                vmovdqu 16*1(arg1), \T1
 749                vaesenc \T1, \XMM1, \XMM1
 750                vaesenc \T1, \XMM2, \XMM2
 751                vaesenc \T1, \XMM3, \XMM3
 752                vaesenc \T1, \XMM4, \XMM4
 753                vaesenc \T1, \XMM5, \XMM5
 754                vaesenc \T1, \XMM6, \XMM6
 755                vaesenc \T1, \XMM7, \XMM7
 756                vaesenc \T1, \XMM8, \XMM8
 757
 758                vmovdqu 16*2(arg1), \T1
 759                vaesenc \T1, \XMM1, \XMM1
 760                vaesenc \T1, \XMM2, \XMM2
 761                vaesenc \T1, \XMM3, \XMM3
 762                vaesenc \T1, \XMM4, \XMM4
 763                vaesenc \T1, \XMM5, \XMM5
 764                vaesenc \T1, \XMM6, \XMM6
 765                vaesenc \T1, \XMM7, \XMM7
 766                vaesenc \T1, \XMM8, \XMM8
 767
 768
 769        #######################################################################
 770
 771        vmovdqa         HashKey_8(arg1), \T5
 772        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
 773        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 774
 775        vpshufd         $0b01001110, \T2, \T6
 776        vpxor           \T2, \T6, \T6
 777
 778        vmovdqa         HashKey_8_k(arg1), \T5
 779        vpclmulqdq      $0x00, \T5, \T6, \T6
 780
 781                vmovdqu 16*3(arg1), \T1
 782                vaesenc \T1, \XMM1, \XMM1
 783                vaesenc \T1, \XMM2, \XMM2
 784                vaesenc \T1, \XMM3, \XMM3
 785                vaesenc \T1, \XMM4, \XMM4
 786                vaesenc \T1, \XMM5, \XMM5
 787                vaesenc \T1, \XMM6, \XMM6
 788                vaesenc \T1, \XMM7, \XMM7
 789                vaesenc \T1, \XMM8, \XMM8
 790
 791        vmovdqa         TMP2(%rsp), \T1
 792        vmovdqa         HashKey_7(arg1), \T5
 793        vpclmulqdq      $0x11, \T5, \T1, \T3
 794        vpxor           \T3, \T4, \T4
 795        vpclmulqdq      $0x00, \T5, \T1, \T3
 796        vpxor           \T3, \T7, \T7
 797
 798        vpshufd         $0b01001110, \T1, \T3
 799        vpxor           \T1, \T3, \T3
 800        vmovdqa         HashKey_7_k(arg1), \T5
 801        vpclmulqdq      $0x10, \T5, \T3, \T3
 802        vpxor           \T3, \T6, \T6
 803
 804                vmovdqu 16*4(arg1), \T1
 805                vaesenc \T1, \XMM1, \XMM1
 806                vaesenc \T1, \XMM2, \XMM2
 807                vaesenc \T1, \XMM3, \XMM3
 808                vaesenc \T1, \XMM4, \XMM4
 809                vaesenc \T1, \XMM5, \XMM5
 810                vaesenc \T1, \XMM6, \XMM6
 811                vaesenc \T1, \XMM7, \XMM7
 812                vaesenc \T1, \XMM8, \XMM8
 813
 814        #######################################################################
 815
 816        vmovdqa         TMP3(%rsp), \T1
 817        vmovdqa         HashKey_6(arg1), \T5
 818        vpclmulqdq      $0x11, \T5, \T1, \T3
 819        vpxor           \T3, \T4, \T4
 820        vpclmulqdq      $0x00, \T5, \T1, \T3
 821        vpxor           \T3, \T7, \T7
 822
 823        vpshufd         $0b01001110, \T1, \T3
 824        vpxor           \T1, \T3, \T3
 825        vmovdqa         HashKey_6_k(arg1), \T5
 826        vpclmulqdq      $0x10, \T5, \T3, \T3
 827        vpxor           \T3, \T6, \T6
 828
 829                vmovdqu 16*5(arg1), \T1
 830                vaesenc \T1, \XMM1, \XMM1
 831                vaesenc \T1, \XMM2, \XMM2
 832                vaesenc \T1, \XMM3, \XMM3
 833                vaesenc \T1, \XMM4, \XMM4
 834                vaesenc \T1, \XMM5, \XMM5
 835                vaesenc \T1, \XMM6, \XMM6
 836                vaesenc \T1, \XMM7, \XMM7
 837                vaesenc \T1, \XMM8, \XMM8
 838
 839        vmovdqa         TMP4(%rsp), \T1
 840        vmovdqa         HashKey_5(arg1), \T5
 841        vpclmulqdq      $0x11, \T5, \T1, \T3
 842        vpxor           \T3, \T4, \T4
 843        vpclmulqdq      $0x00, \T5, \T1, \T3
 844        vpxor           \T3, \T7, \T7
 845
 846        vpshufd         $0b01001110, \T1, \T3
 847        vpxor           \T1, \T3, \T3
 848        vmovdqa         HashKey_5_k(arg1), \T5
 849        vpclmulqdq      $0x10, \T5, \T3, \T3
 850        vpxor           \T3, \T6, \T6
 851
 852                vmovdqu 16*6(arg1), \T1
 853                vaesenc \T1, \XMM1, \XMM1
 854                vaesenc \T1, \XMM2, \XMM2
 855                vaesenc \T1, \XMM3, \XMM3
 856                vaesenc \T1, \XMM4, \XMM4
 857                vaesenc \T1, \XMM5, \XMM5
 858                vaesenc \T1, \XMM6, \XMM6
 859                vaesenc \T1, \XMM7, \XMM7
 860                vaesenc \T1, \XMM8, \XMM8
 861
 862
 863        vmovdqa         TMP5(%rsp), \T1
 864        vmovdqa         HashKey_4(arg1), \T5
 865        vpclmulqdq      $0x11, \T5, \T1, \T3
 866        vpxor           \T3, \T4, \T4
 867        vpclmulqdq      $0x00, \T5, \T1, \T3
 868        vpxor           \T3, \T7, \T7
 869
 870        vpshufd         $0b01001110, \T1, \T3
 871        vpxor           \T1, \T3, \T3
 872        vmovdqa         HashKey_4_k(arg1), \T5
 873        vpclmulqdq      $0x10, \T5, \T3, \T3
 874        vpxor           \T3, \T6, \T6
 875
 876                vmovdqu 16*7(arg1), \T1
 877                vaesenc \T1, \XMM1, \XMM1
 878                vaesenc \T1, \XMM2, \XMM2
 879                vaesenc \T1, \XMM3, \XMM3
 880                vaesenc \T1, \XMM4, \XMM4
 881                vaesenc \T1, \XMM5, \XMM5
 882                vaesenc \T1, \XMM6, \XMM6
 883                vaesenc \T1, \XMM7, \XMM7
 884                vaesenc \T1, \XMM8, \XMM8
 885
 886        vmovdqa         TMP6(%rsp), \T1
 887        vmovdqa         HashKey_3(arg1), \T5
 888        vpclmulqdq      $0x11, \T5, \T1, \T3
 889        vpxor           \T3, \T4, \T4
 890        vpclmulqdq      $0x00, \T5, \T1, \T3
 891        vpxor           \T3, \T7, \T7
 892
 893        vpshufd         $0b01001110, \T1, \T3
 894        vpxor           \T1, \T3, \T3
 895        vmovdqa         HashKey_3_k(arg1), \T5
 896        vpclmulqdq      $0x10, \T5, \T3, \T3
 897        vpxor           \T3, \T6, \T6
 898
 899
 900                vmovdqu 16*8(arg1), \T1
 901                vaesenc \T1, \XMM1, \XMM1
 902                vaesenc \T1, \XMM2, \XMM2
 903                vaesenc \T1, \XMM3, \XMM3
 904                vaesenc \T1, \XMM4, \XMM4
 905                vaesenc \T1, \XMM5, \XMM5
 906                vaesenc \T1, \XMM6, \XMM6
 907                vaesenc \T1, \XMM7, \XMM7
 908                vaesenc \T1, \XMM8, \XMM8
 909
 910        vmovdqa         TMP7(%rsp), \T1
 911        vmovdqa         HashKey_2(arg1), \T5
 912        vpclmulqdq      $0x11, \T5, \T1, \T3
 913        vpxor           \T3, \T4, \T4
 914        vpclmulqdq      $0x00, \T5, \T1, \T3
 915        vpxor           \T3, \T7, \T7
 916
 917        vpshufd         $0b01001110, \T1, \T3
 918        vpxor           \T1, \T3, \T3
 919        vmovdqa         HashKey_2_k(arg1), \T5
 920        vpclmulqdq      $0x10, \T5, \T3, \T3
 921        vpxor           \T3, \T6, \T6
 922
 923        #######################################################################
 924
 925                vmovdqu 16*9(arg1), \T5
 926                vaesenc \T5, \XMM1, \XMM1
 927                vaesenc \T5, \XMM2, \XMM2
 928                vaesenc \T5, \XMM3, \XMM3
 929                vaesenc \T5, \XMM4, \XMM4
 930                vaesenc \T5, \XMM5, \XMM5
 931                vaesenc \T5, \XMM6, \XMM6
 932                vaesenc \T5, \XMM7, \XMM7
 933                vaesenc \T5, \XMM8, \XMM8
 934
 935        vmovdqa         TMP8(%rsp), \T1
 936        vmovdqa         HashKey(arg1), \T5
 937        vpclmulqdq      $0x11, \T5, \T1, \T3
 938        vpxor           \T3, \T4, \T4
 939        vpclmulqdq      $0x00, \T5, \T1, \T3
 940        vpxor           \T3, \T7, \T7
 941
 942        vpshufd         $0b01001110, \T1, \T3
 943        vpxor           \T1, \T3, \T3
 944        vmovdqa         HashKey_k(arg1), \T5
 945        vpclmulqdq      $0x10, \T5, \T3, \T3
 946        vpxor           \T3, \T6, \T6
 947
 948        vpxor           \T4, \T6, \T6
 949        vpxor           \T7, \T6, \T6
 950
 951                vmovdqu 16*10(arg1), \T5
 952
 953        i = 0
 954        j = 1
 955        setreg
 956.rep 8
 957                vpxor   16*i(arg3, %r11), \T5, \T2
 958                .if \ENC_DEC == ENC
 959                vaesenclast     \T2, reg_j, reg_j
 960                .else
 961                vaesenclast     \T2, reg_j, \T3
 962                vmovdqu 16*i(arg3, %r11), reg_j
 963                vmovdqu \T3, 16*i(arg2, %r11)
 964                .endif
 965        i = (i+1)
 966        j = (j+1)
 967        setreg
 968.endr
 969        #######################################################################
 970
 971
 972        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
  973        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
 974        vpxor   \T3, \T7, \T7
 975        vpxor   \T4, \T6, \T6                           # accumulate the results in T6:T7
 976
 977
 978
 979        #######################################################################
 980        #first phase of the reduction
 981        #######################################################################
 982        vpslld  $31, \T7, \T2                           # packed right shifting << 31
  983        vpslld  $30, \T7, \T3                           # packed right shifting << 30
  984        vpslld  $25, \T7, \T4                           # packed right shifting << 25
 985
 986        vpxor   \T3, \T2, \T2                           # xor the shifted versions
 987        vpxor   \T4, \T2, \T2
 988
 989        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
 990
 991        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
 992        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
 993        #######################################################################
 994                .if \ENC_DEC == ENC
 995                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
 996                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
 997                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
 998                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
 999                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
1000                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
1001                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
1002                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
1003                .endif
1004
1005        #######################################################################
1006        #second phase of the reduction
1007        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
1008        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
1009        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
1010        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1011        vpxor   \T4, \T2, \T2
1012
1013        vpxor   \T1, \T2, \T2
1014        vpxor   \T2, \T7, \T7
1015        vpxor   \T7, \T6, \T6                           # the result is in T6
1016        #######################################################################
1017
1018                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
1019                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
1020                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
1021                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
1022                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
1023                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
1024                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
1025                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
1026
1027
1028        vpxor   \T6, \XMM1, \XMM1
1029
1030
1031
1032.endm
1033
1034
 1035# GHASH the last 8 ciphertext blocks.
1036.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1037
1038        ## Karatsuba Method
1039
1040
1041        vpshufd         $0b01001110, \XMM1, \T2
1042        vpxor           \XMM1, \T2, \T2
1043        vmovdqa         HashKey_8(arg1), \T5
1044        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1045        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1046
1047        vmovdqa         HashKey_8_k(arg1), \T3
1048        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1049
1050        ######################
1051
1052        vpshufd         $0b01001110, \XMM2, \T2
1053        vpxor           \XMM2, \T2, \T2
1054        vmovdqa         HashKey_7(arg1), \T5
1055        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1056        vpxor           \T4, \T6, \T6
1057
1058        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1059        vpxor           \T4, \T7, \T7
1060
1061        vmovdqa         HashKey_7_k(arg1), \T3
1062        vpclmulqdq      $0x00, \T3, \T2, \T2
1063        vpxor           \T2, \XMM1, \XMM1
1064
1065        ######################
1066
1067        vpshufd         $0b01001110, \XMM3, \T2
1068        vpxor           \XMM3, \T2, \T2
1069        vmovdqa         HashKey_6(arg1), \T5
1070        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1071        vpxor           \T4, \T6, \T6
1072
1073        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1074        vpxor           \T4, \T7, \T7
1075
1076        vmovdqa         HashKey_6_k(arg1), \T3
1077        vpclmulqdq      $0x00, \T3, \T2, \T2
1078        vpxor           \T2, \XMM1, \XMM1
1079
1080        ######################
1081
1082        vpshufd         $0b01001110, \XMM4, \T2
1083        vpxor           \XMM4, \T2, \T2
1084        vmovdqa         HashKey_5(arg1), \T5
1085        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1086        vpxor           \T4, \T6, \T6
1087
1088        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1089        vpxor           \T4, \T7, \T7
1090
1091        vmovdqa         HashKey_5_k(arg1), \T3
1092        vpclmulqdq      $0x00, \T3, \T2, \T2
1093        vpxor           \T2, \XMM1, \XMM1
1094
1095        ######################
1096
1097        vpshufd         $0b01001110, \XMM5, \T2
1098        vpxor           \XMM5, \T2, \T2
1099        vmovdqa         HashKey_4(arg1), \T5
1100        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1101        vpxor           \T4, \T6, \T6
1102
1103        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1104        vpxor           \T4, \T7, \T7
1105
1106        vmovdqa         HashKey_4_k(arg1), \T3
1107        vpclmulqdq      $0x00, \T3, \T2, \T2
1108        vpxor           \T2, \XMM1, \XMM1
1109
1110        ######################
1111
1112        vpshufd         $0b01001110, \XMM6, \T2
1113        vpxor           \XMM6, \T2, \T2
1114        vmovdqa         HashKey_3(arg1), \T5
1115        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1116        vpxor           \T4, \T6, \T6
1117
1118        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1119        vpxor           \T4, \T7, \T7
1120
1121        vmovdqa         HashKey_3_k(arg1), \T3
1122        vpclmulqdq      $0x00, \T3, \T2, \T2
1123        vpxor           \T2, \XMM1, \XMM1
1124
1125        ######################
1126
1127        vpshufd         $0b01001110, \XMM7, \T2
1128        vpxor           \XMM7, \T2, \T2
1129        vmovdqa         HashKey_2(arg1), \T5
1130        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1131        vpxor           \T4, \T6, \T6
1132
1133        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1134        vpxor           \T4, \T7, \T7
1135
1136        vmovdqa         HashKey_2_k(arg1), \T3
1137        vpclmulqdq      $0x00, \T3, \T2, \T2
1138        vpxor           \T2, \XMM1, \XMM1
1139
1140        ######################
1141
1142        vpshufd         $0b01001110, \XMM8, \T2
1143        vpxor           \XMM8, \T2, \T2
1144        vmovdqa         HashKey(arg1), \T5
1145        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1146        vpxor           \T4, \T6, \T6
1147
1148        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1149        vpxor           \T4, \T7, \T7
1150
1151        vmovdqa         HashKey_k(arg1), \T3
1152        vpclmulqdq      $0x00, \T3, \T2, \T2
1153
1154        vpxor           \T2, \XMM1, \XMM1
1155        vpxor           \T6, \XMM1, \XMM1
1156        vpxor           \T7, \XMM1, \T2
1157
1158
1159
1160
1161        vpslldq $8, \T2, \T4
1162        vpsrldq $8, \T2, \T2
1163
1164        vpxor   \T4, \T7, \T7
1165        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1166                                # the accumulated carry-less multiplications
1167
1168        #######################################################################
1169        #first phase of the reduction
1170        vpslld  $31, \T7, \T2   # packed right shifting << 31
 1171        vpslld  $30, \T7, \T3   # packed right shifting << 30
 1172        vpslld  $25, \T7, \T4   # packed right shifting << 25
1173
1174        vpxor   \T3, \T2, \T2   # xor the shifted versions
1175        vpxor   \T4, \T2, \T2
1176
1177        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1178
1179        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1180        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1181        #######################################################################
1182
1183
1184        #second phase of the reduction
1185        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1186        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1187        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1188        vpxor   \T3, \T2, \T2   # xor the shifted versions
1189        vpxor   \T4, \T2, \T2
1190
1191        vpxor   \T1, \T2, \T2
1192        vpxor   \T2, \T7, \T7
1193        vpxor   \T7, \T6, \T6   # the result is in T6
1194
1195.endm
1196
1197
 1198# combined body for the GCM encrypt and decrypt functions
1199# clobbering all xmm registers
1200# clobbering r10, r11, r12, r13, r14, r15
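     # high-level flow, as a sketch: save registers and carve out the aligned stack
     # frame, GHASH the AAD and handle 0-7 initial blocks (INITIAL_BLOCKS_AVX),
     # process the bulk in 8-block chunks (GHASH_8_ENCRYPT_8_PARALLEL_AVX), fold the
     # final 8 ciphertext blocks into the hash (GHASH_LAST_8_AVX), then deal with a
     # trailing partial block and the length block for the tag.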
1201.macro  GCM_ENC_DEC_AVX     ENC_DEC
1202
1203        #the number of pushes must equal STACK_OFFSET
1204        push    %r12
1205        push    %r13
1206        push    %r14
1207        push    %r15
1208
1209        mov     %rsp, %r14
1210
1211
1212
1213
1214        sub     $VARIABLE_OFFSET, %rsp
1215        and     $~63, %rsp                  # align rsp to 64 bytes
1216
1217
1218        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1219
1220        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1221        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1222
1223        mov     %r13, %r12
1224        shr     $4, %r12
1225        and     $7, %r12
1226        jz      _initial_num_blocks_is_0\@
1227
1228        cmp     $7, %r12
1229        je      _initial_num_blocks_is_7\@
1230        cmp     $6, %r12
1231        je      _initial_num_blocks_is_6\@
1232        cmp     $5, %r12
1233        je      _initial_num_blocks_is_5\@
1234        cmp     $4, %r12
1235        je      _initial_num_blocks_is_4\@
1236        cmp     $3, %r12
1237        je      _initial_num_blocks_is_3\@
1238        cmp     $2, %r12
1239        je      _initial_num_blocks_is_2\@
1240
1241        jmp     _initial_num_blocks_is_1\@
1242
1243_initial_num_blocks_is_7\@:
1244        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1245        sub     $16*7, %r13
1246        jmp     _initial_blocks_encrypted\@
1247
1248_initial_num_blocks_is_6\@:
1249        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1250        sub     $16*6, %r13
1251        jmp     _initial_blocks_encrypted\@
1252
1253_initial_num_blocks_is_5\@:
1254        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1255        sub     $16*5, %r13
1256        jmp     _initial_blocks_encrypted\@
1257
1258_initial_num_blocks_is_4\@:
1259        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1260        sub     $16*4, %r13
1261        jmp     _initial_blocks_encrypted\@
1262
1263_initial_num_blocks_is_3\@:
1264        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1265        sub     $16*3, %r13
1266        jmp     _initial_blocks_encrypted\@
1267
1268_initial_num_blocks_is_2\@:
1269        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1270        sub     $16*2, %r13
1271        jmp     _initial_blocks_encrypted\@
1272
1273_initial_num_blocks_is_1\@:
1274        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1275        sub     $16*1, %r13
1276        jmp     _initial_blocks_encrypted\@
1277
1278_initial_num_blocks_is_0\@:
1279        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1280
1281
1282_initial_blocks_encrypted\@:
1283        cmp     $0, %r13
1284        je      _zero_cipher_left\@
1285
1286        sub     $128, %r13
1287        je      _eight_cipher_left\@
1288
1289
1290
1291
1292        vmovd   %xmm9, %r15d
1293        and     $255, %r15d
1294        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1295
1296
1297_encrypt_by_8_new\@:
1298        cmp     $(255-8), %r15d
1299        jg      _encrypt_by_8\@
1300
1301
1302
1303        add     $8, %r15b
1304        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1305        add     $128, %r11
1306        sub     $128, %r13
1307        jne     _encrypt_by_8_new\@
1308
1309        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1310        jmp     _eight_cipher_left\@
1311
1312_encrypt_by_8\@:
1313        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1314        add     $8, %r15b
1315        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1316        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1317        add     $128, %r11
1318        sub     $128, %r13
1319        jne     _encrypt_by_8_new\@
1320
1321        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1322
1323
1324
1325
1326_eight_cipher_left\@:
1327        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1328
1329
1330_zero_cipher_left\@:
1331        cmp     $16, arg4
1332        jl      _only_less_than_16\@
1333
1334        mov     arg4, %r13
1335        and     $15, %r13                            # r13 = (arg4 mod 16)
1336
1337        je      _multiple_of_16_bytes\@
1338
 1339        # handle the last <16 Byte block separately
1340
1341
1342        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1343        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1344        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1345
1346        sub     $16, %r11
1347        add     %r13, %r11
1348        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1349
1350        lea     SHIFT_MASK+16(%rip), %r12
1351        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1352                                                     # able to shift 16-r13 bytes (r13 is the
1353                                                     # number of bytes in plaintext mod 16)
1354        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1355        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1356        jmp     _final_ghash_mul\@
1357
1358_only_less_than_16\@:
1359        # check for 0 length
1360        mov     arg4, %r13
1361        and     $15, %r13                            # r13 = (arg4 mod 16)
1362
1363        je      _multiple_of_16_bytes\@
1364
1365        # handle the last <16 Byte block separately
1366
1367
1368        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1369        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1370        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1371
1372
1373        lea     SHIFT_MASK+16(%rip), %r12
1374        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1375                                                     # able to shift 16-r13 bytes (r13 is the
1376                                                     # number of bytes in plaintext mod 16)
1377
1378_get_last_16_byte_loop\@:
1379        movb    (arg3, %r11),  %al
1380        movb    %al,  TMP1 (%rsp , %r11)
1381        add     $1, %r11
1382        cmp     %r13,  %r11
1383        jne     _get_last_16_byte_loop\@
1384
1385        vmovdqu  TMP1(%rsp), %xmm1
1386
1387        sub     $16, %r11
1388
1389_final_ghash_mul\@:
1390        .if  \ENC_DEC ==  DEC
1391        vmovdqa %xmm1, %xmm2
1392        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1393        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1394                                                     # mask out top 16-r13 bytes of xmm9
1395        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1396        vpand   %xmm1, %xmm2, %xmm2
1397        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1398        vpxor   %xmm2, %xmm14, %xmm14
1399        #GHASH computation for the last <16 Byte block
1400        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1401        sub     %r13, %r11
1402        add     $16, %r11
1403        .else
1404        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1405        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1406                                                     # mask out top 16-r13 bytes of xmm9
1407        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1408        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1409        vpxor   %xmm9, %xmm14, %xmm14
1410        #GHASH computation for the last <16 Byte block
1411        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1412        sub     %r13, %r11
1413        add     $16, %r11
1414        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1415        .endif
1416
1417
1418        #############################
1419        # output r13 Bytes
1420        vmovq   %xmm9, %rax
1421        cmp     $8, %r13
1422        jle     _less_than_8_bytes_left\@
1423
1424        mov     %rax, (arg2 , %r11)
1425        add     $8, %r11
1426        vpsrldq $8, %xmm9, %xmm9
1427        vmovq   %xmm9, %rax
1428        sub     $8, %r13
1429
1430_less_than_8_bytes_left\@:
1431        movb    %al, (arg2 , %r11)
1432        add     $1, %r11
1433        shr     $8, %rax
1434        sub     $1, %r13
1435        jne     _less_than_8_bytes_left\@
1436        #############################
1437
1438_multiple_of_16_bytes\@:
1439        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1440        shl     $3, %r12                             # convert into number of bits
1441        vmovd   %r12d, %xmm15                        # len(A) in xmm15
1442
1443        shl     $3, arg4                             # len(C) in bits  (*8)
1444        vmovq   arg4, %xmm1
1445        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1446        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
1447
1448        vpxor   %xmm15, %xmm14, %xmm14
1449        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1450        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1451
1452        mov     arg5, %rax                           # rax = *Y0
1453        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1454
1455        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1456
1457        vpxor   %xmm14, %xmm9, %xmm9
1458
1459
1460
1461_return_T\@:
1462        mov     arg8, %r10              # r10 = authTag
1463        mov     arg9, %r11              # r11 = auth_tag_len
1464
1465        cmp     $16, %r11
1466        je      _T_16\@
1467
1468        cmp     $8, %r11
1469        jl      _T_4\@
1470
1471_T_8\@:
1472        vmovq   %xmm9, %rax
1473        mov     %rax, (%r10)
1474        add     $8, %r10
1475        sub     $8, %r11
1476        vpsrldq $8, %xmm9, %xmm9
1477        cmp     $0, %r11
1478        je     _return_T_done\@
1479_T_4\@:
1480        vmovd   %xmm9, %eax
1481        mov     %eax, (%r10)
1482        add     $4, %r10
1483        sub     $4, %r11
1484        vpsrldq     $4, %xmm9, %xmm9
1485        cmp     $0, %r11
1486        je     _return_T_done\@
1487_T_123\@:
1488        vmovd     %xmm9, %eax
1489        cmp     $2, %r11
1490        jl     _T_1\@
1491        mov     %ax, (%r10)
1492        cmp     $2, %r11
1493        je     _return_T_done\@
1494        add     $2, %r10
1495        sar     $16, %eax
1496_T_1\@:
1497        mov     %al, (%r10)
1498        jmp     _return_T_done\@
1499
1500_T_16\@:
1501        vmovdqu %xmm9, (%r10)
1502
1503_return_T_done\@:
1504        mov     %r14, %rsp
1505
1506        pop     %r15
1507        pop     %r14
1508        pop     %r13
1509        pop     %r12
1510.endm
1511
1512
1513#############################################################
1514#void   aesni_gcm_precomp_avx_gen2
1515#        (gcm_data     *my_ctx_data,
1516#        u8     *hash_subkey); /* H, the hash subkey input. Data starts on a 16-byte boundary. */
1517#############################################################
1518ENTRY(aesni_gcm_precomp_avx_gen2)
1519        #the number of pushes must equal STACK_OFFSET
1520        push    %r12
1521        push    %r13
1522        push    %r14
1523        push    %r15
1524
1525        mov     %rsp, %r14
1526
1527
1528
1529        sub     $VARIABLE_OFFSET, %rsp
1530        and     $~63, %rsp                  # align rsp to 64 bytes
1531
1532        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1533
1534        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1535        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1536        vmovdqa  %xmm6, %xmm2
1537        vpsllq   $1, %xmm6, %xmm6
1538        vpsrlq   $63, %xmm2, %xmm2
1539        vmovdqa  %xmm2, %xmm1
1540        vpslldq  $8, %xmm2, %xmm2
1541        vpsrldq  $8, %xmm1, %xmm1
1542        vpor     %xmm2, %xmm6, %xmm6
1543        #reduction
1544        vpshufd  $0b00100100, %xmm1, %xmm2
1545        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1546        vpand    POLY(%rip), %xmm2, %xmm2
1547        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1548        #######################################################################
1549        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1550
1551
1552        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1553
1554        mov     %r14, %rsp
1555
1556        pop     %r15
1557        pop     %r14
1558        pop     %r13
1559        pop     %r12
1560        ret
1561ENDPROC(aesni_gcm_precomp_avx_gen2)
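###############################################################################
# Illustrative C sketch (kept entirely in comments, not assembled): how a
# caller is expected to drive the precompute step above. The opaque gcm_data
# struct tag and the aes_encrypt_one() single-block helper are assumptions of
# this sketch; in GCM the hash subkey H is E(K, 0^128), i.e. the all-zero
# block encrypted under the AES key already expanded into the context.
#
#       struct gcm_data;                        /* opaque, 16-byte aligned */
#       void aesni_gcm_precomp_avx_gen2(struct gcm_data *my_ctx_data,
#                                       u8 *hash_subkey);
#
#       static void gcm_setup_sketch(struct gcm_data *ctx)
#       {
#               u8 hash_subkey[16] = { 0 };
#
#               aes_encrypt_one(ctx, hash_subkey, hash_subkey); /* H = E(K, 0) */
#               aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);   /* fill HashKey_i */
#       }
###############################################################################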
1562
1563###############################################################################
1564#void   aesni_gcm_enc_avx_gen2(
1565#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1566#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1567#        const   u8 *in, /* Plaintext input */
1568#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1569#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1570#                       (from Security Association) concatenated with 8 byte
1571#                       Initialization Vector (from IPSec ESP Payload)
1572#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1573#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1574#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1575#        u8      *auth_tag, /* Authenticated Tag output. */
1576#        u64     auth_tag_len); /* Authenticated Tag Length in bytes.
1577#                               Valid values are 16 (most likely), 12 or 8. */
1578###############################################################################
1579ENTRY(aesni_gcm_enc_avx_gen2)
1580        GCM_ENC_DEC_AVX     ENC
1581        ret
1582ENDPROC(aesni_gcm_enc_avx_gen2)
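###############################################################################
# Illustrative C sketch (kept entirely in comments, not assembled): one encrypt
# call following the argument layout documented above. The salt, esp_iv, aad
# and buffer variables are assumptions of this sketch; ctx is the gcm_data
# area prepared by the precompute call. The 16-byte pre-counter block is built
# exactly as the prototype describes: 4-byte salt, 8-byte IV, then the
# big-endian constant 0x00000001.
#
#       u8 aligned_iv[16] __attribute__((aligned(16)));
#
#       memcpy(aligned_iv, salt, 4);            /* salt from the SA          */
#       memcpy(aligned_iv + 4, esp_iv, 8);      /* IV from the ESP payload   */
#       aligned_iv[12] = 0;
#       aligned_iv[13] = 0;
#       aligned_iv[14] = 0;
#       aligned_iv[15] = 1;                     /* trailing 0x00000001       */
#
#       aesni_gcm_enc_avx_gen2(ctx, ciphertext, plaintext, plaintext_len,
#                              aligned_iv, aad, aad_len /* 8 or 12 */,
#                              auth_tag, 16);
###############################################################################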
1583
1584###############################################################################
1585#void   aesni_gcm_dec_avx_gen2(
1586#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1587#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1588#        const   u8 *in, /* Ciphertext input */
1589#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
1590#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1591#                       (from Security Association) concatenated with 8 byte
1592#                       Initialization Vector (from IPSec ESP Payload)
1593#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1594#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1595#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1596#        u8      *auth_tag, /* Authenticated Tag output. */
1597#        u64     auth_tag_len); /* Authenticated Tag Length in bytes.
1598#                               Valid values are 16 (most likely), 12 or 8. */
1599###############################################################################
1600ENTRY(aesni_gcm_dec_avx_gen2)
1601        GCM_ENC_DEC_AVX     DEC
1602        ret
1603ENDPROC(aesni_gcm_dec_avx_gen2)
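###############################################################################
# Illustrative C sketch (kept entirely in comments, not assembled): decryption
# mirrors the encrypt call and always writes the computed authentication tag,
# so the caller must compare it against the received tag and discard the
# plaintext on mismatch. The variable names and the comparison loop are
# assumptions of this sketch (aligned_iv built as in the encrypt sketch); a
# real caller should use a constant-time comparison helper.
#
#       u8 calc_tag[16];
#       u8 diff = 0;
#       int i;
#
#       aesni_gcm_dec_avx_gen2(ctx, plaintext, ciphertext, len,
#                              aligned_iv, aad, aad_len, calc_tag, 16);
#       for (i = 0; i < 16; i++)
#               diff |= calc_tag[i] ^ recv_tag[i];
#       if (diff)
#               return -EBADMSG;                /* authentication failed */
###############################################################################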
1604#endif /* CONFIG_AS_AVX */
1605
1606#ifdef CONFIG_AS_AVX2
1607###############################################################################
1608# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1609# Input: A and B (128-bits each, bit-reflected)
1610# Output: C = A*B*x mod poly, (i.e. >>1 )
1611# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1612# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1613###############################################################################
1614.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1615
1616        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1617        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1618        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1619        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1620        vpxor           \T3, \GH, \GH
1621
1622
1623        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1624        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1625
1626        vpxor           \T3, \T1, \T1
1627        vpxor           \T2, \GH, \GH
1628
1629        #######################################################################
1630        #first phase of the reduction
1631        vmovdqa         POLY2(%rip), \T3
1632
1633        vpclmulqdq      $0x01, \GH, \T3, \T2
1634        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1635
1636        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1637        #######################################################################
1638        #second phase of the reduction
1639        vpclmulqdq      $0x00, \GH, \T3, \T2
1640        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1641
1642        vpclmulqdq      $0x10, \GH, \T3, \GH
1643        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1644
1645        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1646        #######################################################################
1647        vpxor           \T1, \GH, \GH          # the result is in GH
1648
1649
1650.endm
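###############################################################################
# Illustrative C reference (kept entirely in comments, not assembled) for the
# GF(2^128) multiplication that the macro above computes with vpclmulqdq and a
# two-phase reduction. This is the bit-serial algorithm from NIST SP 800-38D
# (MSB-first bit order, R = 0xe1 followed by fifteen zero bytes); it is a
# correctness sketch only and does not use the bit-reflected layout or the
# HashKey<<1 mod poly trick used by the optimized code.
#
#       static void ghash_mul_ref(const u8 x[16], const u8 y[16], u8 z[16])
#       {
#               u8 v[16];
#               int i, j, lsb;
#
#               memcpy(v, y, 16);
#               memset(z, 0, 16);
#               for (i = 0; i < 128; i++) {
#                       if (x[i / 8] & (0x80 >> (i % 8)))       /* bit i of x set? */
#                               for (j = 0; j < 16; j++)
#                                       z[j] ^= v[j];           /* z ^= v          */
#                       lsb = v[15] & 1;
#                       for (j = 15; j > 0; j--)                /* v = v * x       */
#                               v[j] = (v[j] >> 1) | (v[j - 1] << 7);
#                       v[0] >>= 1;
#                       if (lsb)
#                               v[0] ^= 0xe1;                   /* reduce mod g(x) */
#               }
#       }
###############################################################################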
1651
1652.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1653
1654        # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1655        vmovdqa  \HK, \T5
1656        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1657        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1658
1659        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1660        vmovdqa  \T5, HashKey_3(arg1)
1661
1662        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1663        vmovdqa  \T5, HashKey_4(arg1)
1664
1665        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1666        vmovdqa  \T5, HashKey_5(arg1)
1667
1668        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1669        vmovdqa  \T5, HashKey_6(arg1)
1670
1671        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1672        vmovdqa  \T5, HashKey_7(arg1)
1673
1674        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1675        vmovdqa  \T5, HashKey_8(arg1)
1676
1677.endm
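###############################################################################
# The powers stored above exist so that eight ciphertext blocks can be folded
# into the GHASH digest with independent multiplications instead of a serial
# chain:
#
#       Y' = (...((Y ^ C1)*H ^ C2)*H ... ^ C8)*H
#          = (Y ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C7*H^2 ^ C8*H
#
# Illustrative C sketch (kept entirely in comments, not assembled), reusing the
# ghash_mul_ref() sketch given after GHASH_MUL_AVX2 above; h_pow[k] holding
# H^(8-k) and the big-endian byte order are assumptions of this sketch.
#
#       static void ghash_8_blocks_ref(u8 y[16], const u8 c[8][16],
#                                      const u8 h_pow[8][16])
#       {
#               u8 acc[16] = { 0 }, t[16], blk[16];
#               int k, j;
#
#               for (k = 0; k < 8; k++) {
#                       memcpy(blk, c[k], 16);
#                       if (k == 0)
#                               for (j = 0; j < 16; j++)
#                                       blk[j] ^= y[j];         /* fold in old digest */
#                       ghash_mul_ref(blk, h_pow[k], t);        /* Ck * H^(8-k)       */
#                       for (j = 0; j < 16; j++)
#                               acc[j] ^= t[j];
#               }
#               memcpy(y, acc, 16);
#       }
###############################################################################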
1678
1679
1680## if a = number of total plaintext bytes
1681## b = floor(a/16)
1682## num_initial_blocks = b mod 8
1683## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1684## r10, r11, r12, rax are clobbered
1685## arg1, arg2, arg3, r14 are used as pointers only, not modified
1686
1687.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1688        i = (8-\num_initial_blocks)
1689        j = 0
1690        setreg
1691
1692        mov     arg6, %r10                       # r10 = AAD
1693        mov     arg7, %r12                       # r12 = aadLen
1694
1695
1696        mov     %r12, %r11
1697
1698        vpxor   reg_j, reg_j, reg_j
1699        vpxor   reg_i, reg_i, reg_i
1700
1701        cmp     $16, %r11
1702        jl      _get_AAD_rest8\@
1703_get_AAD_blocks\@:
1704        vmovdqu (%r10), reg_i
1705        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1706        vpxor   reg_i, reg_j, reg_j
1707        GHASH_MUL_AVX2      reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1708        add     $16, %r10
1709        sub     $16, %r12
1710        sub     $16, %r11
1711        cmp     $16, %r11
1712        jge     _get_AAD_blocks\@
1713        vmovdqu reg_j, reg_i
1714        cmp     $0, %r11
1715        je      _get_AAD_done\@
1716
1717        vpxor   reg_i, reg_i, reg_i
1718
1719        /* read the last <16B of AAD. Since there are at least 4B of
1720        data right after the AAD (the ICV, and possibly some ciphertext),
1721        we can safely read 4B/8B chunks and then discard the extra bytes */
1722_get_AAD_rest8\@:
1723        cmp     $4, %r11
1724        jle     _get_AAD_rest4\@
1725        movq    (%r10), \T1
1726        add     $8, %r10
1727        sub     $8, %r11
1728        vpslldq $8, \T1, \T1
1729        vpsrldq $8, reg_i, reg_i
1730        vpxor   \T1, reg_i, reg_i
1731        jmp     _get_AAD_rest8\@
1732_get_AAD_rest4\@:
1733        cmp     $0, %r11
1734        jle     _get_AAD_rest0\@
1735        mov     (%r10), %eax
1736        movq    %rax, \T1
1737        add     $4, %r10
1738        sub     $4, %r11
1739        vpslldq $12, \T1, \T1
1740        vpsrldq $4, reg_i, reg_i
1741        vpxor   \T1, reg_i, reg_i
1742_get_AAD_rest0\@:
1743        /* finalize: shift out the extra bytes we read and align the
1744        data to the left. Since pslldq can only shift by an immediate,
1745        we use vpshufb with an array of shuffle masks */
1746        movq    %r12, %r11
1747        salq    $4, %r11
1748        movdqu  aad_shift_arr(%r11), \T1
1749        vpshufb \T1, reg_i, reg_i
1750_get_AAD_rest_final\@:
1751        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1752        vpxor   reg_j, reg_i, reg_i
1753        GHASH_MUL_AVX2      reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1754
1755_get_AAD_done\@:
1756        # initialize the data pointer offset as zero
1757        xor     %r11, %r11
1758
1759        # start AES for num_initial_blocks blocks
1760        mov     arg5, %rax                     # rax = *Y0
1761        vmovdqu (%rax), \CTR                   # CTR = Y0
1762        vpshufb SHUF_MASK(%rip), \CTR, \CTR
1763
1764
1765        i = (9-\num_initial_blocks)
1766        setreg
1767.rep \num_initial_blocks
1768                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1769                vmovdqa \CTR, reg_i
1770                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1771        i = (i+1)
1772        setreg
1773.endr
1774
1775        vmovdqa  (arg1), \T_key
1776        i = (9-\num_initial_blocks)
1777        setreg
1778.rep \num_initial_blocks
1779                vpxor   \T_key, reg_i, reg_i
1780        i = (i+1)
1781        setreg
1782.endr
1783
1784        j = 1
1785        setreg
1786.rep 9
1787        vmovdqa  16*j(arg1), \T_key
1788        i = (9-\num_initial_blocks)
1789        setreg
1790.rep \num_initial_blocks
1791        vaesenc \T_key, reg_i, reg_i
1792        i = (i+1)
1793        setreg
1794.endr
1795
1796        j = (j+1)
1797        setreg
1798.endr
1799
1800
1801        vmovdqa  16*10(arg1), \T_key
1802        i = (9-\num_initial_blocks)
1803        setreg
1804.rep \num_initial_blocks
1805        vaesenclast      \T_key, reg_i, reg_i
1806        i = (i+1)
1807        setreg
1808.endr
1809
1810        i = (9-\num_initial_blocks)
1811        setreg
1812.rep \num_initial_blocks
1813                vmovdqu (arg3, %r11), \T1
1814                vpxor   \T1, reg_i, reg_i
1815                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1816                                                       # num_initial_blocks blocks
1817                add     $16, %r11
1818.if  \ENC_DEC == DEC
1819                vmovdqa \T1, reg_i
1820.endif
1821                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1822        i = (i+1)
1823        setreg
1824.endr
1825
1826
1827        i = (8-\num_initial_blocks)
1828        j = (9-\num_initial_blocks)
1829        setreg
1830
1831.rep \num_initial_blocks
1832        vpxor    reg_i, reg_j, reg_j
1833        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1834        i = (i+1)
1835        j = (j+1)
1836        setreg
1837.endr
1838        # XMM8 has the combined result here
1839
1840        vmovdqa  \XMM8, TMP1(%rsp)
1841        vmovdqa  \XMM8, \T3
1842
1843        cmp     $128, %r13
1844        jl      _initial_blocks_done\@                  # no need for precomputed constants
1845
1846###############################################################################
1847# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1848                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1849                vmovdqa  \CTR, \XMM1
1850                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1851
1852                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1853                vmovdqa  \CTR, \XMM2
1854                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1855
1856                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1857                vmovdqa  \CTR, \XMM3
1858                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1859
1860                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1861                vmovdqa  \CTR, \XMM4
1862                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1863
1864                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1865                vmovdqa  \CTR, \XMM5
1866                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1867
1868                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1869                vmovdqa  \CTR, \XMM6
1870                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1871
1872                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1873                vmovdqa  \CTR, \XMM7
1874                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1875
1876                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1877                vmovdqa  \CTR, \XMM8
1878                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1879
1880                vmovdqa  (arg1), \T_key
1881                vpxor    \T_key, \XMM1, \XMM1
1882                vpxor    \T_key, \XMM2, \XMM2
1883                vpxor    \T_key, \XMM3, \XMM3
1884                vpxor    \T_key, \XMM4, \XMM4
1885                vpxor    \T_key, \XMM5, \XMM5
1886                vpxor    \T_key, \XMM6, \XMM6
1887                vpxor    \T_key, \XMM7, \XMM7
1888                vpxor    \T_key, \XMM8, \XMM8
1889
1890                i = 1
1891                setreg
1892.rep    9       # do 9 rounds
1893                vmovdqa  16*i(arg1), \T_key
1894                vaesenc  \T_key, \XMM1, \XMM1
1895                vaesenc  \T_key, \XMM2, \XMM2
1896                vaesenc  \T_key, \XMM3, \XMM3
1897                vaesenc  \T_key, \XMM4, \XMM4
1898                vaesenc  \T_key, \XMM5, \XMM5
1899                vaesenc  \T_key, \XMM6, \XMM6
1900                vaesenc  \T_key, \XMM7, \XMM7
1901                vaesenc  \T_key, \XMM8, \XMM8
1902                i = (i+1)
1903                setreg
1904.endr
1905
1906
1907                vmovdqa  16*i(arg1), \T_key
1908                vaesenclast  \T_key, \XMM1, \XMM1
1909                vaesenclast  \T_key, \XMM2, \XMM2
1910                vaesenclast  \T_key, \XMM3, \XMM3
1911                vaesenclast  \T_key, \XMM4, \XMM4
1912                vaesenclast  \T_key, \XMM5, \XMM5
1913                vaesenclast  \T_key, \XMM6, \XMM6
1914                vaesenclast  \T_key, \XMM7, \XMM7
1915                vaesenclast  \T_key, \XMM8, \XMM8
1916
1917                vmovdqu  (arg3, %r11), \T1
1918                vpxor    \T1, \XMM1, \XMM1
1919                vmovdqu  \XMM1, (arg2 , %r11)
1920                .if   \ENC_DEC == DEC
1921                vmovdqa  \T1, \XMM1
1922                .endif
1923
1924                vmovdqu  16*1(arg3, %r11), \T1
1925                vpxor    \T1, \XMM2, \XMM2
1926                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1927                .if   \ENC_DEC == DEC
1928                vmovdqa  \T1, \XMM2
1929                .endif
1930
1931                vmovdqu  16*2(arg3, %r11), \T1
1932                vpxor    \T1, \XMM3, \XMM3
1933                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1934                .if   \ENC_DEC == DEC
1935                vmovdqa  \T1, \XMM3
1936                .endif
1937
1938                vmovdqu  16*3(arg3, %r11), \T1
1939                vpxor    \T1, \XMM4, \XMM4
1940                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1941                .if   \ENC_DEC == DEC
1942                vmovdqa  \T1, \XMM4
1943                .endif
1944
1945                vmovdqu  16*4(arg3, %r11), \T1
1946                vpxor    \T1, \XMM5, \XMM5
1947                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1948                .if   \ENC_DEC == DEC
1949                vmovdqa  \T1, \XMM5
1950                .endif
1951
1952                vmovdqu  16*5(arg3, %r11), \T1
1953                vpxor    \T1, \XMM6, \XMM6
1954                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1955                .if   \ENC_DEC == DEC
1956                vmovdqa  \T1, \XMM6
1957                .endif
1958
1959                vmovdqu  16*6(arg3, %r11), \T1
1960                vpxor    \T1, \XMM7, \XMM7
1961                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1962                .if   \ENC_DEC == DEC
1963                vmovdqa  \T1, \XMM7
1964                .endif
1965
1966                vmovdqu  16*7(arg3, %r11), \T1
1967                vpxor    \T1, \XMM8, \XMM8
1968                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1969                .if   \ENC_DEC == DEC
1970                vmovdqa  \T1, \XMM8
1971                .endif
1972
1973                add     $128, %r11
1974
1975                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1976                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1977                                                           # the corresponding ciphertext
1978                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1979                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1980                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1981                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1982                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1983                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1984                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1985
1986###############################################################################
1987
1988_initial_blocks_done\@:
1989
1990
1991.endm
1992
1993
1994
1995# encrypt 8 blocks at a time
1996# ghash the 8 previously encrypted ciphertext blocks
1997# arg1, arg2, arg3 are used as pointers only, not modified
1998# r11 is the data offset value
1999.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2000
2001        vmovdqa \XMM1, \T2
2002        vmovdqa \XMM2, TMP2(%rsp)
2003        vmovdqa \XMM3, TMP3(%rsp)
2004        vmovdqa \XMM4, TMP4(%rsp)
2005        vmovdqa \XMM5, TMP5(%rsp)
2006        vmovdqa \XMM6, TMP6(%rsp)
2007        vmovdqa \XMM7, TMP7(%rsp)
2008        vmovdqa \XMM8, TMP8(%rsp)
2009
2010.if \loop_idx == in_order
2011                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2012                vpaddd  ONE(%rip), \XMM1, \XMM2
2013                vpaddd  ONE(%rip), \XMM2, \XMM3
2014                vpaddd  ONE(%rip), \XMM3, \XMM4
2015                vpaddd  ONE(%rip), \XMM4, \XMM5
2016                vpaddd  ONE(%rip), \XMM5, \XMM6
2017                vpaddd  ONE(%rip), \XMM6, \XMM7
2018                vpaddd  ONE(%rip), \XMM7, \XMM8
2019                vmovdqa \XMM8, \CTR
2020
2021                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2022                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2023                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2024                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2025                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2026                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2027                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2028                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2029.else
2030                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2031                vpaddd  ONEf(%rip), \XMM1, \XMM2
2032                vpaddd  ONEf(%rip), \XMM2, \XMM3
2033                vpaddd  ONEf(%rip), \XMM3, \XMM4
2034                vpaddd  ONEf(%rip), \XMM4, \XMM5
2035                vpaddd  ONEf(%rip), \XMM5, \XMM6
2036                vpaddd  ONEf(%rip), \XMM6, \XMM7
2037                vpaddd  ONEf(%rip), \XMM7, \XMM8
2038                vmovdqa \XMM8, \CTR
2039.endif
2040
2041
2042        #######################################################################
2043
2044                vmovdqu (arg1), \T1
2045                vpxor   \T1, \XMM1, \XMM1
2046                vpxor   \T1, \XMM2, \XMM2
2047                vpxor   \T1, \XMM3, \XMM3
2048                vpxor   \T1, \XMM4, \XMM4
2049                vpxor   \T1, \XMM5, \XMM5
2050                vpxor   \T1, \XMM6, \XMM6
2051                vpxor   \T1, \XMM7, \XMM7
2052                vpxor   \T1, \XMM8, \XMM8
2053
2054        #######################################################################
2055
2056
2057
2058
2059
2060                vmovdqu 16*1(arg1), \T1
2061                vaesenc \T1, \XMM1, \XMM1
2062                vaesenc \T1, \XMM2, \XMM2
2063                vaesenc \T1, \XMM3, \XMM3
2064                vaesenc \T1, \XMM4, \XMM4
2065                vaesenc \T1, \XMM5, \XMM5
2066                vaesenc \T1, \XMM6, \XMM6
2067                vaesenc \T1, \XMM7, \XMM7
2068                vaesenc \T1, \XMM8, \XMM8
2069
2070                vmovdqu 16*2(arg1), \T1
2071                vaesenc \T1, \XMM1, \XMM1
2072                vaesenc \T1, \XMM2, \XMM2
2073                vaesenc \T1, \XMM3, \XMM3
2074                vaesenc \T1, \XMM4, \XMM4
2075                vaesenc \T1, \XMM5, \XMM5
2076                vaesenc \T1, \XMM6, \XMM6
2077                vaesenc \T1, \XMM7, \XMM7
2078                vaesenc \T1, \XMM8, \XMM8
2079
2080
2081        #######################################################################
2082
2083        vmovdqa         HashKey_8(arg1), \T5
2084        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2085        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2086        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2087        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2088        vpxor           \T5, \T6, \T6
2089
2090                vmovdqu 16*3(arg1), \T1
2091                vaesenc \T1, \XMM1, \XMM1
2092                vaesenc \T1, \XMM2, \XMM2
2093                vaesenc \T1, \XMM3, \XMM3
2094                vaesenc \T1, \XMM4, \XMM4
2095                vaesenc \T1, \XMM5, \XMM5
2096                vaesenc \T1, \XMM6, \XMM6
2097                vaesenc \T1, \XMM7, \XMM7
2098                vaesenc \T1, \XMM8, \XMM8
2099
2100        vmovdqa         TMP2(%rsp), \T1
2101        vmovdqa         HashKey_7(arg1), \T5
2102        vpclmulqdq      $0x11, \T5, \T1, \T3
2103        vpxor           \T3, \T4, \T4
2104
2105        vpclmulqdq      $0x00, \T5, \T1, \T3
2106        vpxor           \T3, \T7, \T7
2107
2108        vpclmulqdq      $0x01, \T5, \T1, \T3
2109        vpxor           \T3, \T6, \T6
2110
2111        vpclmulqdq      $0x10, \T5, \T1, \T3
2112        vpxor           \T3, \T6, \T6
2113
2114                vmovdqu 16*4(arg1), \T1
2115                vaesenc \T1, \XMM1, \XMM1
2116                vaesenc \T1, \XMM2, \XMM2
2117                vaesenc \T1, \XMM3, \XMM3
2118                vaesenc \T1, \XMM4, \XMM4
2119                vaesenc \T1, \XMM5, \XMM5
2120                vaesenc \T1, \XMM6, \XMM6
2121                vaesenc \T1, \XMM7, \XMM7
2122                vaesenc \T1, \XMM8, \XMM8
2123
2124        #######################################################################
2125
2126        vmovdqa         TMP3(%rsp), \T1
2127        vmovdqa         HashKey_6(arg1), \T5
2128        vpclmulqdq      $0x11, \T5, \T1, \T3
2129        vpxor           \T3, \T4, \T4
2130
2131        vpclmulqdq      $0x00, \T5, \T1, \T3
2132        vpxor           \T3, \T7, \T7
2133
2134        vpclmulqdq      $0x01, \T5, \T1, \T3
2135        vpxor           \T3, \T6, \T6
2136
2137        vpclmulqdq      $0x10, \T5, \T1, \T3
2138        vpxor           \T3, \T6, \T6
2139
2140                vmovdqu 16*5(arg1), \T1
2141                vaesenc \T1, \XMM1, \XMM1
2142                vaesenc \T1, \XMM2, \XMM2
2143                vaesenc \T1, \XMM3, \XMM3
2144                vaesenc \T1, \XMM4, \XMM4
2145                vaesenc \T1, \XMM5, \XMM5
2146                vaesenc \T1, \XMM6, \XMM6
2147                vaesenc \T1, \XMM7, \XMM7
2148                vaesenc \T1, \XMM8, \XMM8
2149
2150        vmovdqa         TMP4(%rsp), \T1
2151        vmovdqa         HashKey_5(arg1), \T5
2152        vpclmulqdq      $0x11, \T5, \T1, \T3
2153        vpxor           \T3, \T4, \T4
2154
2155        vpclmulqdq      $0x00, \T5, \T1, \T3
2156        vpxor           \T3, \T7, \T7
2157
2158        vpclmulqdq      $0x01, \T5, \T1, \T3
2159        vpxor           \T3, \T6, \T6
2160
2161        vpclmulqdq      $0x10, \T5, \T1, \T3
2162        vpxor           \T3, \T6, \T6
2163
2164                vmovdqu 16*6(arg1), \T1
2165                vaesenc \T1, \XMM1, \XMM1
2166                vaesenc \T1, \XMM2, \XMM2
2167                vaesenc \T1, \XMM3, \XMM3
2168                vaesenc \T1, \XMM4, \XMM4
2169                vaesenc \T1, \XMM5, \XMM5
2170                vaesenc \T1, \XMM6, \XMM6
2171                vaesenc \T1, \XMM7, \XMM7
2172                vaesenc \T1, \XMM8, \XMM8
2173
2174
2175        vmovdqa         TMP5(%rsp), \T1
2176        vmovdqa         HashKey_4(arg1), \T5
2177        vpclmulqdq      $0x11, \T5, \T1, \T3
2178        vpxor           \T3, \T4, \T4
2179
2180        vpclmulqdq      $0x00, \T5, \T1, \T3
2181        vpxor           \T3, \T7, \T7
2182
2183        vpclmulqdq      $0x01, \T5, \T1, \T3
2184        vpxor           \T3, \T6, \T6
2185
2186        vpclmulqdq      $0x10, \T5, \T1, \T3
2187        vpxor           \T3, \T6, \T6
2188
2189                vmovdqu 16*7(arg1), \T1
2190                vaesenc \T1, \XMM1, \XMM1
2191                vaesenc \T1, \XMM2, \XMM2
2192                vaesenc \T1, \XMM3, \XMM3
2193                vaesenc \T1, \XMM4, \XMM4
2194                vaesenc \T1, \XMM5, \XMM5
2195                vaesenc \T1, \XMM6, \XMM6
2196                vaesenc \T1, \XMM7, \XMM7
2197                vaesenc \T1, \XMM8, \XMM8
2198
2199        vmovdqa         TMP6(%rsp), \T1
2200        vmovdqa         HashKey_3(arg1), \T5
2201        vpclmulqdq      $0x11, \T5, \T1, \T3
2202        vpxor           \T3, \T4, \T4
2203
2204        vpclmulqdq      $0x00, \T5, \T1, \T3
2205        vpxor           \T3, \T7, \T7
2206
2207        vpclmulqdq      $0x01, \T5, \T1, \T3
2208        vpxor           \T3, \T6, \T6
2209
2210        vpclmulqdq      $0x10, \T5, \T1, \T3
2211        vpxor           \T3, \T6, \T6
2212
2213                vmovdqu 16*8(arg1), \T1
2214                vaesenc \T1, \XMM1, \XMM1
2215                vaesenc \T1, \XMM2, \XMM2
2216                vaesenc \T1, \XMM3, \XMM3
2217                vaesenc \T1, \XMM4, \XMM4
2218                vaesenc \T1, \XMM5, \XMM5
2219                vaesenc \T1, \XMM6, \XMM6
2220                vaesenc \T1, \XMM7, \XMM7
2221                vaesenc \T1, \XMM8, \XMM8
2222
2223        vmovdqa         TMP7(%rsp), \T1
2224        vmovdqa         HashKey_2(arg1), \T5
2225        vpclmulqdq      $0x11, \T5, \T1, \T3
2226        vpxor           \T3, \T4, \T4
2227
2228        vpclmulqdq      $0x00, \T5, \T1, \T3
2229        vpxor           \T3, \T7, \T7
2230
2231        vpclmulqdq      $0x01, \T5, \T1, \T3
2232        vpxor           \T3, \T6, \T6
2233
2234        vpclmulqdq      $0x10, \T5, \T1, \T3
2235        vpxor           \T3, \T6, \T6
2236
2237
2238        #######################################################################
2239
2240                vmovdqu 16*9(arg1), \T5
2241                vaesenc \T5, \XMM1, \XMM1
2242                vaesenc \T5, \XMM2, \XMM2
2243                vaesenc \T5, \XMM3, \XMM3
2244                vaesenc \T5, \XMM4, \XMM4
2245                vaesenc \T5, \XMM5, \XMM5
2246                vaesenc \T5, \XMM6, \XMM6
2247                vaesenc \T5, \XMM7, \XMM7
2248                vaesenc \T5, \XMM8, \XMM8
2249
2250        vmovdqa         TMP8(%rsp), \T1
2251        vmovdqa         HashKey(arg1), \T5
2252
2253        vpclmulqdq      $0x00, \T5, \T1, \T3
2254        vpxor           \T3, \T7, \T7
2255
2256        vpclmulqdq      $0x01, \T5, \T1, \T3
2257        vpxor           \T3, \T6, \T6
2258
2259        vpclmulqdq      $0x10, \T5, \T1, \T3
2260        vpxor           \T3, \T6, \T6
2261
2262        vpclmulqdq      $0x11, \T5, \T1, \T3
2263        vpxor           \T3, \T4, \T1
2264
2265
2266                vmovdqu 16*10(arg1), \T5
2267
2268        i = 0
2269        j = 1
2270        setreg
2271.rep 8
2272                vpxor   16*i(arg3, %r11), \T5, \T2
2273                .if \ENC_DEC == ENC
2274                vaesenclast     \T2, reg_j, reg_j
2275                .else
2276                vaesenclast     \T2, reg_j, \T3
2277                vmovdqu 16*i(arg3, %r11), reg_j
2278                vmovdqu \T3, 16*i(arg2, %r11)
2279                .endif
2280        i = (i+1)
2281        j = (j+1)
2282        setreg
2283.endr
2284        #######################################################################
2285
2286
2287        vpslldq $8, \T6, \T3                            # shift-L T6 2 DWs (into T3)
2288        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
2289        vpxor   \T3, \T7, \T7
2290        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7
2291
2292
2293
2294        #######################################################################
2295        #first phase of the reduction
2296        vmovdqa         POLY2(%rip), \T3
2297
2298        vpclmulqdq      $0x01, \T7, \T3, \T2
2299        vpslldq         $8, \T2, \T2                    # shift-L xmm2 2 DWs
2300
2301        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
2302        #######################################################################
2303                .if \ENC_DEC == ENC
2304                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
2305                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
2306                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
2307                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
2308                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
2309                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
2310                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
2311                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
2312                .endif
2313
2314        #######################################################################
2315        #second phase of the reduction
2316        vpclmulqdq      $0x00, \T7, \T3, \T2
2317        vpsrldq         $4, \T2, \T2                    # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2318
2319        vpclmulqdq      $0x10, \T7, \T3, \T4
2320        vpslldq         $4, \T4, \T4                    # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2321
2322        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
2323        #######################################################################
2324        vpxor           \T4, \T1, \T1                   # the result is in T1
2325
2326                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
2327                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
2328                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
2329                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
2330                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
2331                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
2332                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
2333                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
2334
2335
2336        vpxor   \T1, \XMM1, \XMM1
2337
2338
2339
2340.endm
2341
2342
2343# GHASH the last 8 ciphertext blocks.
2344.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2345
2346        ## Karatsuba Method
2347
2348        vmovdqa         HashKey_8(arg1), \T5
2349
2350        vpshufd         $0b01001110, \XMM1, \T2
2351        vpshufd         $0b01001110, \T5, \T3
2352        vpxor           \XMM1, \T2, \T2
2353        vpxor           \T5, \T3, \T3
2354
2355        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2356        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2357
2358        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2359
2360        ######################
2361
2362        vmovdqa         HashKey_7(arg1), \T5
2363        vpshufd         $0b01001110, \XMM2, \T2
2364        vpshufd         $0b01001110, \T5, \T3
2365        vpxor           \XMM2, \T2, \T2
2366        vpxor           \T5, \T3, \T3
2367
2368        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2369        vpxor           \T4, \T6, \T6
2370
2371        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2372        vpxor           \T4, \T7, \T7
2373
2374        vpclmulqdq      $0x00, \T3, \T2, \T2
2375
2376        vpxor           \T2, \XMM1, \XMM1
2377
2378        ######################
2379
2380        vmovdqa         HashKey_6(arg1), \T5
2381        vpshufd         $0b01001110, \XMM3, \T2
2382        vpshufd         $0b01001110, \T5, \T3
2383        vpxor           \XMM3, \T2, \T2
2384        vpxor           \T5, \T3, \T3
2385
2386        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2387        vpxor           \T4, \T6, \T6
2388
2389        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2390        vpxor           \T4, \T7, \T7
2391
2392        vpclmulqdq      $0x00, \T3, \T2, \T2
2393
2394        vpxor           \T2, \XMM1, \XMM1
2395
2396        ######################
2397
2398        vmovdqa         HashKey_5(arg1), \T5
2399        vpshufd         $0b01001110, \XMM4, \T2
2400        vpshufd         $0b01001110, \T5, \T3
2401        vpxor           \XMM4, \T2, \T2
2402        vpxor           \T5, \T3, \T3
2403
2404        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2405        vpxor           \T4, \T6, \T6
2406
2407        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2408        vpxor           \T4, \T7, \T7
2409
2410        vpclmulqdq      $0x00, \T3, \T2, \T2
2411
2412        vpxor           \T2, \XMM1, \XMM1
2413
2414        ######################
2415
2416        vmovdqa         HashKey_4(arg1), \T5
2417        vpshufd         $0b01001110, \XMM5, \T2
2418        vpshufd         $0b01001110, \T5, \T3
2419        vpxor           \XMM5, \T2, \T2
2420        vpxor           \T5, \T3, \T3
2421
2422        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2423        vpxor           \T4, \T6, \T6
2424
2425        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2426        vpxor           \T4, \T7, \T7
2427
2428        vpclmulqdq      $0x00, \T3, \T2, \T2
2429
2430        vpxor           \T2, \XMM1, \XMM1
2431
2432        ######################
2433
2434        vmovdqa         HashKey_3(arg1), \T5
2435        vpshufd         $0b01001110, \XMM6, \T2
2436        vpshufd         $0b01001110, \T5, \T3
2437        vpxor           \XMM6, \T2, \T2
2438        vpxor           \T5, \T3, \T3
2439
2440        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2441        vpxor           \T4, \T6, \T6
2442
2443        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2444        vpxor           \T4, \T7, \T7
2445
2446        vpclmulqdq      $0x00, \T3, \T2, \T2
2447
2448        vpxor           \T2, \XMM1, \XMM1
2449
2450        ######################
2451
2452        vmovdqa         HashKey_2(arg1), \T5
2453        vpshufd         $0b01001110, \XMM7, \T2
2454        vpshufd         $0b01001110, \T5, \T3
2455        vpxor           \XMM7, \T2, \T2
2456        vpxor           \T5, \T3, \T3
2457
2458        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2459        vpxor           \T4, \T6, \T6
2460
2461        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2462        vpxor           \T4, \T7, \T7
2463
2464        vpclmulqdq      $0x00, \T3, \T2, \T2
2465
2466        vpxor           \T2, \XMM1, \XMM1
2467
2468        ######################
2469
2470        vmovdqa         HashKey(arg1), \T5
2471        vpshufd         $0b01001110, \XMM8, \T2
2472        vpshufd         $0b01001110, \T5, \T3
2473        vpxor           \XMM8, \T2, \T2
2474        vpxor           \T5, \T3, \T3
2475
2476        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2477        vpxor           \T4, \T6, \T6
2478
2479        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2480        vpxor           \T4, \T7, \T7
2481
2482        vpclmulqdq      $0x00, \T3, \T2, \T2
2483
2484        vpxor           \T2, \XMM1, \XMM1
2485        vpxor           \T6, \XMM1, \XMM1
2486        vpxor           \T7, \XMM1, \T2
2487
2488
2489
2490
2491        vpslldq $8, \T2, \T4
2492        vpsrldq $8, \T2, \T2
2493
2494        vpxor   \T4, \T7, \T7
2495        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2496                                                   # accumulated carry-less multiplications
2497
2498        #######################################################################
2499        #first phase of the reduction
2500        vmovdqa         POLY2(%rip), \T3
2501
2502        vpclmulqdq      $0x01, \T7, \T3, \T2
2503        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2504
2505        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2506        #######################################################################
2507
2508
2509        #second phase of the reduction
2510        vpclmulqdq      $0x00, \T7, \T3, \T2
2511        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2512
2513        vpclmulqdq      $0x10, \T7, \T3, \T4
2514        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2515
2516        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2517        #######################################################################
2518        vpxor           \T4, \T6, \T6              # the result is in T6
2519.endm
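###############################################################################
# Illustrative C reference (kept entirely in comments, not assembled) for the
# Karatsuba step used per block above: with A = A1:A0 and B = B1:B0 split into
# 64-bit halves, the 128x128 carry-less product needs only three 64x64
# multiplies, because
#
#       A*B = A1*B1*x^128 ^ (A1*B0 ^ A0*B1)*x^64 ^ A0*B0
#       A1*B0 ^ A0*B1 = (A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0
#
# The clmul64() helper below is a bit-serial stand-in for vpclmulqdq; the
# little-endian 64-bit word layout is an assumption of this sketch.
#
#       static void clmul64(u64 a, u64 b, u64 *hi, u64 *lo)
#       {
#               u64 h = 0, l = 0;
#               int i;
#
#               for (i = 0; i < 64; i++)
#                       if ((b >> i) & 1) {
#                               l ^= a << i;
#                               if (i)
#                                       h ^= a >> (64 - i);
#                       }
#               *hi = h;
#               *lo = l;
#       }
#
#       static void clmul128_karatsuba(const u64 a[2], const u64 b[2], u64 r[4])
#       {
#               u64 hh[2], ll[2], mm[2];
#
#               clmul64(a[1], b[1], &hh[1], &hh[0]);            /* A1*B1 */
#               clmul64(a[0], b[0], &ll[1], &ll[0]);            /* A0*B0 */
#               clmul64(a[1] ^ a[0], b[1] ^ b[0], &mm[1], &mm[0]);
#               mm[0] ^= hh[0] ^ ll[0];                         /* A1*B0 ^ A0*B1 */
#               mm[1] ^= hh[1] ^ ll[1];
#               r[0] = ll[0];
#               r[1] = ll[1] ^ mm[0];
#               r[2] = hh[0] ^ mm[1];
#               r[3] = hh[1];
#       }
###############################################################################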
2520
2521
2522
2523# combined for GCM encrypt and decrypt functions
2524# clobbering all xmm registers
2525# clobbering r10, r11, r12, r13, r14, r15
2526.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2527
2528        #the number of pushes must equal STACK_OFFSET
2529        push    %r12
2530        push    %r13
2531        push    %r14
2532        push    %r15
2533
2534        mov     %rsp, %r14
2535
2536
2537
2538
2539        sub     $VARIABLE_OFFSET, %rsp
2540        and     $~63, %rsp                         # align rsp to 64 bytes
2541
2542
2543        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2544
2545        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2546        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2547
2548        mov     %r13, %r12
2549        shr     $4, %r12
2550        and     $7, %r12
2551        jz      _initial_num_blocks_is_0\@
2552
2553        cmp     $7, %r12
2554        je      _initial_num_blocks_is_7\@
2555        cmp     $6, %r12
2556        je      _initial_num_blocks_is_6\@
2557        cmp     $5, %r12
2558        je      _initial_num_blocks_is_5\@
2559        cmp     $4, %r12
2560        je      _initial_num_blocks_is_4\@
2561        cmp     $3, %r12
2562        je      _initial_num_blocks_is_3\@
2563        cmp     $2, %r12
2564        je      _initial_num_blocks_is_2\@
2565
2566        jmp     _initial_num_blocks_is_1\@
2567
2568_initial_num_blocks_is_7\@:
2569        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2570        sub     $16*7, %r13
2571        jmp     _initial_blocks_encrypted\@
2572
2573_initial_num_blocks_is_6\@:
2574        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2575        sub     $16*6, %r13
2576        jmp     _initial_blocks_encrypted\@
2577
2578_initial_num_blocks_is_5\@:
2579        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2580        sub     $16*5, %r13
2581        jmp     _initial_blocks_encrypted\@
2582
2583_initial_num_blocks_is_4\@:
2584        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2585        sub     $16*4, %r13
2586        jmp     _initial_blocks_encrypted\@
2587
2588_initial_num_blocks_is_3\@:
2589        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2590        sub     $16*3, %r13
2591        jmp     _initial_blocks_encrypted\@
2592
2593_initial_num_blocks_is_2\@:
2594        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2595        sub     $16*2, %r13
2596        jmp     _initial_blocks_encrypted\@
2597
2598_initial_num_blocks_is_1\@:
2599        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2600        sub     $16*1, %r13
2601        jmp     _initial_blocks_encrypted\@
2602
2603_initial_num_blocks_is_0\@:
2604        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2605
2606
2607_initial_blocks_encrypted\@:
2608        cmp     $0, %r13
2609        je      _zero_cipher_left\@
2610
2611        sub     $128, %r13
2612        je      _eight_cipher_left\@
2613
2614
2615
2616
2617        vmovd   %xmm9, %r15d
2618        and     $255, %r15d
2619        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2620
2621
2622_encrypt_by_8_new\@:
2623        cmp     $(255-8), %r15d
2624        jg      _encrypt_by_8\@
2625
2626
2627
2628        add     $8, %r15b
2629        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2630        add     $128, %r11
2631        sub     $128, %r13
2632        jne     _encrypt_by_8_new\@
2633
2634        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2635        jmp     _eight_cipher_left\@
2636
2637_encrypt_by_8\@:
2638        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2639        add     $8, %r15b
2640        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2641        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2642        add     $128, %r11
2643        sub     $128, %r13
2644        jne     _encrypt_by_8_new\@
2645
2646        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2647
2648
2649
2650
2651_eight_cipher_left\@:
2652        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2653
2654
2655_zero_cipher_left\@:
2656        cmp     $16, arg4
2657        jl      _only_less_than_16\@
2658
2659        mov     arg4, %r13
2660        and     $15, %r13                            # r13 = (arg4 mod 16)
2661
2662        je      _multiple_of_16_bytes\@
2663
2664        # handle the last <16 Byte block separately
2665
2666
2667        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2668        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2669        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2670
2671        sub     $16, %r11
2672        add     %r13, %r11
2673        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2674
2675        lea     SHIFT_MASK+16(%rip), %r12
2676        sub     %r13, %r12                           # adjust the shuffle mask pointer
2677                                                     # to be able to shift 16-r13 bytes
2678                                                     # (r13 is the number of bytes in plaintext mod 16)
2679        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2680        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2681        jmp     _final_ghash_mul\@
2682
2683_only_less_than_16\@:
2684        # check for 0 length
2685        mov     arg4, %r13
2686        and     $15, %r13                            # r13 = (arg4 mod 16)
2687
2688        je      _multiple_of_16_bytes\@
2689
2690        # handle the last <16 Byte block separately
2691
2692
2693        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2694        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2695        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2696
2697
2698        lea     SHIFT_MASK+16(%rip), %r12
2699        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2700                                                     # able to shift 16-r13 bytes (r13 is the
2701                                                     # number of bytes in plaintext mod 16)
2702
2703_get_last_16_byte_loop\@:
2704        movb    (arg3, %r11),  %al
2705        movb    %al,  TMP1 (%rsp , %r11)
2706        add     $1, %r11
2707        cmp     %r13,  %r11
2708        jne     _get_last_16_byte_loop\@
2709
2710        vmovdqu  TMP1(%rsp), %xmm1
2711
2712        sub     $16, %r11
2713
2714_final_ghash_mul\@:
2715        .if  \ENC_DEC ==  DEC
2716        vmovdqa %xmm1, %xmm2
2717        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2718        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2719        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2720        vpand   %xmm1, %xmm2, %xmm2
2721        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2722        vpxor   %xmm2, %xmm14, %xmm14
2723        #GHASH computation for the last <16 Byte block
2724        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2725        sub     %r13, %r11
2726        add     $16, %r11
2727        .else
2728        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2729        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2730        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2731        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2732        vpxor   %xmm9, %xmm14, %xmm14
2733        #GHASH computation for the last <16 Byte block
2734        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2735        sub     %r13, %r11
2736        add     $16, %r11
2737        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2738        .endif
2739
2740
2741        #############################
2742        # output r13 Bytes
2743        vmovq   %xmm9, %rax
2744        cmp     $8, %r13
2745        jle     _less_than_8_bytes_left\@
2746
2747        mov     %rax, (arg2 , %r11)
2748        add     $8, %r11
2749        vpsrldq $8, %xmm9, %xmm9
2750        vmovq   %xmm9, %rax
2751        sub     $8, %r13
2752
2753_less_than_8_bytes_left\@:
2754        movb    %al, (arg2 , %r11)
2755        add     $1, %r11
2756        shr     $8, %rax
2757        sub     $1, %r13
2758        jne     _less_than_8_bytes_left\@
2759        #############################
2760
2761_multiple_of_16_bytes\@:
2762        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2763        shl     $3, %r12                             # convert into number of bits
2764        vmovd   %r12d, %xmm15                        # len(A) in xmm15
2765
2766        shl     $3, arg4                             # len(C) in bits  (*8)
2767        vmovq   arg4, %xmm1
2768        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2769        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
2770
2771        vpxor   %xmm15, %xmm14, %xmm14
2772        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2773        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2774
2775        mov     arg5, %rax                           # rax = *Y0
2776        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2777
2778        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2779
2780        vpxor   %xmm14, %xmm9, %xmm9



_return_T\@:
        mov     arg8, %r10              # r10 = authTag
        mov     arg9, %r11              # r11 = auth_tag_len

        # copy the requested number of tag bytes (auth_tag_len) from xmm9 to auth_tag
        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jl      _T_4\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        cmp     $0, %r11
        je     _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq     $4, %xmm9, %xmm9
        cmp     $0, %r11
        je     _return_T_done\@
_T_123\@:
        vmovd     %xmm9, %eax
        cmp     $2, %r11
        jl     _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je     _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm


#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#        u8     *hash_subkey)# /* H, the Hash sub key input.
#                               Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14



        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                    # align rsp to 64 bytes

        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6
        vpsrlq   $63, %xmm2, %xmm2
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2
        vpsrldq  $8, %xmm1, %xmm1
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
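        #######################################################################
        # For reference only: a minimal C sketch of the doubling above.  This
        # is not kernel code; the u128 layout and the 0xC2...01 reduction
        # constant are assumptions matching the byte-swapped HashKey and the
        # POLY constant used in this file (the asm performs the reduction
        # branchlessly with vpcmpeqd/vpand instead of the "if"):
        #
        #     struct u128 { unsigned long long hi, lo; };
        #
        #     static struct u128 hashkey_times_2(struct u128 h)
        #     {
        #             unsigned long long carry = h.hi >> 63;
        #
        #             h.hi = (h.hi << 1) | (h.lo >> 63);  /* 128-bit shift left by 1   */
        #             h.lo <<= 1;
        #             if (carry) {                        /* reduce mod the GHASH poly */
        #                     h.hi ^= 0xC200000000000000ULL;
        #                     h.lo ^= 1ULL;
        #             }
        #             return h;
        #     }
        #######################################################################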


        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
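
#############################################################
# Illustrative usage sketch (C), based only on the prototype
# comment above; this is not the actual glue code in
# aesni-intel_glue.c, and ctx_data/hash_subkey are placeholder
# names.  Per the GCM spec, hash_subkey must already hold
# H = E(K, 0^128) before this routine is called:
#
#     void aesni_gcm_precomp_avx_gen4(gcm_data *my_ctx_data,
#                                     u8 *hash_subkey);
#
#     /* once per key, before any encrypt/decrypt call: */
#     aesni_gcm_precomp_avx_gen4(ctx_data, hash_subkey);
#############################################################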


###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC_AVX2     ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)
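
###############################################################################
# Illustrative usage sketch (C) for the encrypt entry point above, based only
# on the parameter comments; the variable names and the __aligned attribute
# are placeholders/assumptions, not taken from the real callers:
#
#     u8 j0[16] __aligned(16);               /* pre-counter block              */
#
#     memcpy(j0, salt, 4);                   /* 4-byte salt from the SA        */
#     memcpy(j0 + 4, esp_iv, 8);             /* 8-byte IV from the ESP payload */
#     *(__be32 *)(j0 + 12) = cpu_to_be32(1); /* trailing 0x00000001            */
#
#     aesni_gcm_enc_avx_gen4(my_ctx_data, dst, src, src_len,
#                            j0, aad, aad_len, auth_tag, 16);
###############################################################################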

###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC_AVX2     DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)
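
###############################################################################
# Illustrative decrypt-side sketch (C).  Tag verification is up to the caller;
# the constant-time compare below is an assumption about typical usage, not a
# description of the actual glue code:
#
#     u8 computed_tag[16];
#
#     aesni_gcm_dec_avx_gen4(my_ctx_data, dst, src, src_len,
#                            j0, aad, aad_len, computed_tag, 16);
#     if (crypto_memneq(computed_tag, received_tag, 16))
#             return -EBADMSG;               /* authentication failed */
###############################################################################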

#endif /* CONFIG_AS_AVX2 */
