   1########################################################################
   2# Copyright (c) 2013, Intel Corporation
   3#
   4# This software is available to you under a choice of one of two
   5# licenses.  You may choose to be licensed under the terms of the GNU
   6# General Public License (GPL) Version 2, available from the file
   7# COPYING in the main directory of this source tree, or the
   8# OpenIB.org BSD license below:
   9#
  10# Redistribution and use in source and binary forms, with or without
  11# modification, are permitted provided that the following conditions are
  12# met:
  13#
  14# * Redistributions of source code must retain the above copyright
  15#   notice, this list of conditions and the following disclaimer.
  16#
  17# * Redistributions in binary form must reproduce the above copyright
  18#   notice, this list of conditions and the following disclaimer in the
  19#   documentation and/or other materials provided with the
  20#   distribution.
  21#
  22# * Neither the name of the Intel Corporation nor the names of its
  23#   contributors may be used to endorse or promote products derived from
  24#   this software without specific prior written permission.
  25#
  26#
   27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
  28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   34# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38########################################################################
  39##
  40## Authors:
  41##      Erdinc Ozturk <erdinc.ozturk@intel.com>
  42##      Vinodh Gopal <vinodh.gopal@intel.com>
  43##      James Guilford <james.guilford@intel.com>
  44##      Tim Chen <tim.c.chen@linux.intel.com>
  45##
  46## References:
   47##       This code was derived and highly optimized from the code described in the paper:
   48##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
   49##                      on Intel Architecture Processors. August, 2010
   50##       The details of the implementation are explained in:
   51##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
   52##                      on Intel Architecture Processors. October, 2012.
  53##
  54## Assumptions:
  55##
  56##
  57##
  58## iv:
  59##       0                   1                   2                   3
  60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62##       |                             Salt  (From the SA)               |
  63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64##       |                     Initialization Vector                     |
  65##       |         (This is the sequence number from IPSec header)       |
  66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67##       |                              0x1                              |
  68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69##
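##
##       Worked example (hypothetical values, for illustration only): with a
##       4-byte salt of 0x11223344 and an 8-byte IPsec sequence number of
##       0x0102030405060708, the 16-byte counter block Y0 is assembled as
##
##             11223344 01020304 05060708 00000001
##
##       i.e. salt || IV || 0x00000001, matching the layout above.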
  70##
  71##
  72## AAD:
  73##       AAD padded to 128 bits with 0
  74##       for example, assume AAD is a u32 vector
  75##
  76##       if AAD is 8 bytes:
   77##       AAD[2] = {A0, A1}
  78##       padded AAD in xmm register = {A1 A0 0 0}
  79##
  80##       0                   1                   2                   3
  81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83##       |                               SPI (A1)                        |
  84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85##       |                     32-bit Sequence Number (A0)               |
  86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87##       |                              0x0                              |
  88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89##
  90##                                       AAD Format with 32-bit Sequence Number
  91##
  92##       if AAD is 12 bytes:
  93##       AAD[3] = {A0, A1, A2}#
  94##       padded AAD in xmm register = {A2 A1 A0 0}
  95##
  96##       0                   1                   2                   3
  97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99##       |                               SPI (A2)                        |
 100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 101##       |                 64-bit Extended Sequence Number {A1,A0}       |
 102##       |                                                               |
 103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 104##       |                              0x0                              |
 105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 106##
 107##        AAD Format with 64-bit Extended Sequence Number
 108##
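##       Worked example (hypothetical values, for illustration only): a
##       12-byte AAD with SPI = 0x00000105 and 64-bit extended sequence
##       number 0x0000000000000001 is zero-padded to 16 bytes as
##
##             00000105 00000000 00000001 00000000
##
##       before being byte-reflected (vpshufb SHUF_MASK) and folded into the
##       GHASH state.
##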
 109##
 110## aadLen:
 111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
 112##       The code additionally supports aadLen of length 16 bytes.
 113##
 114## TLen:
 115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 116##
 117## poly = x^128 + x^127 + x^126 + x^121 + 1
  118## throughout the code, one-tab and two-tab indentations are used: one tab is
  119## for the GHASH part, two tabs are for the AES part.
 120##
 121
 122#include <linux/linkage.h>
 123#include <asm/inst.h>
 124
 125# constants in mergeable sections, linker can reorder and merge
 126.section        .rodata.cst16.POLY, "aM", @progbits, 16
 127.align 16
 128POLY:            .octa     0xC2000000000000000000000000000001
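# A quick cross-check of the constant above: the code works on bit-reflected
# values, so for poly = x^128 + x^127 + x^126 + x^121 + 1 the x^128 term is
# implicit and the remaining terms set bits 127, 126 and 121 (top byte 0xC2)
# plus bit 0 (the trailing 0x01).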
 129
 130.section        .rodata.cst16.POLY2, "aM", @progbits, 16
 131.align 16
 132POLY2:           .octa     0xC20000000000000000000001C2000000
 133
 134.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
 135.align 16
 136TWOONE:          .octa     0x00000001000000000000000000000001
 137
 138.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
 139.align 16
 140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
 141
 142.section        .rodata.cst16.ONE, "aM", @progbits, 16
 143.align 16
 144ONE:             .octa     0x00000000000000000000000000000001
 145
 146.section        .rodata.cst16.ONEf, "aM", @progbits, 16
 147.align 16
 148ONEf:            .octa     0x01000000000000000000000000000000
 149
 150# order of these constants should not change.
 151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
 152.section        .rodata, "a", @progbits
 153.align 16
 154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
 155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
 156                 .octa     0x00000000000000000000000000000000
 157
 158.section .rodata
 159.align 16
 160.type aad_shift_arr, @object
 161.size aad_shift_arr, 272
 162aad_shift_arr:
 163        .octa     0xffffffffffffffffffffffffffffffff
 164        .octa     0xffffffffffffffffffffffffffffff0C
 165        .octa     0xffffffffffffffffffffffffffff0D0C
 166        .octa     0xffffffffffffffffffffffffff0E0D0C
 167        .octa     0xffffffffffffffffffffffff0F0E0D0C
 168        .octa     0xffffffffffffffffffffff0C0B0A0908
 169        .octa     0xffffffffffffffffffff0D0C0B0A0908
 170        .octa     0xffffffffffffffffff0E0D0C0B0A0908
 171        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
 172        .octa     0xffffffffffffff0C0B0A090807060504
 173        .octa     0xffffffffffff0D0C0B0A090807060504
 174        .octa     0xffffffffff0E0D0C0B0A090807060504
 175        .octa     0xffffffff0F0E0D0C0B0A090807060504
 176        .octa     0xffffff0C0B0A09080706050403020100
 177        .octa     0xffff0D0C0B0A09080706050403020100
 178        .octa     0xff0E0D0C0B0A09080706050403020100
 179        .octa     0x0F0E0D0C0B0A09080706050403020100
 180
 181
 182.text
 183
 184
 185##define the fields of the gcm aes context
 186#{
 187#        u8 expanded_keys[16*11] store expanded keys
 188#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
 189#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
 190#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
 191#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
 192#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
 193#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
 194#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
 195#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
 196#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
 197#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 198#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 199#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 200#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 201#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 202#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 203#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 204#} gcm_ctx#
 205
 206HashKey        = 16*11   # store HashKey <<1 mod poly here
 207HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
 208HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
 209HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
 210HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
 211HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
 212HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
 213HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
 214HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
 215HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 216HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 217HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 218HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 219HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 220HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 221HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
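# For example (purely illustrative): with the 11 AES-128 round keys occupying
# bytes 0..(16*11 - 1) of the context, HashKey^3<<1 mod poly is read from byte
# offset HashKey_3 = 16*13 = 208 and its Karatsuba helper from
# HashKey_3_k = 16*21 = 336.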
 222
 223#define arg1 %rdi
 224#define arg2 %rsi
 225#define arg3 %rdx
 226#define arg4 %rcx
 227#define arg5 %r8
 228#define arg6 %r9
 229#define arg7 STACK_OFFSET+8*1(%r14)
 230#define arg8 STACK_OFFSET+8*2(%r14)
 231#define arg9 STACK_OFFSET+8*3(%r14)
 232
 233i = 0
 234j = 0
 235
 236out_order = 0
 237in_order = 1
 238DEC = 0
 239ENC = 1
 240
 241.macro define_reg r n
 242reg_\r = %xmm\n
 243.endm
 244
 245.macro setreg
 246.altmacro
 247define_reg i %i
 248define_reg j %j
 249.noaltmacro
 250.endm
 251
  252# 4 registers are pushed onto the stack; STACK_OFFSET accounts for them when addressing the stack args (arg7..arg9) via %r14
 253STACK_OFFSET = 8*4
 254
 255TMP1 =   16*0    # Temporary storage for AAD
 256TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
 257TMP3 =   16*2    # Temporary storage for AES State 3
 258TMP4 =   16*3    # Temporary storage for AES State 4
 259TMP5 =   16*4    # Temporary storage for AES State 5
 260TMP6 =   16*5    # Temporary storage for AES State 6
 261TMP7 =   16*6    # Temporary storage for AES State 7
 262TMP8 =   16*7    # Temporary storage for AES State 8
 263
 264VARIABLE_OFFSET = 16*8
 265
 266################################
 267# Utility Macros
 268################################
 269
 270# Encryption of a single block
 271.macro ENCRYPT_SINGLE_BLOCK XMM0
 272                vpxor    (arg1), \XMM0, \XMM0
 273                i = 1
 274                setreg
 275.rep 9
 276                vaesenc  16*i(arg1), \XMM0, \XMM0
 277                i = (i+1)
 278                setreg
 279.endr
 280                vaesenclast 16*10(arg1), \XMM0, \XMM0
 281.endm
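# A C-style sketch of what ENCRYPT_SINGLE_BLOCK does (AES-128 only; an
# illustrative sketch using <immintrin.h> intrinsics, not part of the build):
#
#       static __m128i aes128_encrypt_block(const __m128i *rk, __m128i state)
#       {
#               int i;
#
#               state = _mm_xor_si128(state, rk[0]);        /* key whitening */
#               for (i = 1; i < 10; i++)
#                       state = _mm_aesenc_si128(state, rk[i]);
#               return _mm_aesenclast_si128(state, rk[10]); /* last round    */
#       }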
 282
 283#ifdef CONFIG_AS_AVX
 284###############################################################################
 285# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 286# Input: A and B (128-bits each, bit-reflected)
 287# Output: C = A*B*x mod poly, (i.e. >>1 )
 288# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 289# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 290###############################################################################
 291.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 292
 293        vpshufd         $0b01001110, \GH, \T2
 294        vpshufd         $0b01001110, \HK, \T3
 295        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
 296        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 297
 298        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
 299        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
 300        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
 301        vpxor           \GH, \T2,\T2
 302        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 303
 304        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
 305        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
 306        vpxor           \T3, \GH, \GH
 307        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 308
 309        #first phase of the reduction
 310        vpslld  $31, \GH, \T2                   # packed right shifting << 31
  311        vpslld  $30, \GH, \T3                   # packed right shifting << 30
  312        vpslld  $25, \GH, \T4                   # packed right shifting << 25
 313
 314        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 315        vpxor   \T4, \T2, \T2
 316
 317        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 318
 319        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
 320        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 321
 322        #second phase of the reduction
 323
 324        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
 325        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
 326        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
 327        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 328        vpxor   \T4, \T2, \T2
 329
 330        vpxor   \T5, \T2, \T2
 331        vpxor   \T2, \GH, \GH
 332        vpxor   \T1, \GH, \GH                   # the result is in GH
 333
 334
 335.endm
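# A minimal C-style sketch of the Karatsuba step performed by GHASH_MUL_AVX
# (clmul() stands for a 64x64 -> 128-bit carry-less multiply; the names are
# illustrative only):
#
#       hi  = clmul(a1, b1);                      /* T1 = a1*b1          */
#       lo  = clmul(a0, b0);                      /* GH = a0*b0          */
#       mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo;  /* T2 = a0*b1 + a1*b0  */
#
# mid is then split across <hi:lo> and the 256-bit product is reduced mod poly
# in the two "phase of the reduction" blocks above.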
 336
 337.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 338
  339        # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
 340        vmovdqa  \HK, \T5
 341
 342        vpshufd  $0b01001110, \T5, \T1
 343        vpxor    \T5, \T1, \T1
 344        vmovdqa  \T1, HashKey_k(arg1)
 345
 346        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
 347        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
 348        vpshufd  $0b01001110, \T5, \T1
 349        vpxor    \T5, \T1, \T1
 350        vmovdqa  \T1, HashKey_2_k(arg1)
 351
 352        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
 353        vmovdqa  \T5, HashKey_3(arg1)
 354        vpshufd  $0b01001110, \T5, \T1
 355        vpxor    \T5, \T1, \T1
 356        vmovdqa  \T1, HashKey_3_k(arg1)
 357
 358        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
 359        vmovdqa  \T5, HashKey_4(arg1)
 360        vpshufd  $0b01001110, \T5, \T1
 361        vpxor    \T5, \T1, \T1
 362        vmovdqa  \T1, HashKey_4_k(arg1)
 363
 364        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
 365        vmovdqa  \T5, HashKey_5(arg1)
 366        vpshufd  $0b01001110, \T5, \T1
 367        vpxor    \T5, \T1, \T1
 368        vmovdqa  \T1, HashKey_5_k(arg1)
 369
 370        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
 371        vmovdqa  \T5, HashKey_6(arg1)
 372        vpshufd  $0b01001110, \T5, \T1
 373        vpxor    \T5, \T1, \T1
 374        vmovdqa  \T1, HashKey_6_k(arg1)
 375
 376        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
 377        vmovdqa  \T5, HashKey_7(arg1)
 378        vpshufd  $0b01001110, \T5, \T1
 379        vpxor    \T5, \T1, \T1
 380        vmovdqa  \T1, HashKey_7_k(arg1)
 381
 382        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
 383        vmovdqa  \T5, HashKey_8(arg1)
 384        vpshufd  $0b01001110, \T5, \T1
 385        vpxor    \T5, \T1, \T1
 386        vmovdqa  \T1, HashKey_8_k(arg1)
 387
 388.endm
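# In C-like terms the precompute above amounts to (illustrative sketch only;
# ghash_mul/hi64/lo64 are not real helpers in this file):
#
#       hkey_k[1] = hi64(hkey[1]) ^ lo64(hkey[1]);
#       for (i = 2; i <= 8; i++) {
#               hkey[i]   = ghash_mul(hkey[i - 1], hkey[1]); /* H^i<<1 mod poly  */
#               hkey_k[i] = hi64(hkey[i]) ^ lo64(hkey[i]);   /* Karatsuba helper */
#       }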
 389
 390## if a = number of total plaintext bytes
 391## b = floor(a/16)
  392## num_initial_blocks = b mod 8
 393## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 394## r10, r11, r12, rax are clobbered
 395## arg1, arg2, arg3, r14 are used as a pointer only, not modified
 396
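## For example (hypothetical size): with a = 192 bytes of plaintext,
## b = 192/16 = 12 blocks and num_initial_blocks = 12 mod 8 = 4, so 4 blocks
## are handled here and the remaining 8 go through the 8-way parallel loop.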
 397.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 398        i = (8-\num_initial_blocks)
 399        j = 0
 400        setreg
 401
 402        mov     arg6, %r10                      # r10 = AAD
 403        mov     arg7, %r12                      # r12 = aadLen
 404
 405
 406        mov     %r12, %r11
 407
 408        vpxor   reg_j, reg_j, reg_j
 409        vpxor   reg_i, reg_i, reg_i
 410        cmp     $16, %r11
 411        jl      _get_AAD_rest8\@
 412_get_AAD_blocks\@:
 413        vmovdqu (%r10), reg_i
 414        vpshufb SHUF_MASK(%rip), reg_i, reg_i
 415        vpxor   reg_i, reg_j, reg_j
 416        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
 417        add     $16, %r10
 418        sub     $16, %r12
 419        sub     $16, %r11
 420        cmp     $16, %r11
 421        jge     _get_AAD_blocks\@
 422        vmovdqu reg_j, reg_i
 423        cmp     $0, %r11
 424        je      _get_AAD_done\@
 425
 426        vpxor   reg_i, reg_i, reg_i
 427
 428        /* read the last <16B of AAD. since we have at least 4B of
 429        data right after the AAD (the ICV, and maybe some CT), we can
 430        read 4B/8B blocks safely, and then get rid of the extra stuff */
 431_get_AAD_rest8\@:
 432        cmp     $4, %r11
 433        jle     _get_AAD_rest4\@
 434        movq    (%r10), \T1
 435        add     $8, %r10
 436        sub     $8, %r11
 437        vpslldq $8, \T1, \T1
 438        vpsrldq $8, reg_i, reg_i
 439        vpxor   \T1, reg_i, reg_i
 440        jmp     _get_AAD_rest8\@
 441_get_AAD_rest4\@:
 442        cmp     $0, %r11
 443        jle      _get_AAD_rest0\@
 444        mov     (%r10), %eax
 445        movq    %rax, \T1
 446        add     $4, %r10
 447        sub     $4, %r11
 448        vpslldq $12, \T1, \T1
 449        vpsrldq $4, reg_i, reg_i
 450        vpxor   \T1, reg_i, reg_i
 451_get_AAD_rest0\@:
 452        /* finalize: shift out the extra bytes we read, and align
 453        left. since pslldq can only shift by an immediate, we use
 454        vpshufb and an array of shuffle masks */
 455        movq    %r12, %r11
 456        salq    $4, %r11
 457        movdqu  aad_shift_arr(%r11), \T1
 458        vpshufb \T1, reg_i, reg_i
 459_get_AAD_rest_final\@:
 460        vpshufb SHUF_MASK(%rip), reg_i, reg_i
 461        vpxor   reg_j, reg_i, reg_i
 462        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 463
 464_get_AAD_done\@:
 465        # initialize the data pointer offset as zero
 466        xor     %r11, %r11
 467
 468        # start AES for num_initial_blocks blocks
 469        mov     arg5, %rax                     # rax = *Y0
 470        vmovdqu (%rax), \CTR                   # CTR = Y0
 471        vpshufb SHUF_MASK(%rip), \CTR, \CTR
 472
 473
 474        i = (9-\num_initial_blocks)
 475        setreg
 476.rep \num_initial_blocks
 477                vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
 478                vmovdqa \CTR, reg_i
 479                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
 480        i = (i+1)
 481        setreg
 482.endr
 483
 484        vmovdqa  (arg1), \T_key
 485        i = (9-\num_initial_blocks)
 486        setreg
 487.rep \num_initial_blocks
 488                vpxor   \T_key, reg_i, reg_i
 489        i = (i+1)
 490        setreg
 491.endr
 492
 493        j = 1
 494        setreg
 495.rep 9
 496        vmovdqa  16*j(arg1), \T_key
 497        i = (9-\num_initial_blocks)
 498        setreg
 499.rep \num_initial_blocks
 500        vaesenc \T_key, reg_i, reg_i
 501        i = (i+1)
 502        setreg
 503.endr
 504
 505        j = (j+1)
 506        setreg
 507.endr
 508
 509
 510        vmovdqa  16*10(arg1), \T_key
 511        i = (9-\num_initial_blocks)
 512        setreg
 513.rep \num_initial_blocks
 514        vaesenclast      \T_key, reg_i, reg_i
 515        i = (i+1)
 516        setreg
 517.endr
 518
 519        i = (9-\num_initial_blocks)
 520        setreg
 521.rep \num_initial_blocks
 522                vmovdqu (arg3, %r11), \T1
 523                vpxor   \T1, reg_i, reg_i
 524                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
 525                add     $16, %r11
 526.if  \ENC_DEC == DEC
 527                vmovdqa \T1, reg_i
 528.endif
 529                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
 530        i = (i+1)
 531        setreg
 532.endr
 533
 534
 535        i = (8-\num_initial_blocks)
 536        j = (9-\num_initial_blocks)
 537        setreg
 538
 539.rep \num_initial_blocks
 540        vpxor    reg_i, reg_j, reg_j
 541        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
 542        i = (i+1)
 543        j = (j+1)
 544        setreg
 545.endr
 546        # XMM8 has the combined result here
 547
 548        vmovdqa  \XMM8, TMP1(%rsp)
 549        vmovdqa  \XMM8, \T3
 550
 551        cmp     $128, %r13
 552        jl      _initial_blocks_done\@                  # no need for precomputed constants
 553
 554###############################################################################
  555# at least 128 bytes remain: prepare 8 counter blocks and encrypt the next 8 blocks of data
 556                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 557                vmovdqa  \CTR, \XMM1
 558                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 559
 560                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 561                vmovdqa  \CTR, \XMM2
 562                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
 563
 564                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 565                vmovdqa  \CTR, \XMM3
 566                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
 567
 568                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 569                vmovdqa  \CTR, \XMM4
 570                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
 571
 572                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 573                vmovdqa  \CTR, \XMM5
 574                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
 575
 576                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 577                vmovdqa  \CTR, \XMM6
 578                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
 579
 580                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 581                vmovdqa  \CTR, \XMM7
 582                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
 583
 584                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
 585                vmovdqa  \CTR, \XMM8
 586                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 587
 588                vmovdqa  (arg1), \T_key
 589                vpxor    \T_key, \XMM1, \XMM1
 590                vpxor    \T_key, \XMM2, \XMM2
 591                vpxor    \T_key, \XMM3, \XMM3
 592                vpxor    \T_key, \XMM4, \XMM4
 593                vpxor    \T_key, \XMM5, \XMM5
 594                vpxor    \T_key, \XMM6, \XMM6
 595                vpxor    \T_key, \XMM7, \XMM7
 596                vpxor    \T_key, \XMM8, \XMM8
 597
 598                i = 1
 599                setreg
 600.rep    9       # do 9 rounds
 601                vmovdqa  16*i(arg1), \T_key
 602                vaesenc  \T_key, \XMM1, \XMM1
 603                vaesenc  \T_key, \XMM2, \XMM2
 604                vaesenc  \T_key, \XMM3, \XMM3
 605                vaesenc  \T_key, \XMM4, \XMM4
 606                vaesenc  \T_key, \XMM5, \XMM5
 607                vaesenc  \T_key, \XMM6, \XMM6
 608                vaesenc  \T_key, \XMM7, \XMM7
 609                vaesenc  \T_key, \XMM8, \XMM8
 610                i = (i+1)
 611                setreg
 612.endr
 613
 614
 615                vmovdqa  16*i(arg1), \T_key
 616                vaesenclast  \T_key, \XMM1, \XMM1
 617                vaesenclast  \T_key, \XMM2, \XMM2
 618                vaesenclast  \T_key, \XMM3, \XMM3
 619                vaesenclast  \T_key, \XMM4, \XMM4
 620                vaesenclast  \T_key, \XMM5, \XMM5
 621                vaesenclast  \T_key, \XMM6, \XMM6
 622                vaesenclast  \T_key, \XMM7, \XMM7
 623                vaesenclast  \T_key, \XMM8, \XMM8
 624
 625                vmovdqu  (arg3, %r11), \T1
 626                vpxor    \T1, \XMM1, \XMM1
 627                vmovdqu  \XMM1, (arg2 , %r11)
 628                .if   \ENC_DEC == DEC
 629                vmovdqa  \T1, \XMM1
 630                .endif
 631
 632                vmovdqu  16*1(arg3, %r11), \T1
 633                vpxor    \T1, \XMM2, \XMM2
 634                vmovdqu  \XMM2, 16*1(arg2 , %r11)
 635                .if   \ENC_DEC == DEC
 636                vmovdqa  \T1, \XMM2
 637                .endif
 638
 639                vmovdqu  16*2(arg3, %r11), \T1
 640                vpxor    \T1, \XMM3, \XMM3
 641                vmovdqu  \XMM3, 16*2(arg2 , %r11)
 642                .if   \ENC_DEC == DEC
 643                vmovdqa  \T1, \XMM3
 644                .endif
 645
 646                vmovdqu  16*3(arg3, %r11), \T1
 647                vpxor    \T1, \XMM4, \XMM4
 648                vmovdqu  \XMM4, 16*3(arg2 , %r11)
 649                .if   \ENC_DEC == DEC
 650                vmovdqa  \T1, \XMM4
 651                .endif
 652
 653                vmovdqu  16*4(arg3, %r11), \T1
 654                vpxor    \T1, \XMM5, \XMM5
 655                vmovdqu  \XMM5, 16*4(arg2 , %r11)
 656                .if   \ENC_DEC == DEC
 657                vmovdqa  \T1, \XMM5
 658                .endif
 659
 660                vmovdqu  16*5(arg3, %r11), \T1
 661                vpxor    \T1, \XMM6, \XMM6
 662                vmovdqu  \XMM6, 16*5(arg2 , %r11)
 663                .if   \ENC_DEC == DEC
 664                vmovdqa  \T1, \XMM6
 665                .endif
 666
 667                vmovdqu  16*6(arg3, %r11), \T1
 668                vpxor    \T1, \XMM7, \XMM7
 669                vmovdqu  \XMM7, 16*6(arg2 , %r11)
 670                .if   \ENC_DEC == DEC
 671                vmovdqa  \T1, \XMM7
 672                .endif
 673
 674                vmovdqu  16*7(arg3, %r11), \T1
 675                vpxor    \T1, \XMM8, \XMM8
 676                vmovdqu  \XMM8, 16*7(arg2 , %r11)
 677                .if   \ENC_DEC == DEC
 678                vmovdqa  \T1, \XMM8
 679                .endif
 680
 681                add     $128, %r11
 682
 683                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
 684                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
 685                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
 686                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
 687                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
 688                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
 689                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
 690                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
 691                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
 692
 693###############################################################################
 694
 695_initial_blocks_done\@:
 696
 697.endm
 698
 699# encrypt 8 blocks at a time
 700# ghash the 8 previously encrypted ciphertext blocks
 701# arg1, arg2, arg3 are used as pointers only, not modified
 702# r11 is the data offset value
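# C-like sketch of one pass of the main loop built from this macro (the helper
# and buffer names are illustrative, not defined in this file):
#
#       for (i = 0; i < 8; i++)
#               out[i] = in[i] ^ aes_encrypt(key, ++ctr);   /* 8 CTR blocks   */
#       ghash_acc = ghash_blocks(ghash_acc, prev_ct, 8);    /* previous 8 CTs */
#       memcpy(prev_ct, enc ? out : in, 8 * 16);
#
# i.e. the AES rounds for the next 8 counter blocks are interleaved with the
# GHASH (Karatsuba + reduction) of the 8 ciphertext blocks produced earlier.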
 703.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 704
 705        vmovdqa \XMM1, \T2
 706        vmovdqa \XMM2, TMP2(%rsp)
 707        vmovdqa \XMM3, TMP3(%rsp)
 708        vmovdqa \XMM4, TMP4(%rsp)
 709        vmovdqa \XMM5, TMP5(%rsp)
 710        vmovdqa \XMM6, TMP6(%rsp)
 711        vmovdqa \XMM7, TMP7(%rsp)
 712        vmovdqa \XMM8, TMP8(%rsp)
 713
 714.if \loop_idx == in_order
 715                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
 716                vpaddd  ONE(%rip), \XMM1, \XMM2
 717                vpaddd  ONE(%rip), \XMM2, \XMM3
 718                vpaddd  ONE(%rip), \XMM3, \XMM4
 719                vpaddd  ONE(%rip), \XMM4, \XMM5
 720                vpaddd  ONE(%rip), \XMM5, \XMM6
 721                vpaddd  ONE(%rip), \XMM6, \XMM7
 722                vpaddd  ONE(%rip), \XMM7, \XMM8
 723                vmovdqa \XMM8, \CTR
 724
 725                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
 726                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
 727                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
 728                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
 729                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
 730                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
 731                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
 732                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
 733.else
 734                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
 735                vpaddd  ONEf(%rip), \XMM1, \XMM2
 736                vpaddd  ONEf(%rip), \XMM2, \XMM3
 737                vpaddd  ONEf(%rip), \XMM3, \XMM4
 738                vpaddd  ONEf(%rip), \XMM4, \XMM5
 739                vpaddd  ONEf(%rip), \XMM5, \XMM6
 740                vpaddd  ONEf(%rip), \XMM6, \XMM7
 741                vpaddd  ONEf(%rip), \XMM7, \XMM8
 742                vmovdqa \XMM8, \CTR
 743.endif
 744
 745
 746        #######################################################################
 747
 748                vmovdqu (arg1), \T1
 749                vpxor   \T1, \XMM1, \XMM1
 750                vpxor   \T1, \XMM2, \XMM2
 751                vpxor   \T1, \XMM3, \XMM3
 752                vpxor   \T1, \XMM4, \XMM4
 753                vpxor   \T1, \XMM5, \XMM5
 754                vpxor   \T1, \XMM6, \XMM6
 755                vpxor   \T1, \XMM7, \XMM7
 756                vpxor   \T1, \XMM8, \XMM8
 757
 758        #######################################################################
 759
 760
 761
 762
 763
 764                vmovdqu 16*1(arg1), \T1
 765                vaesenc \T1, \XMM1, \XMM1
 766                vaesenc \T1, \XMM2, \XMM2
 767                vaesenc \T1, \XMM3, \XMM3
 768                vaesenc \T1, \XMM4, \XMM4
 769                vaesenc \T1, \XMM5, \XMM5
 770                vaesenc \T1, \XMM6, \XMM6
 771                vaesenc \T1, \XMM7, \XMM7
 772                vaesenc \T1, \XMM8, \XMM8
 773
 774                vmovdqu 16*2(arg1), \T1
 775                vaesenc \T1, \XMM1, \XMM1
 776                vaesenc \T1, \XMM2, \XMM2
 777                vaesenc \T1, \XMM3, \XMM3
 778                vaesenc \T1, \XMM4, \XMM4
 779                vaesenc \T1, \XMM5, \XMM5
 780                vaesenc \T1, \XMM6, \XMM6
 781                vaesenc \T1, \XMM7, \XMM7
 782                vaesenc \T1, \XMM8, \XMM8
 783
 784
 785        #######################################################################
 786
 787        vmovdqa         HashKey_8(arg1), \T5
 788        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
 789        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 790
 791        vpshufd         $0b01001110, \T2, \T6
 792        vpxor           \T2, \T6, \T6
 793
 794        vmovdqa         HashKey_8_k(arg1), \T5
 795        vpclmulqdq      $0x00, \T5, \T6, \T6
 796
 797                vmovdqu 16*3(arg1), \T1
 798                vaesenc \T1, \XMM1, \XMM1
 799                vaesenc \T1, \XMM2, \XMM2
 800                vaesenc \T1, \XMM3, \XMM3
 801                vaesenc \T1, \XMM4, \XMM4
 802                vaesenc \T1, \XMM5, \XMM5
 803                vaesenc \T1, \XMM6, \XMM6
 804                vaesenc \T1, \XMM7, \XMM7
 805                vaesenc \T1, \XMM8, \XMM8
 806
 807        vmovdqa         TMP2(%rsp), \T1
 808        vmovdqa         HashKey_7(arg1), \T5
 809        vpclmulqdq      $0x11, \T5, \T1, \T3
 810        vpxor           \T3, \T4, \T4
 811        vpclmulqdq      $0x00, \T5, \T1, \T3
 812        vpxor           \T3, \T7, \T7
 813
 814        vpshufd         $0b01001110, \T1, \T3
 815        vpxor           \T1, \T3, \T3
 816        vmovdqa         HashKey_7_k(arg1), \T5
 817        vpclmulqdq      $0x10, \T5, \T3, \T3
 818        vpxor           \T3, \T6, \T6
 819
 820                vmovdqu 16*4(arg1), \T1
 821                vaesenc \T1, \XMM1, \XMM1
 822                vaesenc \T1, \XMM2, \XMM2
 823                vaesenc \T1, \XMM3, \XMM3
 824                vaesenc \T1, \XMM4, \XMM4
 825                vaesenc \T1, \XMM5, \XMM5
 826                vaesenc \T1, \XMM6, \XMM6
 827                vaesenc \T1, \XMM7, \XMM7
 828                vaesenc \T1, \XMM8, \XMM8
 829
 830        #######################################################################
 831
 832        vmovdqa         TMP3(%rsp), \T1
 833        vmovdqa         HashKey_6(arg1), \T5
 834        vpclmulqdq      $0x11, \T5, \T1, \T3
 835        vpxor           \T3, \T4, \T4
 836        vpclmulqdq      $0x00, \T5, \T1, \T3
 837        vpxor           \T3, \T7, \T7
 838
 839        vpshufd         $0b01001110, \T1, \T3
 840        vpxor           \T1, \T3, \T3
 841        vmovdqa         HashKey_6_k(arg1), \T5
 842        vpclmulqdq      $0x10, \T5, \T3, \T3
 843        vpxor           \T3, \T6, \T6
 844
 845                vmovdqu 16*5(arg1), \T1
 846                vaesenc \T1, \XMM1, \XMM1
 847                vaesenc \T1, \XMM2, \XMM2
 848                vaesenc \T1, \XMM3, \XMM3
 849                vaesenc \T1, \XMM4, \XMM4
 850                vaesenc \T1, \XMM5, \XMM5
 851                vaesenc \T1, \XMM6, \XMM6
 852                vaesenc \T1, \XMM7, \XMM7
 853                vaesenc \T1, \XMM8, \XMM8
 854
 855        vmovdqa         TMP4(%rsp), \T1
 856        vmovdqa         HashKey_5(arg1), \T5
 857        vpclmulqdq      $0x11, \T5, \T1, \T3
 858        vpxor           \T3, \T4, \T4
 859        vpclmulqdq      $0x00, \T5, \T1, \T3
 860        vpxor           \T3, \T7, \T7
 861
 862        vpshufd         $0b01001110, \T1, \T3
 863        vpxor           \T1, \T3, \T3
 864        vmovdqa         HashKey_5_k(arg1), \T5
 865        vpclmulqdq      $0x10, \T5, \T3, \T3
 866        vpxor           \T3, \T6, \T6
 867
 868                vmovdqu 16*6(arg1), \T1
 869                vaesenc \T1, \XMM1, \XMM1
 870                vaesenc \T1, \XMM2, \XMM2
 871                vaesenc \T1, \XMM3, \XMM3
 872                vaesenc \T1, \XMM4, \XMM4
 873                vaesenc \T1, \XMM5, \XMM5
 874                vaesenc \T1, \XMM6, \XMM6
 875                vaesenc \T1, \XMM7, \XMM7
 876                vaesenc \T1, \XMM8, \XMM8
 877
 878
 879        vmovdqa         TMP5(%rsp), \T1
 880        vmovdqa         HashKey_4(arg1), \T5
 881        vpclmulqdq      $0x11, \T5, \T1, \T3
 882        vpxor           \T3, \T4, \T4
 883        vpclmulqdq      $0x00, \T5, \T1, \T3
 884        vpxor           \T3, \T7, \T7
 885
 886        vpshufd         $0b01001110, \T1, \T3
 887        vpxor           \T1, \T3, \T3
 888        vmovdqa         HashKey_4_k(arg1), \T5
 889        vpclmulqdq      $0x10, \T5, \T3, \T3
 890        vpxor           \T3, \T6, \T6
 891
 892                vmovdqu 16*7(arg1), \T1
 893                vaesenc \T1, \XMM1, \XMM1
 894                vaesenc \T1, \XMM2, \XMM2
 895                vaesenc \T1, \XMM3, \XMM3
 896                vaesenc \T1, \XMM4, \XMM4
 897                vaesenc \T1, \XMM5, \XMM5
 898                vaesenc \T1, \XMM6, \XMM6
 899                vaesenc \T1, \XMM7, \XMM7
 900                vaesenc \T1, \XMM8, \XMM8
 901
 902        vmovdqa         TMP6(%rsp), \T1
 903        vmovdqa         HashKey_3(arg1), \T5
 904        vpclmulqdq      $0x11, \T5, \T1, \T3
 905        vpxor           \T3, \T4, \T4
 906        vpclmulqdq      $0x00, \T5, \T1, \T3
 907        vpxor           \T3, \T7, \T7
 908
 909        vpshufd         $0b01001110, \T1, \T3
 910        vpxor           \T1, \T3, \T3
 911        vmovdqa         HashKey_3_k(arg1), \T5
 912        vpclmulqdq      $0x10, \T5, \T3, \T3
 913        vpxor           \T3, \T6, \T6
 914
 915
 916                vmovdqu 16*8(arg1), \T1
 917                vaesenc \T1, \XMM1, \XMM1
 918                vaesenc \T1, \XMM2, \XMM2
 919                vaesenc \T1, \XMM3, \XMM3
 920                vaesenc \T1, \XMM4, \XMM4
 921                vaesenc \T1, \XMM5, \XMM5
 922                vaesenc \T1, \XMM6, \XMM6
 923                vaesenc \T1, \XMM7, \XMM7
 924                vaesenc \T1, \XMM8, \XMM8
 925
 926        vmovdqa         TMP7(%rsp), \T1
 927        vmovdqa         HashKey_2(arg1), \T5
 928        vpclmulqdq      $0x11, \T5, \T1, \T3
 929        vpxor           \T3, \T4, \T4
 930        vpclmulqdq      $0x00, \T5, \T1, \T3
 931        vpxor           \T3, \T7, \T7
 932
 933        vpshufd         $0b01001110, \T1, \T3
 934        vpxor           \T1, \T3, \T3
 935        vmovdqa         HashKey_2_k(arg1), \T5
 936        vpclmulqdq      $0x10, \T5, \T3, \T3
 937        vpxor           \T3, \T6, \T6
 938
 939        #######################################################################
 940
 941                vmovdqu 16*9(arg1), \T5
 942                vaesenc \T5, \XMM1, \XMM1
 943                vaesenc \T5, \XMM2, \XMM2
 944                vaesenc \T5, \XMM3, \XMM3
 945                vaesenc \T5, \XMM4, \XMM4
 946                vaesenc \T5, \XMM5, \XMM5
 947                vaesenc \T5, \XMM6, \XMM6
 948                vaesenc \T5, \XMM7, \XMM7
 949                vaesenc \T5, \XMM8, \XMM8
 950
 951        vmovdqa         TMP8(%rsp), \T1
 952        vmovdqa         HashKey(arg1), \T5
 953        vpclmulqdq      $0x11, \T5, \T1, \T3
 954        vpxor           \T3, \T4, \T4
 955        vpclmulqdq      $0x00, \T5, \T1, \T3
 956        vpxor           \T3, \T7, \T7
 957
 958        vpshufd         $0b01001110, \T1, \T3
 959        vpxor           \T1, \T3, \T3
 960        vmovdqa         HashKey_k(arg1), \T5
 961        vpclmulqdq      $0x10, \T5, \T3, \T3
 962        vpxor           \T3, \T6, \T6
 963
 964        vpxor           \T4, \T6, \T6
 965        vpxor           \T7, \T6, \T6
 966
 967                vmovdqu 16*10(arg1), \T5
 968
 969        i = 0
 970        j = 1
 971        setreg
 972.rep 8
 973                vpxor   16*i(arg3, %r11), \T5, \T2
 974                .if \ENC_DEC == ENC
 975                vaesenclast     \T2, reg_j, reg_j
 976                .else
 977                vaesenclast     \T2, reg_j, \T3
 978                vmovdqu 16*i(arg3, %r11), reg_j
 979                vmovdqu \T3, 16*i(arg2, %r11)
 980                .endif
 981        i = (i+1)
 982        j = (j+1)
 983        setreg
 984.endr
 985        #######################################################################
 986
 987
 988        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
  989        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
 990        vpxor   \T3, \T7, \T7
 991        vpxor   \T4, \T6, \T6                           # accumulate the results in T6:T7
 992
 993
 994
 995        #######################################################################
 996        #first phase of the reduction
 997        #######################################################################
 998        vpslld  $31, \T7, \T2                           # packed right shifting << 31
  999        vpslld  $30, \T7, \T3                           # packed right shifting << 30
 1000        vpslld  $25, \T7, \T4                           # packed right shifting << 25
1001
1002        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1003        vpxor   \T4, \T2, \T2
1004
1005        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1006
1007        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1008        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1009        #######################################################################
1010                .if \ENC_DEC == ENC
1011                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
1012                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
1013                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
1014                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
1015                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
1016                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
1017                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
1018                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
1019                .endif
1020
1021        #######################################################################
1022        #second phase of the reduction
1023        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
1024        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
1025        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
1026        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1027        vpxor   \T4, \T2, \T2
1028
1029        vpxor   \T1, \T2, \T2
1030        vpxor   \T2, \T7, \T7
1031        vpxor   \T7, \T6, \T6                           # the result is in T6
1032        #######################################################################
1033
1034                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
1035                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
1036                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
1037                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
1038                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
1039                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
1040                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
1041                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
1042
1043
1044        vpxor   \T6, \XMM1, \XMM1
1045
1046
1047
1048.endm
1049
1050
 1051# GHASH the last 8 ciphertext blocks.
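# i.e. it folds the final 8 ciphertext blocks into the hash as
#       XMM1*H^8 + XMM2*H^7 + ... + XMM8*H     (all products mod poly)
# using the precomputed HashKey_i / HashKey_i_k tables.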
1052.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1053
1054        ## Karatsuba Method
1055
1056
1057        vpshufd         $0b01001110, \XMM1, \T2
1058        vpxor           \XMM1, \T2, \T2
1059        vmovdqa         HashKey_8(arg1), \T5
1060        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1061        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1062
1063        vmovdqa         HashKey_8_k(arg1), \T3
1064        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1065
1066        ######################
1067
1068        vpshufd         $0b01001110, \XMM2, \T2
1069        vpxor           \XMM2, \T2, \T2
1070        vmovdqa         HashKey_7(arg1), \T5
1071        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1072        vpxor           \T4, \T6, \T6
1073
1074        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1075        vpxor           \T4, \T7, \T7
1076
1077        vmovdqa         HashKey_7_k(arg1), \T3
1078        vpclmulqdq      $0x00, \T3, \T2, \T2
1079        vpxor           \T2, \XMM1, \XMM1
1080
1081        ######################
1082
1083        vpshufd         $0b01001110, \XMM3, \T2
1084        vpxor           \XMM3, \T2, \T2
1085        vmovdqa         HashKey_6(arg1), \T5
1086        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1087        vpxor           \T4, \T6, \T6
1088
1089        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1090        vpxor           \T4, \T7, \T7
1091
1092        vmovdqa         HashKey_6_k(arg1), \T3
1093        vpclmulqdq      $0x00, \T3, \T2, \T2
1094        vpxor           \T2, \XMM1, \XMM1
1095
1096        ######################
1097
1098        vpshufd         $0b01001110, \XMM4, \T2
1099        vpxor           \XMM4, \T2, \T2
1100        vmovdqa         HashKey_5(arg1), \T5
1101        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1102        vpxor           \T4, \T6, \T6
1103
1104        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1105        vpxor           \T4, \T7, \T7
1106
1107        vmovdqa         HashKey_5_k(arg1), \T3
1108        vpclmulqdq      $0x00, \T3, \T2, \T2
1109        vpxor           \T2, \XMM1, \XMM1
1110
1111        ######################
1112
1113        vpshufd         $0b01001110, \XMM5, \T2
1114        vpxor           \XMM5, \T2, \T2
1115        vmovdqa         HashKey_4(arg1), \T5
1116        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1117        vpxor           \T4, \T6, \T6
1118
1119        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1120        vpxor           \T4, \T7, \T7
1121
1122        vmovdqa         HashKey_4_k(arg1), \T3
1123        vpclmulqdq      $0x00, \T3, \T2, \T2
1124        vpxor           \T2, \XMM1, \XMM1
1125
1126        ######################
1127
1128        vpshufd         $0b01001110, \XMM6, \T2
1129        vpxor           \XMM6, \T2, \T2
1130        vmovdqa         HashKey_3(arg1), \T5
1131        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1132        vpxor           \T4, \T6, \T6
1133
1134        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1135        vpxor           \T4, \T7, \T7
1136
1137        vmovdqa         HashKey_3_k(arg1), \T3
1138        vpclmulqdq      $0x00, \T3, \T2, \T2
1139        vpxor           \T2, \XMM1, \XMM1
1140
1141        ######################
1142
1143        vpshufd         $0b01001110, \XMM7, \T2
1144        vpxor           \XMM7, \T2, \T2
1145        vmovdqa         HashKey_2(arg1), \T5
1146        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1147        vpxor           \T4, \T6, \T6
1148
1149        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1150        vpxor           \T4, \T7, \T7
1151
1152        vmovdqa         HashKey_2_k(arg1), \T3
1153        vpclmulqdq      $0x00, \T3, \T2, \T2
1154        vpxor           \T2, \XMM1, \XMM1
1155
1156        ######################
1157
1158        vpshufd         $0b01001110, \XMM8, \T2
1159        vpxor           \XMM8, \T2, \T2
1160        vmovdqa         HashKey(arg1), \T5
1161        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1162        vpxor           \T4, \T6, \T6
1163
1164        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1165        vpxor           \T4, \T7, \T7
1166
1167        vmovdqa         HashKey_k(arg1), \T3
1168        vpclmulqdq      $0x00, \T3, \T2, \T2
1169
1170        vpxor           \T2, \XMM1, \XMM1
1171        vpxor           \T6, \XMM1, \XMM1
1172        vpxor           \T7, \XMM1, \T2
1173
1174
1175
1176
1177        vpslldq $8, \T2, \T4
1178        vpsrldq $8, \T2, \T2
1179
1180        vpxor   \T4, \T7, \T7
1181        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1182                                # the accumulated carry-less multiplications
1183
1184        #######################################################################
1185        #first phase of the reduction
1186        vpslld  $31, \T7, \T2   # packed right shifting << 31
 1187        vpslld  $30, \T7, \T3   # packed right shifting << 30
 1188        vpslld  $25, \T7, \T4   # packed right shifting << 25
1189
1190        vpxor   \T3, \T2, \T2   # xor the shifted versions
1191        vpxor   \T4, \T2, \T2
1192
1193        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1194
1195        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1196        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1197        #######################################################################
1198
1199
1200        #second phase of the reduction
1201        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1202        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1203        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1204        vpxor   \T3, \T2, \T2   # xor the shifted versions
1205        vpxor   \T4, \T2, \T2
1206
1207        vpxor   \T1, \T2, \T2
1208        vpxor   \T2, \T7, \T7
1209        vpxor   \T7, \T6, \T6   # the result is in T6
1210
1211.endm
1212
1213
1214# combined for GCM encrypt and decrypt functions
1215# clobbering all xmm registers
1216# clobbering r10, r11, r12, r13, r14, r15
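# High-level flow (descriptive outline of the macro body below):
#   1. save registers, align %rsp, load HashKey from the context
#   2. INITIAL_BLOCKS_AVX              - GHASH the AAD, handle (blocks mod 8)
#   3. GHASH_8_ENCRYPT_8_PARALLEL_AVX  - main loop, 8 blocks per iteration
#   4. GHASH_LAST_8_AVX                - fold the last 8 ciphertext blocks
#   5. _zero_cipher_left               - encrypt/GHASH a trailing partial block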
1217.macro  GCM_ENC_DEC_AVX     ENC_DEC
1218
1219        #the number of pushes must equal STACK_OFFSET
1220        push    %r12
1221        push    %r13
1222        push    %r14
1223        push    %r15
1224
1225        mov     %rsp, %r14
1226
1227
1228
1229
1230        sub     $VARIABLE_OFFSET, %rsp
1231        and     $~63, %rsp                  # align rsp to 64 bytes
1232
1233
1234        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1235
1236        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1237        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1238
1239        mov     %r13, %r12
1240        shr     $4, %r12
1241        and     $7, %r12
1242        jz      _initial_num_blocks_is_0\@
1243
1244        cmp     $7, %r12
1245        je      _initial_num_blocks_is_7\@
1246        cmp     $6, %r12
1247        je      _initial_num_blocks_is_6\@
1248        cmp     $5, %r12
1249        je      _initial_num_blocks_is_5\@
1250        cmp     $4, %r12
1251        je      _initial_num_blocks_is_4\@
1252        cmp     $3, %r12
1253        je      _initial_num_blocks_is_3\@
1254        cmp     $2, %r12
1255        je      _initial_num_blocks_is_2\@
1256
1257        jmp     _initial_num_blocks_is_1\@
1258
1259_initial_num_blocks_is_7\@:
1260        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1261        sub     $16*7, %r13
1262        jmp     _initial_blocks_encrypted\@
1263
1264_initial_num_blocks_is_6\@:
1265        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1266        sub     $16*6, %r13
1267        jmp     _initial_blocks_encrypted\@
1268
1269_initial_num_blocks_is_5\@:
1270        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1271        sub     $16*5, %r13
1272        jmp     _initial_blocks_encrypted\@
1273
1274_initial_num_blocks_is_4\@:
1275        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1276        sub     $16*4, %r13
1277        jmp     _initial_blocks_encrypted\@
1278
1279_initial_num_blocks_is_3\@:
1280        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1281        sub     $16*3, %r13
1282        jmp     _initial_blocks_encrypted\@
1283
1284_initial_num_blocks_is_2\@:
1285        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1286        sub     $16*2, %r13
1287        jmp     _initial_blocks_encrypted\@
1288
1289_initial_num_blocks_is_1\@:
1290        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1291        sub     $16*1, %r13
1292        jmp     _initial_blocks_encrypted\@
1293
1294_initial_num_blocks_is_0\@:
1295        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1296
1297
1298_initial_blocks_encrypted\@:
1299        cmp     $0, %r13
1300        je      _zero_cipher_left\@
1301
1302        sub     $128, %r13
1303        je      _eight_cipher_left\@
1304
1305
1306
1307
1308        vmovd   %xmm9, %r15d
1309        and     $255, %r15d
1310        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1311
1312
1313_encrypt_by_8_new\@:
1314        cmp     $(255-8), %r15d
1315        jg      _encrypt_by_8\@
1316
1317
1318
1319        add     $8, %r15b
1320        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1321        add     $128, %r11
1322        sub     $128, %r13
1323        jne     _encrypt_by_8_new\@
1324
1325        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326        jmp     _eight_cipher_left\@
1327
1328_encrypt_by_8\@:
1329        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1330        add     $8, %r15b
1331        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1333        add     $128, %r11
1334        sub     $128, %r13
1335        jne     _encrypt_by_8_new\@
1336
1337        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1338
1339
1340
1341
1342_eight_cipher_left\@:
1343        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1344
1345
1346_zero_cipher_left\@:
1347        cmp     $16, arg4
1348        jl      _only_less_than_16\@
1349
1350        mov     arg4, %r13
1351        and     $15, %r13                            # r13 = (arg4 mod 16)
1352
1353        je      _multiple_of_16_bytes\@
1354
 1355        # handle the last <16 Byte block separately
1356
1357
1358        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1359        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1361
1362        sub     $16, %r11
1363        add     %r13, %r11
1364        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1365
1366        lea     SHIFT_MASK+16(%rip), %r12
1367        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1368                                                     # able to shift 16-r13 bytes (r13 is the
1369                                                     # number of bytes in plaintext mod 16)
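                                                     # note: the same r12 offset is
                                                     # reused below (ALL_F-SHIFT_MASK)
                                                     # to fetch a byte mask that keeps
                                                     # the r13 valid bytes and clears
                                                     # the top 16-r13 bytes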
1370        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1371        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1372        jmp     _final_ghash_mul\@
1373
1374_only_less_than_16\@:
1375        # check for 0 length
1376        mov     arg4, %r13
1377        and     $15, %r13                            # r13 = (arg4 mod 16)
1378
1379        je      _multiple_of_16_bytes\@
1380
1381        # handle the last <16 Byte block separately
1382
1383
1384        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1385        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1387
1388
1389        lea     SHIFT_MASK+16(%rip), %r12
1390        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1391                                                     # able to shift 16-r13 bytes (r13 is the
1392                                                     # number of bytes in plaintext mod 16)
1393
1394_get_last_16_byte_loop\@:
1395        movb    (arg3, %r11),  %al
1396        movb    %al,  TMP1 (%rsp , %r11)
1397        add     $1, %r11
1398        cmp     %r13,  %r11
1399        jne     _get_last_16_byte_loop\@
1400
1401        vmovdqu  TMP1(%rsp), %xmm1
1402
1403        sub     $16, %r11
1404
1405_final_ghash_mul\@:
1406        .if  \ENC_DEC ==  DEC
1407        vmovdqa %xmm1, %xmm2
1408        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1409        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1410                                                     # mask out top 16-r13 bytes of xmm9
1411        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1412        vpand   %xmm1, %xmm2, %xmm2
1413        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414        vpxor   %xmm2, %xmm14, %xmm14
1415        #GHASH computation for the last <16 Byte block
1416        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1417        sub     %r13, %r11
1418        add     $16, %r11
1419        .else
1420        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1421        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1422                                                     # mask out top 16-r13 bytes of xmm9
1423        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1424        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425        vpxor   %xmm9, %xmm14, %xmm14
1426        #GHASH computation for the last <16 Byte block
1427        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1428        sub     %r13, %r11
1429        add     $16, %r11
1430        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1431        .endif
1432
1433
1434        #############################
1435        # output r13 Bytes
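        # e.g. r13 = 13: the first 8 bytes go out via %rax, xmm9 is shifted
        # right by 8, and the remaining 5 bytes are written one at a time
        # in _less_than_8_bytes_left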
1436        vmovq   %xmm9, %rax
1437        cmp     $8, %r13
1438        jle     _less_than_8_bytes_left\@
1439
1440        mov     %rax, (arg2 , %r11)
1441        add     $8, %r11
1442        vpsrldq $8, %xmm9, %xmm9
1443        vmovq   %xmm9, %rax
1444        sub     $8, %r13
1445
1446_less_than_8_bytes_left\@:
1447        movb    %al, (arg2 , %r11)
1448        add     $1, %r11
1449        shr     $8, %rax
1450        sub     $1, %r13
1451        jne     _less_than_8_bytes_left\@
1452        #############################
1453
1454_multiple_of_16_bytes\@:
1455        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1456        shl     $3, %r12                             # convert into number of bits
1457        vmovd   %r12d, %xmm15                        # len(A) in xmm15
1458
1459        shl     $3, arg4                             # len(C) in bits (len in bytes * 8)
1460        vmovq   arg4, %xmm1
1461        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1462        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
1463
1464        vpxor   %xmm15, %xmm14, %xmm14
1465        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1466        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1467
1468        mov     arg5, %rax                           # rax = *Y0
1469        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1470
1471        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1472
1473        vpxor   %xmm14, %xmm9, %xmm9
1474
1475
1476
1477_return_T\@:
1478        mov     arg8, %r10              # r10 = authTag
1479        mov     arg9, %r11              # r11 = auth_tag_len
1480
1481        cmp     $16, %r11
1482        je      _T_16\@
1483
1484        cmp     $8, %r11
1485        jl      _T_4\@
1486
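        # The tag is written out in 8/4/2/1 byte pieces so any auth_tag_len
        # of 16, 12 or 8 (the documented values) works without touching
        # bytes past the buffer; e.g. a 12 byte tag takes the 8 byte store
        # in _T_8 followed by the 4 byte store in _T_4.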
1487_T_8\@:
1488        vmovq   %xmm9, %rax
1489        mov     %rax, (%r10)
1490        add     $8, %r10
1491        sub     $8, %r11
1492        vpsrldq $8, %xmm9, %xmm9
1493        cmp     $0, %r11
1494        je     _return_T_done\@
1495_T_4\@:
1496        vmovd   %xmm9, %eax
1497        mov     %eax, (%r10)
1498        add     $4, %r10
1499        sub     $4, %r11
1500        vpsrldq     $4, %xmm9, %xmm9
1501        cmp     $0, %r11
1502        je     _return_T_done\@
1503_T_123\@:
1504        vmovd     %xmm9, %eax
1505        cmp     $2, %r11
1506        jl     _T_1\@
1507        mov     %ax, (%r10)
1508        cmp     $2, %r11
1509        je     _return_T_done\@
1510        add     $2, %r10
1511        sar     $16, %eax
1512_T_1\@:
1513        mov     %al, (%r10)
1514        jmp     _return_T_done\@
1515
1516_T_16\@:
1517        vmovdqu %xmm9, (%r10)
1518
1519_return_T_done\@:
1520        mov     %r14, %rsp
1521
1522        pop     %r15
1523        pop     %r14
1524        pop     %r13
1525        pop     %r12
1526.endm
1527
1528
1529#############################################################
1530#void   aesni_gcm_precomp_avx_gen2
1531#        (gcm_data     *my_ctx_data,
1532#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1533#############################################################
1534ENTRY(aesni_gcm_precomp_avx_gen2)
1535        #the number of pushes must equal STACK_OFFSET
1536        push    %r12
1537        push    %r13
1538        push    %r14
1539        push    %r15
1540
1541        mov     %rsp, %r14
1542
1543
1544
1545        sub     $VARIABLE_OFFSET, %rsp
1546        and     $~63, %rsp                  # align rsp to 64 bytes
1547
1548        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1549
1550        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1551        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
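        # Sketch: the block below multiplies H by x in GF(2^128).  H is
        # shifted left one bit across both quadwords (carrying the bit from
        # the low qword into the high qword), and if the bit shifted out of
        # the top of H was set, the field polynomial (POLY) is XORed back
        # in; the TWOONE compare builds that conditional mask without a
        # branch.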
1552        vmovdqa  %xmm6, %xmm2
1553        vpsllq   $1, %xmm6, %xmm6
1554        vpsrlq   $63, %xmm2, %xmm2
1555        vmovdqa  %xmm2, %xmm1
1556        vpslldq  $8, %xmm2, %xmm2
1557        vpsrldq  $8, %xmm1, %xmm1
1558        vpor     %xmm2, %xmm6, %xmm6
1559        #reduction
1560        vpshufd  $0b00100100, %xmm1, %xmm2
1561        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562        vpand    POLY(%rip), %xmm2, %xmm2
1563        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1564        #######################################################################
1565        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1566
1567
1568        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1569
1570        mov     %r14, %rsp
1571
1572        pop     %r15
1573        pop     %r14
1574        pop     %r13
1575        pop     %r12
1576        ret
1577ENDPROC(aesni_gcm_precomp_avx_gen2)
1578
1579###############################################################################
1580#void   aesni_gcm_enc_avx_gen2(
1581#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1582#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1583#        const   u8 *in, /* Plaintext input */
1584#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1585#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1586#                       (from Security Association) concatenated with 8 byte
1587#                       Initialisation Vector (from IPSec ESP Payload)
1588#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1589#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1590#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1591#        u8      *auth_tag, /* Authenticated Tag output. */
1592#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1593#                               Valid values are 16 (most likely), 12 or 8. */
1594###############################################################################
1595ENTRY(aesni_gcm_enc_avx_gen2)
1596        GCM_ENC_DEC_AVX     ENC
1597        ret
1598ENDPROC(aesni_gcm_enc_avx_gen2)
1599
1600###############################################################################
1601#void   aesni_gcm_dec_avx_gen2(
1602#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1603#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1604#        const   u8 *in, /* Ciphertext input */
1605#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1606#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1607#                       (from Security Association) concatenated with 8 byte
1608#                       Initialisation Vector (from IPSec ESP Payload)
1609#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1610#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1611#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1612#        u8      *auth_tag, /* Authenticated Tag output. */
1613#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1614#                               Valid values are 16 (most likely), 12 or 8. */
1615###############################################################################
1616ENTRY(aesni_gcm_dec_avx_gen2)
1617        GCM_ENC_DEC_AVX     DEC
1618        ret
1619ENDPROC(aesni_gcm_dec_avx_gen2)
1620#endif /* CONFIG_AS_AVX */
1621
1622#ifdef CONFIG_AS_AVX2
1623###############################################################################
1624# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1625# Input: A and B (128-bits each, bit-reflected)
1626# Output: C = A*B*x mod poly, (i.e. >>1 )
1627# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1628# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1629###############################################################################
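# Sketch of the multiply below: with GH = a1:a0 and HK = b1:b0 as 64-bit
# halves, four vpclmulqdq's form a1*b1, a0*b0, a1*b0 and a0*b1; the two
# middle products are XORed together and split across the high and low
# 128-bit halves, yielding the 256-bit carry-less product <T1:GH>, which is
# then folded back to 128 bits in two phases using the POLY2 constant.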
1630.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1631
1632        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1633        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1634        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1635        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1636        vpxor           \T3, \GH, \GH
1637
1638
1639        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1640        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1641
1642        vpxor           \T3, \T1, \T1
1643        vpxor           \T2, \GH, \GH
1644
1645        #######################################################################
1646        #first phase of the reduction
1647        vmovdqa         POLY2(%rip), \T3
1648
1649        vpclmulqdq      $0x01, \GH, \T3, \T2
1650        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1651
1652        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1653        #######################################################################
1654        #second phase of the reduction
1655        vpclmulqdq      $0x00, \GH, \T3, \T2
1656        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1657
1658        vpclmulqdq      $0x10, \GH, \T3, \GH
1659        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1660
1661        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1662        #######################################################################
1663        vpxor           \T1, \GH, \GH          # the result is in GH
1664
1665
1666.endm
1667
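# PRECOMPUTE_AVX2 stores HashKey^2 .. HashKey^8 (each <<1 mod poly) into the
# context, so the 8-block routines can multiply eight ciphertext blocks by
# independent key powers and perform a single reduction per iteration.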
1668.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1669
1670        # Precompute HashKey^2 through HashKey^8 (<<1 mod poly) for the 8-block GHASH routines
1671        vmovdqa  \HK, \T5
1672        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1673        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1674
1675        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1676        vmovdqa  \T5, HashKey_3(arg1)
1677
1678        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1679        vmovdqa  \T5, HashKey_4(arg1)
1680
1681        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1682        vmovdqa  \T5, HashKey_5(arg1)
1683
1684        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1685        vmovdqa  \T5, HashKey_6(arg1)
1686
1687        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1688        vmovdqa  \T5, HashKey_7(arg1)
1689
1690        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1691        vmovdqa  \T5, HashKey_8(arg1)
1692
1693.endm
1694
1695
1696## if a = number of total plaintext bytes
1697## b = floor(a/16)
1698## num_initial_blocks = b mod 8
1699## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1700## r10, r11, r12, rax are clobbered
1701## arg1, arg2, arg3, r14 are used as pointers only, not modified
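## e.g. a = 100 bytes: b = 6, num_initial_blocks = 6; the six whole blocks
## are encrypted and GHASHed here, the 8-block main loop runs zero times,
## and the remaining 4 bytes are handled by the partial-block tail.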
1702
1703.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1704        i = (8-\num_initial_blocks)
1705        j = 0
1706        setreg
1707
1708        mov     arg6, %r10                       # r10 = AAD
1709        mov     arg7, %r12                       # r12 = aadLen
1710
1711
1712        mov     %r12, %r11
1713
1714        vpxor   reg_j, reg_j, reg_j
1715        vpxor   reg_i, reg_i, reg_i
1716
1717        cmp     $16, %r11
1718        jl      _get_AAD_rest8\@
1719_get_AAD_blocks\@:
1720        vmovdqu (%r10), reg_i
1721        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722        vpxor   reg_i, reg_j, reg_j
1723        GHASH_MUL_AVX2      reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1724        add     $16, %r10
1725        sub     $16, %r12
1726        sub     $16, %r11
1727        cmp     $16, %r11
1728        jge     _get_AAD_blocks\@
1729        vmovdqu reg_j, reg_i
1730        cmp     $0, %r11
1731        je      _get_AAD_done\@
1732
1733        vpxor   reg_i, reg_i, reg_i
1734
1735        /* read the last <16B of AAD. Since we have at least 4B of
1736        data right after the AAD (the ICV, and maybe some ciphertext),
1737        we can safely read 4B/8B chunks and then discard the extra bytes */
1738_get_AAD_rest8\@:
1739        cmp     $4, %r11
1740        jle     _get_AAD_rest4\@
1741        movq    (%r10), \T1
1742        add     $8, %r10
1743        sub     $8, %r11
1744        vpslldq $8, \T1, \T1
1745        vpsrldq $8, reg_i, reg_i
1746        vpxor   \T1, reg_i, reg_i
1747        jmp     _get_AAD_rest8\@
1748_get_AAD_rest4\@:
1749        cmp     $0, %r11
1750        jle     _get_AAD_rest0\@
1751        mov     (%r10), %eax
1752        movq    %rax, \T1
1753        add     $4, %r10
1754        sub     $4, %r11
1755        vpslldq $12, \T1, \T1
1756        vpsrldq $4, reg_i, reg_i
1757        vpxor   \T1, reg_i, reg_i
1758_get_AAD_rest0\@:
1759        /* finalize: shift out the extra bytes we read, and align
1760        left. since pslldq can only shift by an immediate, we use
1761        vpshufb and an array of shuffle masks */
1762        movq    %r12, %r11
1763        salq    $4, %r11
1764        movdqu  aad_shift_arr(%r11), \T1
1765        vpshufb \T1, reg_i, reg_i
1766_get_AAD_rest_final\@:
1767        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768        vpxor   reg_j, reg_i, reg_i
1769        GHASH_MUL_AVX2      reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770
1771_get_AAD_done\@:
1772        # initialize the data pointer offset as zero
1773        xor     %r11, %r11
1774
1775        # start AES for num_initial_blocks blocks
1776        mov     arg5, %rax                     # rax = *Y0
1777        vmovdqu (%rax), \CTR                   # CTR = Y0
1778        vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779
1780
1781        i = (9-\num_initial_blocks)
1782        setreg
1783.rep \num_initial_blocks
1784                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1785                vmovdqa \CTR, reg_i
1786                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1787        i = (i+1)
1788        setreg
1789.endr
1790
1791        vmovdqa  (arg1), \T_key
1792        i = (9-\num_initial_blocks)
1793        setreg
1794.rep \num_initial_blocks
1795                vpxor   \T_key, reg_i, reg_i
1796        i = (i+1)
1797        setreg
1798.endr
1799
1800        j = 1
1801        setreg
1802.rep 9
1803        vmovdqa  16*j(arg1), \T_key
1804        i = (9-\num_initial_blocks)
1805        setreg
1806.rep \num_initial_blocks
1807        vaesenc \T_key, reg_i, reg_i
1808        i = (i+1)
1809        setreg
1810.endr
1811
1812        j = (j+1)
1813        setreg
1814.endr
1815
1816
1817        vmovdqa  16*10(arg1), \T_key
1818        i = (9-\num_initial_blocks)
1819        setreg
1820.rep \num_initial_blocks
1821        vaesenclast      \T_key, reg_i, reg_i
1822        i = (i+1)
1823        setreg
1824.endr
1825
1826        i = (9-\num_initial_blocks)
1827        setreg
1828.rep \num_initial_blocks
1829                vmovdqu (arg3, %r11), \T1
1830                vpxor   \T1, reg_i, reg_i
1831                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1832                                                       # num_initial_blocks blocks
1833                add     $16, %r11
1834.if  \ENC_DEC == DEC
1835                vmovdqa \T1, reg_i
1836.endif
1837                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1838        i = (i+1)
1839        setreg
1840.endr
1841
1842
1843        i = (8-\num_initial_blocks)
1844        j = (9-\num_initial_blocks)
1845        setreg
1846
1847.rep \num_initial_blocks
1848        vpxor    reg_i, reg_j, reg_j
1849        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1850        i = (i+1)
1851        j = (j+1)
1852        setreg
1853.endr
1854        # XMM8 has the combined result here
1855
1856        vmovdqa  \XMM8, TMP1(%rsp)
1857        vmovdqa  \XMM8, \T3
1858
1859        cmp     $128, %r13
1860        jl      _initial_blocks_done\@                  # no need for precomputed constants
1861
1862###############################################################################
1863# At least 128 bytes remain: encrypt eight more counter blocks here so the
# main loop starts with eight ciphertext blocks ready to be GHASHed
1864                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1865                vmovdqa  \CTR, \XMM1
1866                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1867
1868                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1869                vmovdqa  \CTR, \XMM2
1870                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1871
1872                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1873                vmovdqa  \CTR, \XMM3
1874                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1875
1876                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1877                vmovdqa  \CTR, \XMM4
1878                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1879
1880                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1881                vmovdqa  \CTR, \XMM5
1882                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1883
1884                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1885                vmovdqa  \CTR, \XMM6
1886                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1887
1888                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1889                vmovdqa  \CTR, \XMM7
1890                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1891
1892                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1893                vmovdqa  \CTR, \XMM8
1894                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1895
1896                vmovdqa  (arg1), \T_key
1897                vpxor    \T_key, \XMM1, \XMM1
1898                vpxor    \T_key, \XMM2, \XMM2
1899                vpxor    \T_key, \XMM3, \XMM3
1900                vpxor    \T_key, \XMM4, \XMM4
1901                vpxor    \T_key, \XMM5, \XMM5
1902                vpxor    \T_key, \XMM6, \XMM6
1903                vpxor    \T_key, \XMM7, \XMM7
1904                vpxor    \T_key, \XMM8, \XMM8
1905
1906                i = 1
1907                setreg
1908.rep    9       # do 9 rounds
1909                vmovdqa  16*i(arg1), \T_key
1910                vaesenc  \T_key, \XMM1, \XMM1
1911                vaesenc  \T_key, \XMM2, \XMM2
1912                vaesenc  \T_key, \XMM3, \XMM3
1913                vaesenc  \T_key, \XMM4, \XMM4
1914                vaesenc  \T_key, \XMM5, \XMM5
1915                vaesenc  \T_key, \XMM6, \XMM6
1916                vaesenc  \T_key, \XMM7, \XMM7
1917                vaesenc  \T_key, \XMM8, \XMM8
1918                i = (i+1)
1919                setreg
1920.endr
1921
1922
1923                vmovdqa  16*i(arg1), \T_key
1924                vaesenclast  \T_key, \XMM1, \XMM1
1925                vaesenclast  \T_key, \XMM2, \XMM2
1926                vaesenclast  \T_key, \XMM3, \XMM3
1927                vaesenclast  \T_key, \XMM4, \XMM4
1928                vaesenclast  \T_key, \XMM5, \XMM5
1929                vaesenclast  \T_key, \XMM6, \XMM6
1930                vaesenclast  \T_key, \XMM7, \XMM7
1931                vaesenclast  \T_key, \XMM8, \XMM8
1932
1933                vmovdqu  (arg3, %r11), \T1
1934                vpxor    \T1, \XMM1, \XMM1
1935                vmovdqu  \XMM1, (arg2 , %r11)
1936                .if   \ENC_DEC == DEC
1937                vmovdqa  \T1, \XMM1
1938                .endif
1939
1940                vmovdqu  16*1(arg3, %r11), \T1
1941                vpxor    \T1, \XMM2, \XMM2
1942                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1943                .if   \ENC_DEC == DEC
1944                vmovdqa  \T1, \XMM2
1945                .endif
1946
1947                vmovdqu  16*2(arg3, %r11), \T1
1948                vpxor    \T1, \XMM3, \XMM3
1949                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1950                .if   \ENC_DEC == DEC
1951                vmovdqa  \T1, \XMM3
1952                .endif
1953
1954                vmovdqu  16*3(arg3, %r11), \T1
1955                vpxor    \T1, \XMM4, \XMM4
1956                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1957                .if   \ENC_DEC == DEC
1958                vmovdqa  \T1, \XMM4
1959                .endif
1960
1961                vmovdqu  16*4(arg3, %r11), \T1
1962                vpxor    \T1, \XMM5, \XMM5
1963                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1964                .if   \ENC_DEC == DEC
1965                vmovdqa  \T1, \XMM5
1966                .endif
1967
1968                vmovdqu  16*5(arg3, %r11), \T1
1969                vpxor    \T1, \XMM6, \XMM6
1970                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1971                .if   \ENC_DEC == DEC
1972                vmovdqa  \T1, \XMM6
1973                .endif
1974
1975                vmovdqu  16*6(arg3, %r11), \T1
1976                vpxor    \T1, \XMM7, \XMM7
1977                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1978                .if   \ENC_DEC == DEC
1979                vmovdqa  \T1, \XMM7
1980                .endif
1981
1982                vmovdqu  16*7(arg3, %r11), \T1
1983                vpxor    \T1, \XMM8, \XMM8
1984                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1985                .if   \ENC_DEC == DEC
1986                vmovdqa  \T1, \XMM8
1987                .endif
1988
1989                add     $128, %r11
1990
1991                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1992                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1993                                                           # the corresponding ciphertext
1994                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1995                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1996                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1997                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1998                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1999                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2000                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2001
2002###############################################################################
2003
2004_initial_blocks_done\@:
2005
2006
2007.endm
2008
2009
2010
2011# encrypt 8 blocks at a time
2012# ghash the 8 previously encrypted ciphertext blocks
2013# arg1, arg2, arg3 are used as pointers only, not modified
2014# r11 is the data offset value
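# Each AES round key is applied to all eight counter blocks before the next
# key is loaded, and the GHASH partial products of the previous eight
# ciphertext blocks (against HashKey_8 .. HashKey) are interleaved between
# the aesenc groups, so the carry-less multiplies largely overlap with the
# AES rounds; one two-phase POLY2 reduction at the end folds the accumulated
# result back to 128 bits.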
2015.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2016
2017        vmovdqa \XMM1, \T2
2018        vmovdqa \XMM2, TMP2(%rsp)
2019        vmovdqa \XMM3, TMP3(%rsp)
2020        vmovdqa \XMM4, TMP4(%rsp)
2021        vmovdqa \XMM5, TMP5(%rsp)
2022        vmovdqa \XMM6, TMP6(%rsp)
2023        vmovdqa \XMM7, TMP7(%rsp)
2024        vmovdqa \XMM8, TMP8(%rsp)
2025
2026.if \loop_idx == in_order
2027                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2028                vpaddd  ONE(%rip), \XMM1, \XMM2
2029                vpaddd  ONE(%rip), \XMM2, \XMM3
2030                vpaddd  ONE(%rip), \XMM3, \XMM4
2031                vpaddd  ONE(%rip), \XMM4, \XMM5
2032                vpaddd  ONE(%rip), \XMM5, \XMM6
2033                vpaddd  ONE(%rip), \XMM6, \XMM7
2034                vpaddd  ONE(%rip), \XMM7, \XMM8
2035                vmovdqa \XMM8, \CTR
2036
2037                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2038                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2039                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2040                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2041                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2042                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2043                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2044                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2045.else
2046                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2047                vpaddd  ONEf(%rip), \XMM1, \XMM2
2048                vpaddd  ONEf(%rip), \XMM2, \XMM3
2049                vpaddd  ONEf(%rip), \XMM3, \XMM4
2050                vpaddd  ONEf(%rip), \XMM4, \XMM5
2051                vpaddd  ONEf(%rip), \XMM5, \XMM6
2052                vpaddd  ONEf(%rip), \XMM6, \XMM7
2053                vpaddd  ONEf(%rip), \XMM7, \XMM8
2054                vmovdqa \XMM8, \CTR
2055.endif
2056
2057
2058        #######################################################################
2059
2060                vmovdqu (arg1), \T1
2061                vpxor   \T1, \XMM1, \XMM1
2062                vpxor   \T1, \XMM2, \XMM2
2063                vpxor   \T1, \XMM3, \XMM3
2064                vpxor   \T1, \XMM4, \XMM4
2065                vpxor   \T1, \XMM5, \XMM5
2066                vpxor   \T1, \XMM6, \XMM6
2067                vpxor   \T1, \XMM7, \XMM7
2068                vpxor   \T1, \XMM8, \XMM8
2069
2070        #######################################################################
2071
2072
2073
2074
2075
2076                vmovdqu 16*1(arg1), \T1
2077                vaesenc \T1, \XMM1, \XMM1
2078                vaesenc \T1, \XMM2, \XMM2
2079                vaesenc \T1, \XMM3, \XMM3
2080                vaesenc \T1, \XMM4, \XMM4
2081                vaesenc \T1, \XMM5, \XMM5
2082                vaesenc \T1, \XMM6, \XMM6
2083                vaesenc \T1, \XMM7, \XMM7
2084                vaesenc \T1, \XMM8, \XMM8
2085
2086                vmovdqu 16*2(arg1), \T1
2087                vaesenc \T1, \XMM1, \XMM1
2088                vaesenc \T1, \XMM2, \XMM2
2089                vaesenc \T1, \XMM3, \XMM3
2090                vaesenc \T1, \XMM4, \XMM4
2091                vaesenc \T1, \XMM5, \XMM5
2092                vaesenc \T1, \XMM6, \XMM6
2093                vaesenc \T1, \XMM7, \XMM7
2094                vaesenc \T1, \XMM8, \XMM8
2095
2096
2097        #######################################################################
2098
2099        vmovdqa         HashKey_8(arg1), \T5
2100        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2101        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2102        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2103        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2104        vpxor           \T5, \T6, \T6
2105
2106                vmovdqu 16*3(arg1), \T1
2107                vaesenc \T1, \XMM1, \XMM1
2108                vaesenc \T1, \XMM2, \XMM2
2109                vaesenc \T1, \XMM3, \XMM3
2110                vaesenc \T1, \XMM4, \XMM4
2111                vaesenc \T1, \XMM5, \XMM5
2112                vaesenc \T1, \XMM6, \XMM6
2113                vaesenc \T1, \XMM7, \XMM7
2114                vaesenc \T1, \XMM8, \XMM8
2115
2116        vmovdqa         TMP2(%rsp), \T1
2117        vmovdqa         HashKey_7(arg1), \T5
2118        vpclmulqdq      $0x11, \T5, \T1, \T3
2119        vpxor           \T3, \T4, \T4
2120
2121        vpclmulqdq      $0x00, \T5, \T1, \T3
2122        vpxor           \T3, \T7, \T7
2123
2124        vpclmulqdq      $0x01, \T5, \T1, \T3
2125        vpxor           \T3, \T6, \T6
2126
2127        vpclmulqdq      $0x10, \T5, \T1, \T3
2128        vpxor           \T3, \T6, \T6
2129
2130                vmovdqu 16*4(arg1), \T1
2131                vaesenc \T1, \XMM1, \XMM1
2132                vaesenc \T1, \XMM2, \XMM2
2133                vaesenc \T1, \XMM3, \XMM3
2134                vaesenc \T1, \XMM4, \XMM4
2135                vaesenc \T1, \XMM5, \XMM5
2136                vaesenc \T1, \XMM6, \XMM6
2137                vaesenc \T1, \XMM7, \XMM7
2138                vaesenc \T1, \XMM8, \XMM8
2139
2140        #######################################################################
2141
2142        vmovdqa         TMP3(%rsp), \T1
2143        vmovdqa         HashKey_6(arg1), \T5
2144        vpclmulqdq      $0x11, \T5, \T1, \T3
2145        vpxor           \T3, \T4, \T4
2146
2147        vpclmulqdq      $0x00, \T5, \T1, \T3
2148        vpxor           \T3, \T7, \T7
2149
2150        vpclmulqdq      $0x01, \T5, \T1, \T3
2151        vpxor           \T3, \T6, \T6
2152
2153        vpclmulqdq      $0x10, \T5, \T1, \T3
2154        vpxor           \T3, \T6, \T6
2155
2156                vmovdqu 16*5(arg1), \T1
2157                vaesenc \T1, \XMM1, \XMM1
2158                vaesenc \T1, \XMM2, \XMM2
2159                vaesenc \T1, \XMM3, \XMM3
2160                vaesenc \T1, \XMM4, \XMM4
2161                vaesenc \T1, \XMM5, \XMM5
2162                vaesenc \T1, \XMM6, \XMM6
2163                vaesenc \T1, \XMM7, \XMM7
2164                vaesenc \T1, \XMM8, \XMM8
2165
2166        vmovdqa         TMP4(%rsp), \T1
2167        vmovdqa         HashKey_5(arg1), \T5
2168        vpclmulqdq      $0x11, \T5, \T1, \T3
2169        vpxor           \T3, \T4, \T4
2170
2171        vpclmulqdq      $0x00, \T5, \T1, \T3
2172        vpxor           \T3, \T7, \T7
2173
2174        vpclmulqdq      $0x01, \T5, \T1, \T3
2175        vpxor           \T3, \T6, \T6
2176
2177        vpclmulqdq      $0x10, \T5, \T1, \T3
2178        vpxor           \T3, \T6, \T6
2179
2180                vmovdqu 16*6(arg1), \T1
2181                vaesenc \T1, \XMM1, \XMM1
2182                vaesenc \T1, \XMM2, \XMM2
2183                vaesenc \T1, \XMM3, \XMM3
2184                vaesenc \T1, \XMM4, \XMM4
2185                vaesenc \T1, \XMM5, \XMM5
2186                vaesenc \T1, \XMM6, \XMM6
2187                vaesenc \T1, \XMM7, \XMM7
2188                vaesenc \T1, \XMM8, \XMM8
2189
2190
2191        vmovdqa         TMP5(%rsp), \T1
2192        vmovdqa         HashKey_4(arg1), \T5
2193        vpclmulqdq      $0x11, \T5, \T1, \T3
2194        vpxor           \T3, \T4, \T4
2195
2196        vpclmulqdq      $0x00, \T5, \T1, \T3
2197        vpxor           \T3, \T7, \T7
2198
2199        vpclmulqdq      $0x01, \T5, \T1, \T3
2200        vpxor           \T3, \T6, \T6
2201
2202        vpclmulqdq      $0x10, \T5, \T1, \T3
2203        vpxor           \T3, \T6, \T6
2204
2205                vmovdqu 16*7(arg1), \T1
2206                vaesenc \T1, \XMM1, \XMM1
2207                vaesenc \T1, \XMM2, \XMM2
2208                vaesenc \T1, \XMM3, \XMM3
2209                vaesenc \T1, \XMM4, \XMM4
2210                vaesenc \T1, \XMM5, \XMM5
2211                vaesenc \T1, \XMM6, \XMM6
2212                vaesenc \T1, \XMM7, \XMM7
2213                vaesenc \T1, \XMM8, \XMM8
2214
2215        vmovdqa         TMP6(%rsp), \T1
2216        vmovdqa         HashKey_3(arg1), \T5
2217        vpclmulqdq      $0x11, \T5, \T1, \T3
2218        vpxor           \T3, \T4, \T4
2219
2220        vpclmulqdq      $0x00, \T5, \T1, \T3
2221        vpxor           \T3, \T7, \T7
2222
2223        vpclmulqdq      $0x01, \T5, \T1, \T3
2224        vpxor           \T3, \T6, \T6
2225
2226        vpclmulqdq      $0x10, \T5, \T1, \T3
2227        vpxor           \T3, \T6, \T6
2228
2229                vmovdqu 16*8(arg1), \T1
2230                vaesenc \T1, \XMM1, \XMM1
2231                vaesenc \T1, \XMM2, \XMM2
2232                vaesenc \T1, \XMM3, \XMM3
2233                vaesenc \T1, \XMM4, \XMM4
2234                vaesenc \T1, \XMM5, \XMM5
2235                vaesenc \T1, \XMM6, \XMM6
2236                vaesenc \T1, \XMM7, \XMM7
2237                vaesenc \T1, \XMM8, \XMM8
2238
2239        vmovdqa         TMP7(%rsp), \T1
2240        vmovdqa         HashKey_2(arg1), \T5
2241        vpclmulqdq      $0x11, \T5, \T1, \T3
2242        vpxor           \T3, \T4, \T4
2243
2244        vpclmulqdq      $0x00, \T5, \T1, \T3
2245        vpxor           \T3, \T7, \T7
2246
2247        vpclmulqdq      $0x01, \T5, \T1, \T3
2248        vpxor           \T3, \T6, \T6
2249
2250        vpclmulqdq      $0x10, \T5, \T1, \T3
2251        vpxor           \T3, \T6, \T6
2252
2253
2254        #######################################################################
2255
2256                vmovdqu 16*9(arg1), \T5
2257                vaesenc \T5, \XMM1, \XMM1
2258                vaesenc \T5, \XMM2, \XMM2
2259                vaesenc \T5, \XMM3, \XMM3
2260                vaesenc \T5, \XMM4, \XMM4
2261                vaesenc \T5, \XMM5, \XMM5
2262                vaesenc \T5, \XMM6, \XMM6
2263                vaesenc \T5, \XMM7, \XMM7
2264                vaesenc \T5, \XMM8, \XMM8
2265
2266        vmovdqa         TMP8(%rsp), \T1
2267        vmovdqa         HashKey(arg1), \T5
2268
2269        vpclmulqdq      $0x00, \T5, \T1, \T3
2270        vpxor           \T3, \T7, \T7
2271
2272        vpclmulqdq      $0x01, \T5, \T1, \T3
2273        vpxor           \T3, \T6, \T6
2274
2275        vpclmulqdq      $0x10, \T5, \T1, \T3
2276        vpxor           \T3, \T6, \T6
2277
2278        vpclmulqdq      $0x11, \T5, \T1, \T3
2279        vpxor           \T3, \T4, \T1
2280
2281
2282                vmovdqu 16*10(arg1), \T5
2283
2284        i = 0
2285        j = 1
2286        setreg
2287.rep 8
2288                vpxor   16*i(arg3, %r11), \T5, \T2
2289                .if \ENC_DEC == ENC
2290                vaesenclast     \T2, reg_j, reg_j
2291                .else
2292                vaesenclast     \T2, reg_j, \T3
2293                vmovdqu 16*i(arg3, %r11), reg_j
2294                vmovdqu \T3, 16*i(arg2, %r11)
2295                .endif
2296        i = (i+1)
2297        j = (j+1)
2298        setreg
2299.endr
2300        #######################################################################
2301
2302
2303        vpslldq $8, \T6, \T3                            # T3 = T6 shifted left 2 DWs
2304        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
2305        vpxor   \T3, \T7, \T7
2306        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7
2307
2308
2309
2310        #######################################################################
2311        #first phase of the reduction
2312        vmovdqa         POLY2(%rip), \T3
2313
2314        vpclmulqdq      $0x01, \T7, \T3, \T2
2315        vpslldq         $8, \T2, \T2                    # shift-L xmm2 2 DWs
2316
2317        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
2318        #######################################################################
2319                .if \ENC_DEC == ENC
2320                vmovdqu  \XMM1, 16*0(arg2,%r11)         # Write to the Ciphertext buffer
2321                vmovdqu  \XMM2, 16*1(arg2,%r11)         # Write to the Ciphertext buffer
2322                vmovdqu  \XMM3, 16*2(arg2,%r11)         # Write to the Ciphertext buffer
2323                vmovdqu  \XMM4, 16*3(arg2,%r11)         # Write to the Ciphertext buffer
2324                vmovdqu  \XMM5, 16*4(arg2,%r11)         # Write to the Ciphertext buffer
2325                vmovdqu  \XMM6, 16*5(arg2,%r11)         # Write to the Ciphertext buffer
2326                vmovdqu  \XMM7, 16*6(arg2,%r11)         # Write to the Ciphertext buffer
2327                vmovdqu  \XMM8, 16*7(arg2,%r11)         # Write to the Ciphertext buffer
2328                .endif
2329
2330        #######################################################################
2331        #second phase of the reduction
2332        vpclmulqdq      $0x00, \T7, \T3, \T2
2333        vpsrldq         $4, \T2, \T2                    # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2334
2335        vpclmulqdq      $0x10, \T7, \T3, \T4
2336        vpslldq         $4, \T4, \T4                    # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2337
2338        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
2339        #######################################################################
2340        vpxor           \T4, \T1, \T1                   # the result is in T1
2341
2342                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
2343                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
2344                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
2345                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
2346                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
2347                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
2348                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
2349                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
2350
2351
2352        vpxor   \T1, \XMM1, \XMM1
2353
2354
2355
2356.endm
2357
2358
2359# GHASH the last 8 ciphertext blocks.
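# Karatsuba form: for each block Xi and key power Hi, three vpclmulqdq's
# give hi = Xi.hi*Hi.hi, lo = Xi.lo*Hi.lo and mid = (Xi.hi^Xi.lo)*(Hi.hi^Hi.lo);
# the true middle term is mid ^ hi ^ lo, which the XMM1/T6/T7 accumulators
# recover before the final POLY2 reduction.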
2360.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2361
2362        ## Karatsuba Method
2363
2364        vmovdqa         HashKey_8(arg1), \T5
2365
2366        vpshufd         $0b01001110, \XMM1, \T2
2367        vpshufd         $0b01001110, \T5, \T3
2368        vpxor           \XMM1, \T2, \T2
2369        vpxor           \T5, \T3, \T3
2370
2371        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2372        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2373
2374        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2375
2376        ######################
2377
2378        vmovdqa         HashKey_7(arg1), \T5
2379        vpshufd         $0b01001110, \XMM2, \T2
2380        vpshufd         $0b01001110, \T5, \T3
2381        vpxor           \XMM2, \T2, \T2
2382        vpxor           \T5, \T3, \T3
2383
2384        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2385        vpxor           \T4, \T6, \T6
2386
2387        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2388        vpxor           \T4, \T7, \T7
2389
2390        vpclmulqdq      $0x00, \T3, \T2, \T2
2391
2392        vpxor           \T2, \XMM1, \XMM1
2393
2394        ######################
2395
2396        vmovdqa         HashKey_6(arg1), \T5
2397        vpshufd         $0b01001110, \XMM3, \T2
2398        vpshufd         $0b01001110, \T5, \T3
2399        vpxor           \XMM3, \T2, \T2
2400        vpxor           \T5, \T3, \T3
2401
2402        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2403        vpxor           \T4, \T6, \T6
2404
2405        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2406        vpxor           \T4, \T7, \T7
2407
2408        vpclmulqdq      $0x00, \T3, \T2, \T2
2409
2410        vpxor           \T2, \XMM1, \XMM1
2411
2412        ######################
2413
2414        vmovdqa         HashKey_5(arg1), \T5
2415        vpshufd         $0b01001110, \XMM4, \T2
2416        vpshufd         $0b01001110, \T5, \T3
2417        vpxor           \XMM4, \T2, \T2
2418        vpxor           \T5, \T3, \T3
2419
2420        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2421        vpxor           \T4, \T6, \T6
2422
2423        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2424        vpxor           \T4, \T7, \T7
2425
2426        vpclmulqdq      $0x00, \T3, \T2, \T2
2427
2428        vpxor           \T2, \XMM1, \XMM1
2429
2430        ######################
2431
2432        vmovdqa         HashKey_4(arg1), \T5
2433        vpshufd         $0b01001110, \XMM5, \T2
2434        vpshufd         $0b01001110, \T5, \T3
2435        vpxor           \XMM5, \T2, \T2
2436        vpxor           \T5, \T3, \T3
2437
2438        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2439        vpxor           \T4, \T6, \T6
2440
2441        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2442        vpxor           \T4, \T7, \T7
2443
2444        vpclmulqdq      $0x00, \T3, \T2, \T2
2445
2446        vpxor           \T2, \XMM1, \XMM1
2447
2448        ######################
2449
2450        vmovdqa         HashKey_3(arg1), \T5
2451        vpshufd         $0b01001110, \XMM6, \T2
2452        vpshufd         $0b01001110, \T5, \T3
2453        vpxor           \XMM6, \T2, \T2
2454        vpxor           \T5, \T3, \T3
2455
2456        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2457        vpxor           \T4, \T6, \T6
2458
2459        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2460        vpxor           \T4, \T7, \T7
2461
2462        vpclmulqdq      $0x00, \T3, \T2, \T2
2463
2464        vpxor           \T2, \XMM1, \XMM1
2465
2466        ######################
2467
2468        vmovdqa         HashKey_2(arg1), \T5
2469        vpshufd         $0b01001110, \XMM7, \T2
2470        vpshufd         $0b01001110, \T5, \T3
2471        vpxor           \XMM7, \T2, \T2
2472        vpxor           \T5, \T3, \T3
2473
2474        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2475        vpxor           \T4, \T6, \T6
2476
2477        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2478        vpxor           \T4, \T7, \T7
2479
2480        vpclmulqdq      $0x00, \T3, \T2, \T2
2481
2482        vpxor           \T2, \XMM1, \XMM1
2483
2484        ######################
2485
2486        vmovdqa         HashKey(arg1), \T5
2487        vpshufd         $0b01001110, \XMM8, \T2
2488        vpshufd         $0b01001110, \T5, \T3
2489        vpxor           \XMM8, \T2, \T2
2490        vpxor           \T5, \T3, \T3
2491
2492        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2493        vpxor           \T4, \T6, \T6
2494
2495        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2496        vpxor           \T4, \T7, \T7
2497
2498        vpclmulqdq      $0x00, \T3, \T2, \T2
2499
2500        vpxor           \T2, \XMM1, \XMM1
2501        vpxor           \T6, \XMM1, \XMM1
2502        vpxor           \T7, \XMM1, \T2
2503
2504
2505
2506
2507        vpslldq $8, \T2, \T4
2508        vpsrldq $8, \T2, \T2
2509
2510        vpxor   \T4, \T7, \T7
2511        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2512                                                   # accumulated carry-less multiplications
2513
2514        #######################################################################
2515        #first phase of the reduction
2516        vmovdqa         POLY2(%rip), \T3
2517
2518        vpclmulqdq      $0x01, \T7, \T3, \T2
2519        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2520
2521        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2522        #######################################################################
2523
2524
2525        #second phase of the reduction
2526        vpclmulqdq      $0x00, \T7, \T3, \T2
2527        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2528
2529        vpclmulqdq      $0x10, \T7, \T3, \T4
2530        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2531
2532        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2533        #######################################################################
2534        vpxor           \T4, \T6, \T6              # the result is in T6
2535.endm
2536
2537
2538
2539# combined for GCM encrypt and decrypt functions
2540# clobbering all xmm registers
2541# clobbering r10, r11, r12, r13, r14, r15
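# Rough flow: GHASH the AAD and encrypt (len/16 mod 8) initial blocks, run
# the 8-blocks-at-a-time loop that interleaves CTR encryption with GHASH of
# the previous eight ciphertext blocks, handle a final partial block if
# len mod 16 != 0, then GHASH len(A)||len(C) and XOR with E(K, Y0) to
# produce the tag.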
2542.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2543
2544        #the number of pushes must equal STACK_OFFSET
2545        push    %r12
2546        push    %r13
2547        push    %r14
2548        push    %r15
2549
2550        mov     %rsp, %r14
2551
2552
2553
2554
2555        sub     $VARIABLE_OFFSET, %rsp
2556        and     $~63, %rsp                         # align rsp to 64 bytes
2557
2558
2559        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2560
2561        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2562        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2563
2564        mov     %r13, %r12
2565        shr     $4, %r12
2566        and     $7, %r12
2567        jz      _initial_num_blocks_is_0\@
2568
2569        cmp     $7, %r12
2570        je      _initial_num_blocks_is_7\@
2571        cmp     $6, %r12
2572        je      _initial_num_blocks_is_6\@
2573        cmp     $5, %r12
2574        je      _initial_num_blocks_is_5\@
2575        cmp     $4, %r12
2576        je      _initial_num_blocks_is_4\@
2577        cmp     $3, %r12
2578        je      _initial_num_blocks_is_3\@
2579        cmp     $2, %r12
2580        je      _initial_num_blocks_is_2\@
2581
2582        jmp     _initial_num_blocks_is_1\@
2583
2584_initial_num_blocks_is_7\@:
2585        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2586        sub     $16*7, %r13
2587        jmp     _initial_blocks_encrypted\@
2588
2589_initial_num_blocks_is_6\@:
2590        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2591        sub     $16*6, %r13
2592        jmp     _initial_blocks_encrypted\@
2593
2594_initial_num_blocks_is_5\@:
2595        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2596        sub     $16*5, %r13
2597        jmp     _initial_blocks_encrypted\@
2598
2599_initial_num_blocks_is_4\@:
2600        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2601        sub     $16*4, %r13
2602        jmp     _initial_blocks_encrypted\@
2603
2604_initial_num_blocks_is_3\@:
2605        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2606        sub     $16*3, %r13
2607        jmp     _initial_blocks_encrypted\@
2608
2609_initial_num_blocks_is_2\@:
2610        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2611        sub     $16*2, %r13
2612        jmp     _initial_blocks_encrypted\@
2613
2614_initial_num_blocks_is_1\@:
2615        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2616        sub     $16*1, %r13
2617        jmp     _initial_blocks_encrypted\@
2618
2619_initial_num_blocks_is_0\@:
2620        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2621
2622
2623_initial_blocks_encrypted\@:
2624        cmp     $0, %r13
2625        je      _zero_cipher_left\@
2626
2627        sub     $128, %r13
2628        je      _eight_cipher_left\@
2629
2630
2631
2632
2633        vmovd   %xmm9, %r15d
2634        and     $255, %r15d
2635        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2636
2637
2638_encrypt_by_8_new\@:
2639        cmp     $(255-8), %r15d
2640        jg      _encrypt_by_8\@
2641
2642
2643
2644        add     $8, %r15b
2645        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2646        add     $128, %r11
2647        sub     $128, %r13
2648        jne     _encrypt_by_8_new\@
2649
2650        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651        jmp     _eight_cipher_left\@
2652
2653_encrypt_by_8\@:
2654        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2655        add     $8, %r15b
2656        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2658        add     $128, %r11
2659        sub     $128, %r13
2660        jne     _encrypt_by_8_new\@
2661
2662        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2663
2664
2665
2666
2667_eight_cipher_left\@:
2668        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2669
2670
2671_zero_cipher_left\@:
2672        cmp     $16, arg4
2673        jl      _only_less_than_16\@
2674
2675        mov     arg4, %r13
2676        and     $15, %r13                            # r13 = (arg4 mod 16)
2677
2678        je      _multiple_of_16_bytes\@
2679
2680        # handle the last <16 Byte block separately
2681
2682
2683        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2684        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2686
2687        sub     $16, %r11
2688        add     %r13, %r11
2689        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2690
2691        lea     SHIFT_MASK+16(%rip), %r12
2692        sub     %r13, %r12                           # adjust the shuffle mask pointer
2693                                                     # to be able to shift 16-r13 bytes
2694                                                     # (r13 is the number of bytes in plaintext mod 16)
2695        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2696        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2697        jmp     _final_ghash_mul\@
2698
2699_only_less_than_16\@:
2700        # check for 0 length
2701        mov     arg4, %r13
2702        and     $15, %r13                            # r13 = (arg4 mod 16)
2703
2704        je      _multiple_of_16_bytes\@
2705
2706        # handle the last <16 Byte block separately
2707
2708
2709        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2710        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2712
2713
2714        lea     SHIFT_MASK+16(%rip), %r12
2715        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2716                                                     # able to shift 16-r13 bytes (r13 is the
2717                                                     # number of bytes in plaintext mod 16)
2718
2719_get_last_16_byte_loop\@:
2720        movb    (arg3, %r11),  %al
2721        movb    %al,  TMP1 (%rsp , %r11)
2722        add     $1, %r11
2723        cmp     %r13,  %r11
2724        jne     _get_last_16_byte_loop\@
2725
2726        vmovdqu  TMP1(%rsp), %xmm1
2727
2728        sub     $16, %r11
2729
2730_final_ghash_mul\@:
2731        .if  \ENC_DEC ==  DEC
2732        vmovdqa %xmm1, %xmm2
2733        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2734        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2735        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2736        vpand   %xmm1, %xmm2, %xmm2
2737        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738        vpxor   %xmm2, %xmm14, %xmm14
2739        #GHASH computation for the last <16 Byte block
2740        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2741        sub     %r13, %r11
2742        add     $16, %r11
2743        .else
2744        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2745        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2746        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2747        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748        vpxor   %xmm9, %xmm14, %xmm14
2749        #GHASH computation for the last <16 Byte block
2750        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2751        sub     %r13, %r11
2752        add     $16, %r11
2753        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2754        .endif
2755
2756
2757        #############################
2758        # output r13 Bytes
2759        vmovq   %xmm9, %rax
2760        cmp     $8, %r13
2761        jle     _less_than_8_bytes_left\@
2762
2763        mov     %rax, (arg2 , %r11)
2764        add     $8, %r11
2765        vpsrldq $8, %xmm9, %xmm9
2766        vmovq   %xmm9, %rax
2767        sub     $8, %r13
2768
2769_less_than_8_bytes_left\@:
2770        movb    %al, (arg2 , %r11)
2771        add     $1, %r11
2772        shr     $8, %rax
2773        sub     $1, %r13
2774        jne     _less_than_8_bytes_left\@
2775        #############################
2776
2777_multiple_of_16_bytes\@:
2778        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2779        shl     $3, %r12                             # convert into number of bits
2780        vmovd   %r12d, %xmm15                        # len(A) in xmm15
2781
2782        shl     $3, arg4                             # len(C) in bits  (*8)
2783        vmovq   arg4, %xmm1
2784        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2785        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
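        ## xmm15 is now the GCM length block: the bit length of the AAD in the
        ## high 64 bits and the bit length of the ciphertext in the low 64
        ## bits.  Roughly, on an illustrative 128-bit C type:
        ##
        ##      len_block = ((u128)(aad_len * 8) << 64) | (u128)(plaintext_len * 8);
        ##
        ## Folding it into xmm14 and reducing (below) completes
        ## S = GHASH(H, A || C || len(A) || len(C)).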
2786
2787        vpxor   %xmm15, %xmm14, %xmm14
2788        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2789        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2790
2791        mov     arg5, %rax                           # rax = *Y0
2792        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2793
2794        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2795
2796        vpxor   %xmm14, %xmm9, %xmm9
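        ## xmm9 = E(K, Y0) XOR S, i.e. the full 16-byte authentication tag;
        ## it is truncated to auth_tag_len at _return_T below.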
2797
2798
2799
2800_return_T\@:
2801        mov     arg8, %r10              # r10 = authTag
2802        mov     arg9, %r11              # r11 = auth_tag_len
2803
2804        cmp     $16, %r11
2805        je      _T_16\@
2806
2807        cmp     $8, %r11
2808        jl      _T_4\@
2809
2810_T_8\@:
2811        vmovq   %xmm9, %rax
2812        mov     %rax, (%r10)
2813        add     $8, %r10
2814        sub     $8, %r11
2815        vpsrldq $8, %xmm9, %xmm9
2816        cmp     $0, %r11
2817        je     _return_T_done\@
2818_T_4\@:
2819        vmovd   %xmm9, %eax
2820        mov     %eax, (%r10)
2821        add     $4, %r10
2822        sub     $4, %r11
2823        vpsrldq     $4, %xmm9, %xmm9
2824        cmp     $0, %r11
2825        je     _return_T_done\@
2826_T_123\@:
2827        vmovd     %xmm9, %eax
2828        cmp     $2, %r11
2829        jl     _T_1\@
2830        mov     %ax, (%r10)
2831        cmp     $2, %r11
2832        je     _return_T_done\@
2833        add     $2, %r10
2834        sar     $16, %eax
2835_T_1\@:
2836        mov     %al, (%r10)
2837        jmp     _return_T_done\@
2838
2839_T_16\@:
2840        vmovdqu %xmm9, (%r10)
2841
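        ## The branches above write the tag out in the largest available
        ## pieces.  For the documented tag lengths (16, 12, 8) this amounts
        ## to, in illustrative C:
        ##
        ##      if (auth_tag_len == 16) {
        ##              memcpy(auth_tag, tag, 16);
        ##      } else {                        /* 12 or 8 */
        ##              memcpy(auth_tag, tag, 8);
        ##              if (auth_tag_len == 12)
        ##                      memcpy(auth_tag + 8, tag + 8, 4);
        ##      }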
2842_return_T_done\@:
2843        mov     %r14, %rsp
2844
2845        pop     %r15
2846        pop     %r14
2847        pop     %r13
2848        pop     %r12
2849.endm
2850
2851
2852#############################################################
2853#void   aesni_gcm_precomp_avx_gen4
2854#        (gcm_data     *my_ctx_data,
2855#        u8     *hash_subkey)# /* H, the Hash sub key input.
2856#                               Data starts on a 16-byte boundary. */
2857#############################################################
2858ENTRY(aesni_gcm_precomp_avx_gen4)
2859        # the number of pushes must equal STACK_OFFSET
2860        push    %r12
2861        push    %r13
2862        push    %r14
2863        push    %r15
2864
2865        mov     %rsp, %r14
2866
2867
2868
2869        sub     $VARIABLE_OFFSET, %rsp
2870        and     $~63, %rsp                    # align rsp to 64 bytes
2871
2872        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
2873
2874        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
2875        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2876        vmovdqa  %xmm6, %xmm2
2877        vpsllq   $1, %xmm6, %xmm6
2878        vpsrlq   $63, %xmm2, %xmm2
2879        vmovdqa  %xmm2, %xmm1
2880        vpslldq  $8, %xmm2, %xmm2
2881        vpsrldq  $8, %xmm1, %xmm1
2882        vpor     %xmm2, %xmm6, %xmm6
2883        #reduction
2884        vpshufd  $0b00100100, %xmm1, %xmm2
2885        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886        vpand    POLY(%rip), %xmm2, %xmm2
2887        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
2888        #######################################################################
2889        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
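        ## The sequence above computes HashKey<<1 reduced by the GHASH
        ## polynomial (the POLY constant), on the byte-swapped HashKey as
        ## expected by PRECOMPUTE_AVX2/GHASH_MUL_AVX2.  Conceptually, on an
        ## illustrative 128-bit C type:
        ##
        ##      h = byteswap(load128(hash_subkey));
        ##      carry = h >> 127;               /* bit shifted out at the top */
        ##      h <<= 1;
        ##      if (carry)
        ##              h ^= POLY;              /* conditional reduction      */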
2890
2891
2892        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2893
2894        mov     %r14, %rsp
2895
2896        pop     %r15
2897        pop     %r14
2898        pop     %r13
2899        pop     %r12
2900        ret
2901ENDPROC(aesni_gcm_precomp_avx_gen4)
2902
2903
2904###############################################################################
2905#void   aesni_gcm_enc_avx_gen4(
2906#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2907#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2908#        const   u8 *in, /* Plaintext input */
2909#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2910#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2911#                       (from Security Association) concatenated with 8 byte
2912#                        Initialization Vector (from IPSec ESP Payload)
2913#                        concatenated with 0x00000001. 16-byte aligned pointer. */
2914#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2915#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2916#        u8      *auth_tag, /* Authenticated Tag output. */
2917#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2918#                               Valid values are 16 (most likely), 12 or 8. */
2919###############################################################################
2920ENTRY(aesni_gcm_enc_avx_gen4)
2921        GCM_ENC_DEC_AVX2     ENC
2922        ret
2923ENDPROC(aesni_gcm_enc_avx_gen4)
2924
2925###############################################################################
2926#void   aesni_gcm_dec_avx_gen4(
2927#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2928#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2929#        const   u8 *in, /* Ciphertext input */
2930#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
2931#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2932#                       (from Security Association) concatenated with 8 byte
2933#                       Initialization Vector (from IPSec ESP Payload)
2934#                       concatenated with 0x00000001. 16-byte aligned pointer. */
2935#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2936#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2937#        u8      *auth_tag, /* Authenticated Tag output. */
2938#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2939#                               Valid values are 16 (most likely), 12 or 8. */
2940###############################################################################
2941ENTRY(aesni_gcm_dec_avx_gen4)
2942        GCM_ENC_DEC_AVX2     DEC
2943        ret
2944ENDPROC(aesni_gcm_dec_avx_gen4)
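###############################################################################
# Typical call sequence from C, per the prototypes in the comment blocks
# above (a sketch; buffer names are illustrative):
#
#       aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
#       aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv,
#                              aad, aad_len, auth_tag, auth_tag_len);
#       /* or aesni_gcm_dec_avx_gen4() with the same argument layout, taking
#          ciphertext as input and producing plaintext */
###############################################################################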
2945
2946#endif /* CONFIG_AS_AVX2 */
2947