linux/arch/x86/crypto/aesni-intel_avx-x86_64.S
   1########################################################################
   2# Copyright (c) 2013, Intel Corporation
   3#
   4# This software is available to you under a choice of one of two
   5# licenses.  You may choose to be licensed under the terms of the GNU
   6# General Public License (GPL) Version 2, available from the file
   7# COPYING in the main directory of this source tree, or the
   8# OpenIB.org BSD license below:
   9#
  10# Redistribution and use in source and binary forms, with or without
  11# modification, are permitted provided that the following conditions are
  12# met:
  13#
  14# * Redistributions of source code must retain the above copyright
  15#   notice, this list of conditions and the following disclaimer.
  16#
  17# * Redistributions in binary form must reproduce the above copyright
  18#   notice, this list of conditions and the following disclaimer in the
  19#   documentation and/or other materials provided with the
  20#   distribution.
  21#
  22# * Neither the name of the Intel Corporation nor the names of its
  23#   contributors may be used to endorse or promote products derived from
  24#   this software without specific prior written permission.
  25#
  26#
  27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38########################################################################
  39##
  40## Authors:
  41##      Erdinc Ozturk <erdinc.ozturk@intel.com>
  42##      Vinodh Gopal <vinodh.gopal@intel.com>
  43##      James Guilford <james.guilford@intel.com>
  44##      Tim Chen <tim.c.chen@linux.intel.com>
  45##
  46## References:
  47##       This code was derived and highly optimized from the code described in paper:
  48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
  49##                      on Intel Architecture Processors. August, 2010
  50##       The details of the implementation is explained in:
  51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
  52##                      on Intel Architecture Processors. October, 2012.
  53##
  54## Assumptions:
  55##
  56##
  57##
  58## iv:
  59##       0                   1                   2                   3
  60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62##       |                             Salt  (From the SA)               |
  63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64##       |                     Initialization Vector                     |
  65##       |         (This is the sequence number from IPSec header)       |
  66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67##       |                              0x1                              |
  68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69##
  70##
  71##
  72## AAD:
  73##       AAD padded to 128 bits with 0
  74##       for example, assume AAD is a u32 vector
  75##
  76##       if AAD is 8 bytes:
  77##       AAD[3] = {A0, A1}#
  78##       padded AAD in xmm register = {A1 A0 0 0}
  79##
  80##       0                   1                   2                   3
  81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83##       |                               SPI (A1)                        |
  84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85##       |                     32-bit Sequence Number (A0)               |
  86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87##       |                              0x0                              |
  88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89##
  90##                                       AAD Format with 32-bit Sequence Number
  91##
  92##       if AAD is 12 bytes:
  93##       AAD[3] = {A0, A1, A2}#
  94##       padded AAD in xmm register = {A2 A1 A0 0}
  95##
  96##       0                   1                   2                   3
  97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99##       |                               SPI (A2)                        |
 100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 101##       |                 64-bit Extended Sequence Number {A1,A0}       |
 102##       |                                                               |
 103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 104##       |                              0x0                              |
 105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 106##
 107##        AAD Format with 64-bit Extended Sequence Number
 108##
 109##
 110## aadLen:
 111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports an aadLen of 16 bytes.
 113##
 114## TLen:
 115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 116##
 117## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentation is used: one tab is
## for the GHASH part, two tabs are for the AES part.
 120##
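##
## As an aside, a minimal C sketch of how a caller could assemble the
## 16-byte counter block drawn above (salt || IV || 0x00000001); the
## function name is hypothetical and not part of this file:
##
##      #include <stdint.h>
##      #include <string.h>
##
##      static void build_j0(uint8_t j0[16], const uint8_t salt[4],
##                           const uint8_t iv[8])
##      {
##              memcpy(j0, salt, 4);            /* salt from the SA            */
##              memcpy(j0 + 4, iv, 8);          /* IPsec sequence number as IV */
##              j0[12] = 0;                     /* 32-bit counter = 1,         */
##              j0[13] = 0;                     /* big-endian                  */
##              j0[14] = 0;
##              j0[15] = 1;
##      }
##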
 121
 122#include <linux/linkage.h>
 123#include <asm/inst.h>
 124
 125# constants in mergeable sections, linker can reorder and merge
 126.section        .rodata.cst16.POLY, "aM", @progbits, 16
 127.align 16
 128POLY:            .octa     0xC2000000000000000000000000000001
 129
 130.section        .rodata.cst16.POLY2, "aM", @progbits, 16
 131.align 16
 132POLY2:           .octa     0xC20000000000000000000001C2000000
 133
 134.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
 135.align 16
 136TWOONE:          .octa     0x00000001000000000000000000000001
 137
 138.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
 139.align 16
 140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
 141
 142.section        .rodata.cst16.ONE, "aM", @progbits, 16
 143.align 16
 144ONE:             .octa     0x00000000000000000000000000000001
 145
 146.section        .rodata.cst16.ONEf, "aM", @progbits, 16
 147.align 16
 148ONEf:            .octa     0x01000000000000000000000000000000
 149
 150# order of these constants should not change.
 151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
 152.section        .rodata, "a", @progbits
 153.align 16
 154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
 155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
 156                 .octa     0x00000000000000000000000000000000
 157
 158.section .rodata
 159.align 16
 160.type aad_shift_arr, @object
 161.size aad_shift_arr, 272
 162aad_shift_arr:
 163        .octa     0xffffffffffffffffffffffffffffffff
 164        .octa     0xffffffffffffffffffffffffffffff0C
 165        .octa     0xffffffffffffffffffffffffffff0D0C
 166        .octa     0xffffffffffffffffffffffffff0E0D0C
 167        .octa     0xffffffffffffffffffffffff0F0E0D0C
 168        .octa     0xffffffffffffffffffffff0C0B0A0908
 169        .octa     0xffffffffffffffffffff0D0C0B0A0908
 170        .octa     0xffffffffffffffffff0E0D0C0B0A0908
 171        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
 172        .octa     0xffffffffffffff0C0B0A090807060504
 173        .octa     0xffffffffffff0D0C0B0A090807060504
 174        .octa     0xffffffffff0E0D0C0B0A090807060504
 175        .octa     0xffffffff0F0E0D0C0B0A090807060504
 176        .octa     0xffffff0C0B0A09080706050403020100
 177        .octa     0xffff0D0C0B0A09080706050403020100
 178        .octa     0xff0E0D0C0B0A09080706050403020100
 179        .octa     0x0F0E0D0C0B0A09080706050403020100
 180
 181
 182.text
 183
 184
 185#define AadHash 16*0
 186#define AadLen 16*1
 187#define InLen (16*1)+8
 188#define PBlockEncKey 16*2
 189#define OrigIV 16*3
 190#define CurCount 16*4
 191#define PBlockLen 16*5
 192
 193HashKey        = 16*6   # store HashKey <<1 mod poly here
 194HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
 195HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
 196HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
 197HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
 198HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
 199HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
 200HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
 201HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
 202HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
 203HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
 204HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
 205HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
 206HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
 207HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
 208HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
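#
# For orientation, a hedged C sketch of the context block that arg2 points
# at, matching the byte offsets above (struct and field names here are
# illustrative only, not a definition taken from the kernel headers):
#
#       #include <stdint.h>
#
#       struct gcm_ctx_sketch {
#               uint8_t  aad_hash[16];              /* AadHash,      16*0    */
#               uint64_t aad_length;                /* AadLen,       16*1    */
#               uint64_t in_length;                 /* InLen,        16*1+8  */
#               uint8_t  partial_block_enc_key[16]; /* PBlockEncKey, 16*2    */
#               uint8_t  orig_iv[16];               /* OrigIV,       16*3    */
#               uint8_t  current_counter[16];       /* CurCount,     16*4    */
#               uint64_t partial_block_len;         /* PBlockLen,    16*5    */
#               uint64_t padding;
#               uint8_t  hash_keys[16][16];         /* HashKey..HashKey_8_k,
#                                                      16*6 .. 16*21         */
#       };
#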
 209
 210#define arg1 %rdi
 211#define arg2 %rsi
 212#define arg3 %rdx
 213#define arg4 %rcx
 214#define arg5 %r8
 215#define arg6 %r9
 216#define arg7 STACK_OFFSET+8*1(%r14)
 217#define arg8 STACK_OFFSET+8*2(%r14)
 218#define arg9 STACK_OFFSET+8*3(%r14)
 219#define arg10 STACK_OFFSET+8*4(%r14)
 220#define keysize 2*15*16(arg1)
 221
 222i = 0
 223j = 0
 224
 225out_order = 0
 226in_order = 1
 227DEC = 0
 228ENC = 1
 229
 230.macro define_reg r n
 231reg_\r = %xmm\n
 232.endm
 233
 234.macro setreg
 235.altmacro
 236define_reg i %i
 237define_reg j %j
 238.noaltmacro
 239.endm
 240
# FUNC_SAVE pushes 4 registers before saving %rsp in %r14, so the return
# address sits at STACK_OFFSET(%r14) and the stack arguments (arg7..arg10) follow
 242STACK_OFFSET = 8*4
 243
 244TMP1 =   16*0    # Temporary storage for AAD
 245TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
 246TMP3 =   16*2    # Temporary storage for AES State 3
 247TMP4 =   16*3    # Temporary storage for AES State 4
 248TMP5 =   16*4    # Temporary storage for AES State 5
 249TMP6 =   16*5    # Temporary storage for AES State 6
 250TMP7 =   16*6    # Temporary storage for AES State 7
 251TMP8 =   16*7    # Temporary storage for AES State 8
 252
 253VARIABLE_OFFSET = 16*8
 254
 255################################
 256# Utility Macros
 257################################
 258
 259.macro FUNC_SAVE
        #the number of bytes pushed must equal STACK_OFFSET
 261        push    %r12
 262        push    %r13
 263        push    %r14
 264        push    %r15
 265
 266        mov     %rsp, %r14
 267
 268
 269
 270        sub     $VARIABLE_OFFSET, %rsp
 271        and     $~63, %rsp                    # align rsp to 64 bytes
 272.endm
 273
 274.macro FUNC_RESTORE
 275        mov     %r14, %rsp
 276
 277        pop     %r15
 278        pop     %r14
 279        pop     %r13
 280        pop     %r12
 281.endm
 282
 283# Encryption of a single block
 284.macro ENCRYPT_SINGLE_BLOCK REP XMM0
 285                vpxor    (arg1), \XMM0, \XMM0
 286               i = 1
 287               setreg
 288.rep \REP
 289                vaesenc  16*i(arg1), \XMM0, \XMM0
 290               i = (i+1)
 291               setreg
 292.endr
 293                vaesenclast 16*i(arg1), \XMM0, \XMM0
 294.endm
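#
# A rough C equivalent of ENCRYPT_SINGLE_BLOCK, shown only as a sketch
# (AES-NI intrinsics; REP is the number of middle rounds, 9/11/13 for
# AES-128/192/256; the function name is hypothetical):
#
#       #include <wmmintrin.h>
#
#       static __m128i encrypt_single_block(const __m128i *rk, int rep,
#                                           __m128i blk)
#       {
#               int i;
#
#               blk = _mm_xor_si128(blk, rk[0]);        /* whitening round */
#               for (i = 1; i <= rep; i++)
#                       blk = _mm_aesenc_si128(blk, rk[i]);
#               return _mm_aesenclast_si128(blk, rk[rep + 1]);
#       }
#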
 295
 296# combined for GCM encrypt and decrypt functions
 297# clobbering all xmm registers
 298# clobbering r10, r11, r12, r13, r14, r15
 299.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
 300        vmovdqu AadHash(arg2), %xmm8
 301        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
 302        add arg5, InLen(arg2)
 303
 304        # initialize the data pointer offset as zero
 305        xor     %r11d, %r11d
 306
 307        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
 308        sub %r11, arg5
 309
 310        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
 311        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
 312
 313        mov     %r13, %r12
 314        shr     $4, %r12
 315        and     $7, %r12
 316        jz      _initial_num_blocks_is_0\@
 317
 318        cmp     $7, %r12
 319        je      _initial_num_blocks_is_7\@
 320        cmp     $6, %r12
 321        je      _initial_num_blocks_is_6\@
 322        cmp     $5, %r12
 323        je      _initial_num_blocks_is_5\@
 324        cmp     $4, %r12
 325        je      _initial_num_blocks_is_4\@
 326        cmp     $3, %r12
 327        je      _initial_num_blocks_is_3\@
 328        cmp     $2, %r12
 329        je      _initial_num_blocks_is_2\@
 330
 331        jmp     _initial_num_blocks_is_1\@
 332
 333_initial_num_blocks_is_7\@:
 334        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 335        sub     $16*7, %r13
 336        jmp     _initial_blocks_encrypted\@
 337
 338_initial_num_blocks_is_6\@:
 339        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 340        sub     $16*6, %r13
 341        jmp     _initial_blocks_encrypted\@
 342
 343_initial_num_blocks_is_5\@:
 344        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 345        sub     $16*5, %r13
 346        jmp     _initial_blocks_encrypted\@
 347
 348_initial_num_blocks_is_4\@:
 349        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 350        sub     $16*4, %r13
 351        jmp     _initial_blocks_encrypted\@
 352
 353_initial_num_blocks_is_3\@:
 354        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 355        sub     $16*3, %r13
 356        jmp     _initial_blocks_encrypted\@
 357
 358_initial_num_blocks_is_2\@:
 359        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 360        sub     $16*2, %r13
 361        jmp     _initial_blocks_encrypted\@
 362
 363_initial_num_blocks_is_1\@:
 364        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 365        sub     $16*1, %r13
 366        jmp     _initial_blocks_encrypted\@
 367
 368_initial_num_blocks_is_0\@:
 369        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 370
 371
 372_initial_blocks_encrypted\@:
 373        cmp     $0, %r13
 374        je      _zero_cipher_left\@
 375
 376        sub     $128, %r13
 377        je      _eight_cipher_left\@
 378
 379
 380
 381
 382        vmovd   %xmm9, %r15d
 383        and     $255, %r15d
 384        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 385
 386
 387_encrypt_by_8_new\@:
 388        cmp     $(255-8), %r15d
 389        jg      _encrypt_by_8\@
 390
 391
 392
 393        add     $8, %r15b
 394        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
 395        add     $128, %r11
 396        sub     $128, %r13
 397        jne     _encrypt_by_8_new\@
 398
 399        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 400        jmp     _eight_cipher_left\@
 401
 402_encrypt_by_8\@:
 403        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 404        add     $8, %r15b
 405        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
 406        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 407        add     $128, %r11
 408        sub     $128, %r13
 409        jne     _encrypt_by_8_new\@
 410
 411        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 412
 413
 414
 415
 416_eight_cipher_left\@:
 417        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
 418
 419
 420_zero_cipher_left\@:
 421        vmovdqu %xmm14, AadHash(arg2)
 422        vmovdqu %xmm9, CurCount(arg2)
 423
 424        # check for 0 length
 425        mov     arg5, %r13
 426        and     $15, %r13                            # r13 = (arg5 mod 16)
 427
 428        je      _multiple_of_16_bytes\@
 429
 430        # handle the last <16 Byte block separately
 431
 432        mov %r13, PBlockLen(arg2)
 433
 434        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
 435        vmovdqu %xmm9, CurCount(arg2)
 436        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 437
 438        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
 439        vmovdqu %xmm9, PBlockEncKey(arg2)
 440
 441        cmp $16, arg5
 442        jge _large_enough_update\@
 443
 444        lea (arg4,%r11,1), %r10
 445        mov %r13, %r12
 446
 447        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
 448
 449        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)
 453
 454        jmp _final_ghash_mul\@
 455
 456_large_enough_update\@:
 457        sub $16, %r11
 458        add %r13, %r11
 459
        # read the last <16 byte block
 461        vmovdqu (arg4, %r11, 1), %xmm1
 462
 463        sub     %r13, %r11
 464        add     $16, %r11
 465
 466        lea     SHIFT_MASK+16(%rip), %r12
 467        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 468        # (r13 is the number of bytes in plaintext mod 16)
 469        sub     %r13, %r12
 470        # get the appropriate shuffle mask
 471        vmovdqu (%r12), %xmm2
 472        # shift right 16-r13 bytes
 473        vpshufb  %xmm2, %xmm1, %xmm1
 474
 475_final_ghash_mul\@:
 476        .if  \ENC_DEC ==  DEC
 477        vmovdqa %xmm1, %xmm2
 478        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
 479        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
 480                                                     # mask out top 16-r13 bytes of xmm9
 481        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
 482        vpand   %xmm1, %xmm2, %xmm2
 483        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
 484        vpxor   %xmm2, %xmm14, %xmm14
 485
 486        vmovdqu %xmm14, AadHash(arg2)
 487        .else
 488        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
 489        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
 490                                                     # mask out top 16-r13 bytes of xmm9
 491        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
 492        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 493        vpxor   %xmm9, %xmm14, %xmm14
 494
 495        vmovdqu %xmm14, AadHash(arg2)
 496        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
 497        .endif
 498
 499
 500        #############################
 501        # output r13 Bytes
 502        vmovq   %xmm9, %rax
 503        cmp     $8, %r13
 504        jle     _less_than_8_bytes_left\@
 505
 506        mov     %rax, (arg3 , %r11)
 507        add     $8, %r11
 508        vpsrldq $8, %xmm9, %xmm9
 509        vmovq   %xmm9, %rax
 510        sub     $8, %r13
 511
 512_less_than_8_bytes_left\@:
 513        movb    %al, (arg3 , %r11)
 514        add     $1, %r11
 515        shr     $8, %rax
 516        sub     $1, %r13
 517        jne     _less_than_8_bytes_left\@
 518        #############################
 519
 520_multiple_of_16_bytes\@:
 521.endm
 522
 523
# GCM_COMPLETE: finishes the tag computation, including the last partial block
# Output: Authentication Tag (AUTH_TAG)
 526# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 527.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
 528        vmovdqu AadHash(arg2), %xmm14
 529        vmovdqu HashKey(arg2), %xmm13
 530
 531        mov PBlockLen(arg2), %r12
 532        cmp $0, %r12
 533        je _partial_done\@
 534
 535        #GHASH computation for the last <16 Byte block
 536        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 537
 538_partial_done\@:
 539        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
 540        shl     $3, %r12                             # convert into number of bits
 541        vmovd   %r12d, %xmm15                        # len(A) in xmm15
 542
 543        mov InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits (*8)
 545        vmovq   %r12, %xmm1
 546        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
 547        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
 548
 549        vpxor   %xmm15, %xmm14, %xmm14
 550        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
 551        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 552
 553        vmovdqu OrigIV(arg2), %xmm9
 554
 555        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
 556
 557        vpxor   %xmm14, %xmm9, %xmm9
 558
 559
 560
 561_return_T\@:
 562        mov     \AUTH_TAG, %r10              # r10 = authTag
 563        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
 564
 565        cmp     $16, %r11
 566        je      _T_16\@
 567
 568        cmp     $8, %r11
 569        jl      _T_4\@
 570
 571_T_8\@:
 572        vmovq   %xmm9, %rax
 573        mov     %rax, (%r10)
 574        add     $8, %r10
 575        sub     $8, %r11
 576        vpsrldq $8, %xmm9, %xmm9
 577        cmp     $0, %r11
 578        je     _return_T_done\@
 579_T_4\@:
 580        vmovd   %xmm9, %eax
 581        mov     %eax, (%r10)
 582        add     $4, %r10
 583        sub     $4, %r11
 584        vpsrldq     $4, %xmm9, %xmm9
 585        cmp     $0, %r11
 586        je     _return_T_done\@
 587_T_123\@:
 588        vmovd     %xmm9, %eax
 589        cmp     $2, %r11
 590        jl     _T_1\@
 591        mov     %ax, (%r10)
 592        cmp     $2, %r11
 593        je     _return_T_done\@
 594        add     $2, %r10
 595        sar     $16, %eax
 596_T_1\@:
 597        mov     %al, (%r10)
 598        jmp     _return_T_done\@
 599
 600_T_16\@:
 601        vmovdqu %xmm9, (%r10)
 602
 603_return_T_done\@:
 604.endm
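#
# For reference, a small C sketch of the length block folded into GHASH by
# GCM_COMPLETE, as the GCM spec defines it: 64-bit big-endian bit-lengths of
# the AAD and of the ciphertext (byte order inside the xmm registers is
# handled by the shuffles above; the helper name is hypothetical):
#
#       #include <stdint.h>
#
#       static void build_len_block(uint8_t blk[16], uint64_t aad_bytes,
#                                   uint64_t ct_bytes)
#       {
#               uint64_t a = aad_bytes * 8, c = ct_bytes * 8;
#               int i;
#
#               for (i = 0; i < 8; i++) {
#                       blk[i]     = (uint8_t)(a >> (56 - 8 * i));
#                       blk[8 + i] = (uint8_t)(c >> (56 - 8 * i));
#               }
#       }
#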
 605
 606.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
 607
 608        mov     \AAD, %r10                      # r10 = AAD
 609        mov     \AADLEN, %r12                      # r12 = aadLen
 610
 611
 612        mov     %r12, %r11
 613
 614        vpxor   \T8, \T8, \T8
 615        vpxor   \T7, \T7, \T7
 616        cmp     $16, %r11
 617        jl      _get_AAD_rest8\@
 618_get_AAD_blocks\@:
 619        vmovdqu (%r10), \T7
 620        vpshufb SHUF_MASK(%rip), \T7, \T7
 621        vpxor   \T7, \T8, \T8
 622        \GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
 623        add     $16, %r10
 624        sub     $16, %r12
 625        sub     $16, %r11
 626        cmp     $16, %r11
 627        jge     _get_AAD_blocks\@
 628        vmovdqu \T8, \T7
 629        cmp     $0, %r11
 630        je      _get_AAD_done\@
 631
 632        vpxor   \T7, \T7, \T7
 633
 634        /* read the last <16B of AAD. since we have at least 4B of
 635        data right after the AAD (the ICV, and maybe some CT), we can
 636        read 4B/8B blocks safely, and then get rid of the extra stuff */
 637_get_AAD_rest8\@:
 638        cmp     $4, %r11
 639        jle     _get_AAD_rest4\@
 640        movq    (%r10), \T1
 641        add     $8, %r10
 642        sub     $8, %r11
 643        vpslldq $8, \T1, \T1
 644        vpsrldq $8, \T7, \T7
 645        vpxor   \T1, \T7, \T7
 646        jmp     _get_AAD_rest8\@
 647_get_AAD_rest4\@:
 648        cmp     $0, %r11
 649        jle      _get_AAD_rest0\@
 650        mov     (%r10), %eax
 651        movq    %rax, \T1
 652        add     $4, %r10
 653        sub     $4, %r11
 654        vpslldq $12, \T1, \T1
 655        vpsrldq $4, \T7, \T7
 656        vpxor   \T1, \T7, \T7
 657_get_AAD_rest0\@:
 658        /* finalize: shift out the extra bytes we read, and align
 659        left. since pslldq can only shift by an immediate, we use
 660        vpshufb and an array of shuffle masks */
 661        movq    %r12, %r11
 662        salq    $4, %r11
 663        vmovdqu  aad_shift_arr(%r11), \T1
 664        vpshufb \T1, \T7, \T7
 665_get_AAD_rest_final\@:
 666        vpshufb SHUF_MASK(%rip), \T7, \T7
 667        vpxor   \T8, \T7, \T7
 668        \GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
 669
 670_get_AAD_done\@:
 671        vmovdqu \T7, AadHash(arg2)
 672.endm
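#
# The flow above is, in outline, the textbook AAD absorption: XOR in each
# full 16-byte block, multiply by H, then absorb a zero-padded final block.
# A hedged C sketch (ghash_mul() stands in for the GF(2^128) multiply and
# is hypothetical here):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       void ghash_mul(uint8_t x[16], const uint8_t h[16]);  /* x = x * H */
#
#       static void calc_aad_hash(uint8_t hash[16], const uint8_t *aad,
#                                 size_t len, const uint8_t h[16])
#       {
#               uint8_t blk[16];
#               size_t i;
#
#               memset(hash, 0, 16);
#               while (len >= 16) {
#                       for (i = 0; i < 16; i++)
#                               hash[i] ^= aad[i];
#                       ghash_mul(hash, h);
#                       aad += 16;
#                       len -= 16;
#               }
#               if (len) {
#                       memset(blk, 0, 16);
#                       memcpy(blk, aad, len);  /* zero-pad the last block */
#                       for (i = 0; i < 16; i++)
#                               hash[i] ^= blk[i];
#                       ghash_mul(hash, h);
#               }
#       }
#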
 673
 674.macro INIT GHASH_MUL PRECOMPUTE
 675        mov arg6, %r11
 676        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
 677        xor %r11d, %r11d
 678        mov %r11, InLen(arg2) # ctx_data.in_length = 0
 679
 680        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
 681        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
 682        mov arg3, %rax
 683        movdqu (%rax), %xmm0
 684        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
 685
 686        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
 687        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
 688
 689        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
 690
 691        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
 692        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
 693        vmovdqa  %xmm6, %xmm2
 694        vpsllq   $1, %xmm6, %xmm6
 695        vpsrlq   $63, %xmm2, %xmm2
 696        vmovdqa  %xmm2, %xmm1
 697        vpslldq  $8, %xmm2, %xmm2
 698        vpsrldq  $8, %xmm1, %xmm1
 699        vpor     %xmm2, %xmm6, %xmm6
 700        #reduction
 701        vpshufd  $0b00100100, %xmm1, %xmm2
 702        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
 703        vpand    POLY(%rip), %xmm2, %xmm2
 704        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
 705        #######################################################################
 706        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
 707
 708        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
 709
 710        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 711.endm
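#
# The HashKey<<1 mod poly step in INIT has, in net effect, this C shape
# (sketch only; h[] is the 128-bit xmm value in little-endian byte order,
# so h[15] holds bits 127:120; the function name is hypothetical):
#
#       #include <stdint.h>
#
#       static void hashkey_shl1_mod_poly(uint8_t h[16])
#       {
#               int carry = h[15] >> 7;
#               int i;
#
#               for (i = 15; i > 0; i--)        /* 128-bit shift left by 1 */
#                       h[i] = (uint8_t)((h[i] << 1) | (h[i - 1] >> 7));
#               h[0] <<= 1;
#               if (carry) {                    /* fold the carry back in  */
#                       h[15] ^= 0xC2;          /* POLY, high byte         */
#                       h[0]  ^= 0x01;          /* POLY, low byte          */
#               }
#       }
#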
 712
 713
 714# Reads DLEN bytes starting at DPTR and stores in XMMDst
 715# where 0 < DLEN < 16
 716# Clobbers %rax, DLEN
 717.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
 718        vpxor \XMMDst, \XMMDst, \XMMDst
 719
 720        cmp $8, \DLEN
 721        jl _read_lt8_\@
 722        mov (\DPTR), %rax
 723        vpinsrq $0, %rax, \XMMDst, \XMMDst
 724        sub $8, \DLEN
 725        jz _done_read_partial_block_\@
 726        xor %eax, %eax
 727_read_next_byte_\@:
 728        shl $8, %rax
 729        mov 7(\DPTR, \DLEN, 1), %al
 730        dec \DLEN
 731        jnz _read_next_byte_\@
 732        vpinsrq $1, %rax, \XMMDst, \XMMDst
 733        jmp _done_read_partial_block_\@
 734_read_lt8_\@:
 735        xor %eax, %eax
 736_read_next_byte_lt8_\@:
 737        shl $8, %rax
 738        mov -1(\DPTR, \DLEN, 1), %al
 739        dec \DLEN
 740        jnz _read_next_byte_lt8_\@
 741        vpinsrq $0, %rax, \XMMDst, \XMMDst
 742_done_read_partial_block_\@:
 743.endm
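#
# READ_PARTIAL_BLOCK exists so the tail of the input can be loaded without
# reading past the end of the buffer, which a plain 16-byte load could do.
# Its net effect is simply the following (sketch; hypothetical helper name):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void read_partial_block(uint8_t dst[16], const uint8_t *src,
#                                      size_t len)      /* 0 < len < 16 */
#       {
#               memset(dst, 0, 16);
#               memcpy(dst, src, len);
#       }
#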
 744
 745# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 746# between update calls.
# Requires the input data to be at least 1 byte long, due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes and updates the hash and partial-block info in gcm_context_data
 749# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 750.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 751        AAD_HASH ENC_DEC
 752        mov     PBlockLen(arg2), %r13
 753        cmp     $0, %r13
 754        je      _partial_block_done_\@  # Leave Macro if no partial blocks
 755        # Read in input data without over reading
 756        cmp     $16, \PLAIN_CYPH_LEN
 757        jl      _fewer_than_16_bytes_\@
 758        vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 759        jmp     _data_read_\@
 760
 761_fewer_than_16_bytes_\@:
 762        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 763        mov     \PLAIN_CYPH_LEN, %r12
 764        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
 765
 766        mov PBlockLen(arg2), %r13
 767
 768_data_read_\@:                          # Finished reading in data
 769
 770        vmovdqu PBlockEncKey(arg2), %xmm9
 771        vmovdqu HashKey(arg2), %xmm13
 772
 773        lea     SHIFT_MASK(%rip), %r12
 774
        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
 777        add     %r13, %r12
 778        vmovdqu (%r12), %xmm2           # get the appropriate shuffle mask
 779        vpshufb %xmm2, %xmm9, %xmm9             # shift right r13 bytes
 780
 781.if  \ENC_DEC ==  DEC
 782        vmovdqa %xmm1, %xmm3
 783        pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 784
 785        mov     \PLAIN_CYPH_LEN, %r10
 786        add     %r13, %r10
 787        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 788        sub     $16, %r10
        # Determine if the partial block is not being filled and
        # shift the mask accordingly
 791        jge     _no_extra_mask_1_\@
 792        sub     %r10, %r12
 793_no_extra_mask_1_\@:
 794
 795        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
 796        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 797        vpand   %xmm1, %xmm9, %xmm9             # mask out bottom r13 bytes of xmm9
 798
 799        vpand   %xmm1, %xmm3, %xmm3
 800        vmovdqa SHUF_MASK(%rip), %xmm10
 801        vpshufb %xmm10, %xmm3, %xmm3
 802        vpshufb %xmm2, %xmm3, %xmm3
 803        vpxor   %xmm3, \AAD_HASH, \AAD_HASH
 804
 805        cmp     $0, %r10
 806        jl      _partial_incomplete_1_\@
 807
 808        # GHASH computation for the last <16 Byte block
 809        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 810        xor     %eax,%eax
 811
 812        mov     %rax, PBlockLen(arg2)
 813        jmp     _dec_done_\@
 814_partial_incomplete_1_\@:
 815        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
 816_dec_done_\@:
 817        vmovdqu \AAD_HASH, AadHash(arg2)
 818.else
 819        vpxor   %xmm1, %xmm9, %xmm9                     # Plaintext XOR E(K, Yn)
 820
 821        mov     \PLAIN_CYPH_LEN, %r10
 822        add     %r13, %r10
 823        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 824        sub     $16, %r10
        # Determine if the partial block is not being filled and
        # shift the mask accordingly
 827        jge     _no_extra_mask_2_\@
 828        sub     %r10, %r12
 829_no_extra_mask_2_\@:
 830
 831        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
 832        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 833        vpand   %xmm1, %xmm9, %xmm9
 834
 835        vmovdqa SHUF_MASK(%rip), %xmm1
 836        vpshufb %xmm1, %xmm9, %xmm9
 837        vpshufb %xmm2, %xmm9, %xmm9
 838        vpxor   %xmm9, \AAD_HASH, \AAD_HASH
 839
 840        cmp     $0, %r10
 841        jl      _partial_incomplete_2_\@
 842
 843        # GHASH computation for the last <16 Byte block
 844        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 845        xor     %eax,%eax
 846
 847        mov     %rax, PBlockLen(arg2)
 848        jmp     _encode_done_\@
 849_partial_incomplete_2_\@:
 850        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
 851_encode_done_\@:
 852        vmovdqu \AAD_HASH, AadHash(arg2)
 853
 854        vmovdqa SHUF_MASK(%rip), %xmm10
 855        # shuffle xmm9 back to output as ciphertext
 856        vpshufb %xmm10, %xmm9, %xmm9
 857        vpshufb %xmm2, %xmm9, %xmm9
 858.endif
 859        # output encrypted Bytes
 860        cmp     $0, %r10
 861        jl      _partial_fill_\@
 862        mov     %r13, %r12
 863        mov     $16, %r13
 864        # Set r13 to be the number of bytes to write out
 865        sub     %r12, %r13
 866        jmp     _count_set_\@
 867_partial_fill_\@:
 868        mov     \PLAIN_CYPH_LEN, %r13
 869_count_set_\@:
 870        vmovdqa %xmm9, %xmm0
 871        vmovq   %xmm0, %rax
 872        cmp     $8, %r13
 873        jle     _less_than_8_bytes_left_\@
 874
 875        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 876        add     $8, \DATA_OFFSET
 877        psrldq  $8, %xmm0
 878        vmovq   %xmm0, %rax
 879        sub     $8, %r13
 880_less_than_8_bytes_left_\@:
 881        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 882        add     $1, \DATA_OFFSET
 883        shr     $8, %rax
 884        sub     $1, %r13
 885        jne     _less_than_8_bytes_left_\@
 886_partial_block_done_\@:
 887.endm # PARTIAL_BLOCK
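#
# Conceptually, PARTIAL_BLOCK resumes the block left over from the previous
# update call: the saved keystream block E(K, Yn) is XORed against the new
# bytes starting at offset PBlockLen.  A hedged C sketch of just that part
# (the GHASH bookkeeping done above is omitted; names are hypothetical):
#
#       #include <stdint.h>
#
#       static size_t finish_partial_block(uint8_t *out, const uint8_t *in,
#                                          size_t in_len,
#                                          const uint8_t keystream[16],
#                                          size_t *partial_len)
#       {
#               size_t room = 16 - *partial_len;
#               size_t n = in_len < room ? in_len : room;
#               size_t i;
#
#               for (i = 0; i < n; i++)
#                       out[i] = in[i] ^ keystream[*partial_len + i];
#               *partial_len = (*partial_len + n) % 16; /* 0 once block is full */
#               return n;                               /* bytes consumed       */
#       }
#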
 888
 889#ifdef CONFIG_AS_AVX
 890###############################################################################
 891# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 892# Input: A and B (128-bits each, bit-reflected)
 893# Output: C = A*B*x mod poly, (i.e. >>1 )
 894# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 895# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 896###############################################################################
 897.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 898
 899        vpshufd         $0b01001110, \GH, \T2
 900        vpshufd         $0b01001110, \HK, \T3
 901        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
 902        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 903
 904        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
 905        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
 906        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
 907        vpxor           \GH, \T2,\T2
 908        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 909
 910        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
 911        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
 912        vpxor           \T3, \GH, \GH
 913        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 914
 915        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25
 919
 920        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 921        vpxor   \T4, \T2, \T2
 922
 923        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 924
 925        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
 926        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 927
 928        #second phase of the reduction
 929
        vpsrld  $1,\GH, \T2                     # packed right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed right shift >> 7
 933        vpxor   \T3, \T2, \T2                   # xor the shifted versions
 934        vpxor   \T4, \T2, \T2
 935
 936        vpxor   \T5, \T2, \T2
 937        vpxor   \T2, \GH, \GH
 938        vpxor   \T1, \GH, \GH                   # the result is in GH
 939
 940
 941.endm
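###############################################################################
# For comparison, the textbook bit-at-a-time GHASH multiplication from the
# GCM spec (NIST SP 800-38D), operating on big-endian blocks rather than the
# bit-reflected, carry-less-multiply + Karatsuba form used by GHASH_MUL_AVX
# above (sketch only; the function name is hypothetical):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void gf128_mul(uint8_t z[16], const uint8_t x[16],
#                             const uint8_t y[16])
#       {
#               uint8_t v[16], acc[16] = { 0 };
#               int i, j, lsb;
#
#               memcpy(v, y, 16);
#               for (i = 0; i < 128; i++) {
#                       if (x[i / 8] & (0x80 >> (i % 8)))   /* bit i of x */
#                               for (j = 0; j < 16; j++)
#                                       acc[j] ^= v[j];
#                       lsb = v[15] & 1;
#                       for (j = 15; j > 0; j--)            /* v = v >> 1 */
#                               v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
#                       v[0] >>= 1;
#                       if (lsb)
#                               v[0] ^= 0xE1;   /* R = 11100001 || 0^120 */
#               }
#               memcpy(z, acc, 16);
#       }
###############################################################################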
 942
 943.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 944
        # HashKey_i_k holds the XOR of the low and high halves of HashKey_i
 946        vmovdqa  \HK, \T5
 947
 948        vpshufd  $0b01001110, \T5, \T1
 949        vpxor    \T5, \T1, \T1
 950        vmovdqu  \T1, HashKey_k(arg2)
 951
 952        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
 953        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
 954        vpshufd  $0b01001110, \T5, \T1
 955        vpxor    \T5, \T1, \T1
 956        vmovdqu  \T1, HashKey_2_k(arg2)
 957
 958        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
 959        vmovdqu  \T5, HashKey_3(arg2)
 960        vpshufd  $0b01001110, \T5, \T1
 961        vpxor    \T5, \T1, \T1
 962        vmovdqu  \T1, HashKey_3_k(arg2)
 963
 964        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
 965        vmovdqu  \T5, HashKey_4(arg2)
 966        vpshufd  $0b01001110, \T5, \T1
 967        vpxor    \T5, \T1, \T1
 968        vmovdqu  \T1, HashKey_4_k(arg2)
 969
 970        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
 971        vmovdqu  \T5, HashKey_5(arg2)
 972        vpshufd  $0b01001110, \T5, \T1
 973        vpxor    \T5, \T1, \T1
 974        vmovdqu  \T1, HashKey_5_k(arg2)
 975
 976        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
 977        vmovdqu  \T5, HashKey_6(arg2)
 978        vpshufd  $0b01001110, \T5, \T1
 979        vpxor    \T5, \T1, \T1
 980        vmovdqu  \T1, HashKey_6_k(arg2)
 981
 982        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
 983        vmovdqu  \T5, HashKey_7(arg2)
 984        vpshufd  $0b01001110, \T5, \T1
 985        vpxor    \T5, \T1, \T1
 986        vmovdqu  \T1, HashKey_7_k(arg2)
 987
 988        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
 989        vmovdqu  \T5, HashKey_8(arg2)
 990        vpshufd  $0b01001110, \T5, \T1
 991        vpxor    \T5, \T1, \T1
 992        vmovdqu  \T1, HashKey_8_k(arg2)
 993
 994.endm
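#
# In outline, the precomputation above does the following (C sketch; it
# reuses the hypothetical gf128_mul() shown earlier, and the Karatsuba
# halves are simply the XOR of each key's two 8-byte halves):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       void gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16]);
#
#       static void precompute_hash_keys(uint8_t hk[8][16], uint8_t hk_k[8][8],
#                                        const uint8_t h[16])
#       {
#               int i, j;
#
#               memcpy(hk[0], h, 16);                   /* H^1             */
#               for (i = 1; i < 8; i++)
#                       gf128_mul(hk[i], hk[i - 1], h); /* H^(i+1) = H^i*H */
#               for (i = 0; i < 8; i++)
#                       for (j = 0; j < 8; j++)         /* low ^ high half */
#                               hk_k[i][j] = hk[i][j] ^ hk[i][8 + j];
#       }
#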
 995
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
 999## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1000## r10, r11, r12, rax are clobbered
1001## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1002
1003.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1004        i = (8-\num_initial_blocks)
1005        setreg
1006        vmovdqu AadHash(arg2), reg_i
1007
1008        # start AES for num_initial_blocks blocks
1009        vmovdqu CurCount(arg2), \CTR
1010
1011        i = (9-\num_initial_blocks)
1012        setreg
1013.rep \num_initial_blocks
1014                vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
1015                vmovdqa \CTR, reg_i
1016                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
1017        i = (i+1)
1018        setreg
1019.endr
1020
1021        vmovdqa  (arg1), \T_key
1022        i = (9-\num_initial_blocks)
1023        setreg
1024.rep \num_initial_blocks
1025                vpxor   \T_key, reg_i, reg_i
1026        i = (i+1)
1027        setreg
1028.endr
1029
1030       j = 1
1031       setreg
1032.rep \REP
1033       vmovdqa  16*j(arg1), \T_key
1034        i = (9-\num_initial_blocks)
1035        setreg
1036.rep \num_initial_blocks
1037        vaesenc \T_key, reg_i, reg_i
1038        i = (i+1)
1039        setreg
1040.endr
1041
1042       j = (j+1)
1043       setreg
1044.endr
1045
1046        vmovdqa  16*j(arg1), \T_key
1047        i = (9-\num_initial_blocks)
1048        setreg
1049.rep \num_initial_blocks
1050        vaesenclast      \T_key, reg_i, reg_i
1051        i = (i+1)
1052        setreg
1053.endr
1054
1055        i = (9-\num_initial_blocks)
1056        setreg
1057.rep \num_initial_blocks
1058                vmovdqu (arg4, %r11), \T1
1059                vpxor   \T1, reg_i, reg_i
1060                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1061                add     $16, %r11
1062.if  \ENC_DEC == DEC
1063                vmovdqa \T1, reg_i
1064.endif
1065                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1066        i = (i+1)
1067        setreg
1068.endr
1069
1070
1071        i = (8-\num_initial_blocks)
1072        j = (9-\num_initial_blocks)
1073        setreg
1074
1075.rep \num_initial_blocks
1076        vpxor    reg_i, reg_j, reg_j
1077        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1078        i = (i+1)
1079        j = (j+1)
1080        setreg
1081.endr
1082        # XMM8 has the combined result here
1083
1084        vmovdqa  \XMM8, TMP1(%rsp)
1085        vmovdqa  \XMM8, \T3
1086
1087        cmp     $128, %r13
1088        jl      _initial_blocks_done\@                  # no need for precomputed constants
1089
1090###############################################################################
# prepare 8 counter blocks and start encrypting them in parallel
1092                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1093                vmovdqa  \CTR, \XMM1
1094                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1095
1096                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1097                vmovdqa  \CTR, \XMM2
1098                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1099
1100                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1101                vmovdqa  \CTR, \XMM3
1102                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1103
1104                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1105                vmovdqa  \CTR, \XMM4
1106                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1107
1108                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1109                vmovdqa  \CTR, \XMM5
1110                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1111
1112                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1113                vmovdqa  \CTR, \XMM6
1114                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1115
1116                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1117                vmovdqa  \CTR, \XMM7
1118                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1119
1120                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1121                vmovdqa  \CTR, \XMM8
1122                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1123
1124                vmovdqa  (arg1), \T_key
1125                vpxor    \T_key, \XMM1, \XMM1
1126                vpxor    \T_key, \XMM2, \XMM2
1127                vpxor    \T_key, \XMM3, \XMM3
1128                vpxor    \T_key, \XMM4, \XMM4
1129                vpxor    \T_key, \XMM5, \XMM5
1130                vpxor    \T_key, \XMM6, \XMM6
1131                vpxor    \T_key, \XMM7, \XMM7
1132                vpxor    \T_key, \XMM8, \XMM8
1133
1134               i = 1
1135               setreg
1136.rep    \REP       # do REP rounds
1137                vmovdqa  16*i(arg1), \T_key
1138                vaesenc  \T_key, \XMM1, \XMM1
1139                vaesenc  \T_key, \XMM2, \XMM2
1140                vaesenc  \T_key, \XMM3, \XMM3
1141                vaesenc  \T_key, \XMM4, \XMM4
1142                vaesenc  \T_key, \XMM5, \XMM5
1143                vaesenc  \T_key, \XMM6, \XMM6
1144                vaesenc  \T_key, \XMM7, \XMM7
1145                vaesenc  \T_key, \XMM8, \XMM8
1146               i = (i+1)
1147               setreg
1148.endr
1149
1150                vmovdqa  16*i(arg1), \T_key
1151                vaesenclast  \T_key, \XMM1, \XMM1
1152                vaesenclast  \T_key, \XMM2, \XMM2
1153                vaesenclast  \T_key, \XMM3, \XMM3
1154                vaesenclast  \T_key, \XMM4, \XMM4
1155                vaesenclast  \T_key, \XMM5, \XMM5
1156                vaesenclast  \T_key, \XMM6, \XMM6
1157                vaesenclast  \T_key, \XMM7, \XMM7
1158                vaesenclast  \T_key, \XMM8, \XMM8
1159
1160                vmovdqu  (arg4, %r11), \T1
1161                vpxor    \T1, \XMM1, \XMM1
1162                vmovdqu  \XMM1, (arg3 , %r11)
1163                .if   \ENC_DEC == DEC
1164                vmovdqa  \T1, \XMM1
1165                .endif
1166
1167                vmovdqu  16*1(arg4, %r11), \T1
1168                vpxor    \T1, \XMM2, \XMM2
1169                vmovdqu  \XMM2, 16*1(arg3 , %r11)
1170                .if   \ENC_DEC == DEC
1171                vmovdqa  \T1, \XMM2
1172                .endif
1173
1174                vmovdqu  16*2(arg4, %r11), \T1
1175                vpxor    \T1, \XMM3, \XMM3
1176                vmovdqu  \XMM3, 16*2(arg3 , %r11)
1177                .if   \ENC_DEC == DEC
1178                vmovdqa  \T1, \XMM3
1179                .endif
1180
1181                vmovdqu  16*3(arg4, %r11), \T1
1182                vpxor    \T1, \XMM4, \XMM4
1183                vmovdqu  \XMM4, 16*3(arg3 , %r11)
1184                .if   \ENC_DEC == DEC
1185                vmovdqa  \T1, \XMM4
1186                .endif
1187
1188                vmovdqu  16*4(arg4, %r11), \T1
1189                vpxor    \T1, \XMM5, \XMM5
1190                vmovdqu  \XMM5, 16*4(arg3 , %r11)
1191                .if   \ENC_DEC == DEC
1192                vmovdqa  \T1, \XMM5
1193                .endif
1194
1195                vmovdqu  16*5(arg4, %r11), \T1
1196                vpxor    \T1, \XMM6, \XMM6
1197                vmovdqu  \XMM6, 16*5(arg3 , %r11)
1198                .if   \ENC_DEC == DEC
1199                vmovdqa  \T1, \XMM6
1200                .endif
1201
1202                vmovdqu  16*6(arg4, %r11), \T1
1203                vpxor    \T1, \XMM7, \XMM7
1204                vmovdqu  \XMM7, 16*6(arg3 , %r11)
1205                .if   \ENC_DEC == DEC
1206                vmovdqa  \T1, \XMM7
1207                .endif
1208
1209                vmovdqu  16*7(arg4, %r11), \T1
1210                vpxor    \T1, \XMM8, \XMM8
1211                vmovdqu  \XMM8, 16*7(arg3 , %r11)
1212                .if   \ENC_DEC == DEC
1213                vmovdqa  \T1, \XMM8
1214                .endif
1215
1216                add     $128, %r11
1217
1218                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1219                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1220                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1221                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1222                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1223                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1224                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1225                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1226                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1227
1228###############################################################################
1229
1230_initial_blocks_done\@:
1231
1232.endm
1233
1234# encrypt 8 blocks at a time
1235# ghash the 8 previously encrypted ciphertext blocks
1236# arg1, arg3, arg4 are used as pointers only, not modified
1237# r11 is the data offset value
1238.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1239
1240        vmovdqa \XMM1, \T2
1241        vmovdqa \XMM2, TMP2(%rsp)
1242        vmovdqa \XMM3, TMP3(%rsp)
1243        vmovdqa \XMM4, TMP4(%rsp)
1244        vmovdqa \XMM5, TMP5(%rsp)
1245        vmovdqa \XMM6, TMP6(%rsp)
1246        vmovdqa \XMM7, TMP7(%rsp)
1247        vmovdqa \XMM8, TMP8(%rsp)
1248
1249.if \loop_idx == in_order
1250                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1251                vpaddd  ONE(%rip), \XMM1, \XMM2
1252                vpaddd  ONE(%rip), \XMM2, \XMM3
1253                vpaddd  ONE(%rip), \XMM3, \XMM4
1254                vpaddd  ONE(%rip), \XMM4, \XMM5
1255                vpaddd  ONE(%rip), \XMM5, \XMM6
1256                vpaddd  ONE(%rip), \XMM6, \XMM7
1257                vpaddd  ONE(%rip), \XMM7, \XMM8
1258                vmovdqa \XMM8, \CTR
1259
1260                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1261                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1262                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1263                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1264                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1265                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1266                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1267                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1268.else
1269                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1270                vpaddd  ONEf(%rip), \XMM1, \XMM2
1271                vpaddd  ONEf(%rip), \XMM2, \XMM3
1272                vpaddd  ONEf(%rip), \XMM3, \XMM4
1273                vpaddd  ONEf(%rip), \XMM4, \XMM5
1274                vpaddd  ONEf(%rip), \XMM5, \XMM6
1275                vpaddd  ONEf(%rip), \XMM6, \XMM7
1276                vpaddd  ONEf(%rip), \XMM7, \XMM8
1277                vmovdqa \XMM8, \CTR
1278.endif
1279
1280
1281        #######################################################################
1282
1283                vmovdqu (arg1), \T1
1284                vpxor   \T1, \XMM1, \XMM1
1285                vpxor   \T1, \XMM2, \XMM2
1286                vpxor   \T1, \XMM3, \XMM3
1287                vpxor   \T1, \XMM4, \XMM4
1288                vpxor   \T1, \XMM5, \XMM5
1289                vpxor   \T1, \XMM6, \XMM6
1290                vpxor   \T1, \XMM7, \XMM7
1291                vpxor   \T1, \XMM8, \XMM8
1292
1293        #######################################################################
1294
1295
1296
1297
1298
1299                vmovdqu 16*1(arg1), \T1
1300                vaesenc \T1, \XMM1, \XMM1
1301                vaesenc \T1, \XMM2, \XMM2
1302                vaesenc \T1, \XMM3, \XMM3
1303                vaesenc \T1, \XMM4, \XMM4
1304                vaesenc \T1, \XMM5, \XMM5
1305                vaesenc \T1, \XMM6, \XMM6
1306                vaesenc \T1, \XMM7, \XMM7
1307                vaesenc \T1, \XMM8, \XMM8
1308
1309                vmovdqu 16*2(arg1), \T1
1310                vaesenc \T1, \XMM1, \XMM1
1311                vaesenc \T1, \XMM2, \XMM2
1312                vaesenc \T1, \XMM3, \XMM3
1313                vaesenc \T1, \XMM4, \XMM4
1314                vaesenc \T1, \XMM5, \XMM5
1315                vaesenc \T1, \XMM6, \XMM6
1316                vaesenc \T1, \XMM7, \XMM7
1317                vaesenc \T1, \XMM8, \XMM8
1318
1319
1320        #######################################################################
1321
1322        vmovdqu         HashKey_8(arg2), \T5
1323        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1324        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1325
1326        vpshufd         $0b01001110, \T2, \T6
1327        vpxor           \T2, \T6, \T6
1328
1329        vmovdqu         HashKey_8_k(arg2), \T5
1330        vpclmulqdq      $0x00, \T5, \T6, \T6
1331
1332                vmovdqu 16*3(arg1), \T1
1333                vaesenc \T1, \XMM1, \XMM1
1334                vaesenc \T1, \XMM2, \XMM2
1335                vaesenc \T1, \XMM3, \XMM3
1336                vaesenc \T1, \XMM4, \XMM4
1337                vaesenc \T1, \XMM5, \XMM5
1338                vaesenc \T1, \XMM6, \XMM6
1339                vaesenc \T1, \XMM7, \XMM7
1340                vaesenc \T1, \XMM8, \XMM8
1341
1342        vmovdqa         TMP2(%rsp), \T1
1343        vmovdqu         HashKey_7(arg2), \T5
1344        vpclmulqdq      $0x11, \T5, \T1, \T3
1345        vpxor           \T3, \T4, \T4
1346        vpclmulqdq      $0x00, \T5, \T1, \T3
1347        vpxor           \T3, \T7, \T7
1348
1349        vpshufd         $0b01001110, \T1, \T3
1350        vpxor           \T1, \T3, \T3
1351        vmovdqu         HashKey_7_k(arg2), \T5
1352        vpclmulqdq      $0x10, \T5, \T3, \T3
1353        vpxor           \T3, \T6, \T6
1354
1355                vmovdqu 16*4(arg1), \T1
1356                vaesenc \T1, \XMM1, \XMM1
1357                vaesenc \T1, \XMM2, \XMM2
1358                vaesenc \T1, \XMM3, \XMM3
1359                vaesenc \T1, \XMM4, \XMM4
1360                vaesenc \T1, \XMM5, \XMM5
1361                vaesenc \T1, \XMM6, \XMM6
1362                vaesenc \T1, \XMM7, \XMM7
1363                vaesenc \T1, \XMM8, \XMM8
1364
1365        #######################################################################
1366
1367        vmovdqa         TMP3(%rsp), \T1
1368        vmovdqu         HashKey_6(arg2), \T5
1369        vpclmulqdq      $0x11, \T5, \T1, \T3
1370        vpxor           \T3, \T4, \T4
1371        vpclmulqdq      $0x00, \T5, \T1, \T3
1372        vpxor           \T3, \T7, \T7
1373
1374        vpshufd         $0b01001110, \T1, \T3
1375        vpxor           \T1, \T3, \T3
1376        vmovdqu         HashKey_6_k(arg2), \T5
1377        vpclmulqdq      $0x10, \T5, \T3, \T3
1378        vpxor           \T3, \T6, \T6
1379
1380                vmovdqu 16*5(arg1), \T1
1381                vaesenc \T1, \XMM1, \XMM1
1382                vaesenc \T1, \XMM2, \XMM2
1383                vaesenc \T1, \XMM3, \XMM3
1384                vaesenc \T1, \XMM4, \XMM4
1385                vaesenc \T1, \XMM5, \XMM5
1386                vaesenc \T1, \XMM6, \XMM6
1387                vaesenc \T1, \XMM7, \XMM7
1388                vaesenc \T1, \XMM8, \XMM8
1389
1390        vmovdqa         TMP4(%rsp), \T1
1391        vmovdqu         HashKey_5(arg2), \T5
1392        vpclmulqdq      $0x11, \T5, \T1, \T3
1393        vpxor           \T3, \T4, \T4
1394        vpclmulqdq      $0x00, \T5, \T1, \T3
1395        vpxor           \T3, \T7, \T7
1396
1397        vpshufd         $0b01001110, \T1, \T3
1398        vpxor           \T1, \T3, \T3
1399        vmovdqu         HashKey_5_k(arg2), \T5
1400        vpclmulqdq      $0x10, \T5, \T3, \T3
1401        vpxor           \T3, \T6, \T6
1402
1403                vmovdqu 16*6(arg1), \T1
1404                vaesenc \T1, \XMM1, \XMM1
1405                vaesenc \T1, \XMM2, \XMM2
1406                vaesenc \T1, \XMM3, \XMM3
1407                vaesenc \T1, \XMM4, \XMM4
1408                vaesenc \T1, \XMM5, \XMM5
1409                vaesenc \T1, \XMM6, \XMM6
1410                vaesenc \T1, \XMM7, \XMM7
1411                vaesenc \T1, \XMM8, \XMM8
1412
1413
1414        vmovdqa         TMP5(%rsp), \T1
1415        vmovdqu         HashKey_4(arg2), \T5
1416        vpclmulqdq      $0x11, \T5, \T1, \T3
1417        vpxor           \T3, \T4, \T4
1418        vpclmulqdq      $0x00, \T5, \T1, \T3
1419        vpxor           \T3, \T7, \T7
1420
1421        vpshufd         $0b01001110, \T1, \T3
1422        vpxor           \T1, \T3, \T3
1423        vmovdqu         HashKey_4_k(arg2), \T5
1424        vpclmulqdq      $0x10, \T5, \T3, \T3
1425        vpxor           \T3, \T6, \T6
1426
1427                vmovdqu 16*7(arg1), \T1
1428                vaesenc \T1, \XMM1, \XMM1
1429                vaesenc \T1, \XMM2, \XMM2
1430                vaesenc \T1, \XMM3, \XMM3
1431                vaesenc \T1, \XMM4, \XMM4
1432                vaesenc \T1, \XMM5, \XMM5
1433                vaesenc \T1, \XMM6, \XMM6
1434                vaesenc \T1, \XMM7, \XMM7
1435                vaesenc \T1, \XMM8, \XMM8
1436
1437        vmovdqa         TMP6(%rsp), \T1
1438        vmovdqu         HashKey_3(arg2), \T5
1439        vpclmulqdq      $0x11, \T5, \T1, \T3
1440        vpxor           \T3, \T4, \T4
1441        vpclmulqdq      $0x00, \T5, \T1, \T3
1442        vpxor           \T3, \T7, \T7
1443
1444        vpshufd         $0b01001110, \T1, \T3
1445        vpxor           \T1, \T3, \T3
1446        vmovdqu         HashKey_3_k(arg2), \T5
1447        vpclmulqdq      $0x10, \T5, \T3, \T3
1448        vpxor           \T3, \T6, \T6
1449
1450
1451                vmovdqu 16*8(arg1), \T1
1452                vaesenc \T1, \XMM1, \XMM1
1453                vaesenc \T1, \XMM2, \XMM2
1454                vaesenc \T1, \XMM3, \XMM3
1455                vaesenc \T1, \XMM4, \XMM4
1456                vaesenc \T1, \XMM5, \XMM5
1457                vaesenc \T1, \XMM6, \XMM6
1458                vaesenc \T1, \XMM7, \XMM7
1459                vaesenc \T1, \XMM8, \XMM8
1460
1461        vmovdqa         TMP7(%rsp), \T1
1462        vmovdqu         HashKey_2(arg2), \T5
1463        vpclmulqdq      $0x11, \T5, \T1, \T3
1464        vpxor           \T3, \T4, \T4
1465        vpclmulqdq      $0x00, \T5, \T1, \T3
1466        vpxor           \T3, \T7, \T7
1467
1468        vpshufd         $0b01001110, \T1, \T3
1469        vpxor           \T1, \T3, \T3
1470        vmovdqu         HashKey_2_k(arg2), \T5
1471        vpclmulqdq      $0x10, \T5, \T3, \T3
1472        vpxor           \T3, \T6, \T6
1473
1474        #######################################################################
1475
1476                vmovdqu 16*9(arg1), \T5
1477                vaesenc \T5, \XMM1, \XMM1
1478                vaesenc \T5, \XMM2, \XMM2
1479                vaesenc \T5, \XMM3, \XMM3
1480                vaesenc \T5, \XMM4, \XMM4
1481                vaesenc \T5, \XMM5, \XMM5
1482                vaesenc \T5, \XMM6, \XMM6
1483                vaesenc \T5, \XMM7, \XMM7
1484                vaesenc \T5, \XMM8, \XMM8
1485
1486        vmovdqa         TMP8(%rsp), \T1
1487        vmovdqu         HashKey(arg2), \T5
1488        vpclmulqdq      $0x11, \T5, \T1, \T3
1489        vpxor           \T3, \T4, \T4
1490        vpclmulqdq      $0x00, \T5, \T1, \T3
1491        vpxor           \T3, \T7, \T7
1492
1493        vpshufd         $0b01001110, \T1, \T3
1494        vpxor           \T1, \T3, \T3
1495        vmovdqu         HashKey_k(arg2), \T5
1496        vpclmulqdq      $0x10, \T5, \T3, \T3
1497        vpxor           \T3, \T6, \T6
1498
1499        vpxor           \T4, \T6, \T6
1500        vpxor           \T7, \T6, \T6
1501
1502                vmovdqu 16*10(arg1), \T5
1503
1504        i = 11
1505        setreg
1506.rep (\REP-9)
1507
1508        vaesenc \T5, \XMM1, \XMM1
1509        vaesenc \T5, \XMM2, \XMM2
1510        vaesenc \T5, \XMM3, \XMM3
1511        vaesenc \T5, \XMM4, \XMM4
1512        vaesenc \T5, \XMM5, \XMM5
1513        vaesenc \T5, \XMM6, \XMM6
1514        vaesenc \T5, \XMM7, \XMM7
1515        vaesenc \T5, \XMM8, \XMM8
1516
1517        vmovdqu 16*i(arg1), \T5
1518        i = i + 1
1519        setreg
1520.endr
1521
1522        i = 0
1523        j = 1
1524        setreg
1525.rep 8
1526                vpxor   16*i(arg4, %r11), \T5, \T2
1527                .if \ENC_DEC == ENC
1528                vaesenclast     \T2, reg_j, reg_j
1529                .else
1530                vaesenclast     \T2, reg_j, \T3
1531                vmovdqu 16*i(arg4, %r11), reg_j
1532                vmovdqu \T3, 16*i(arg3, %r11)
1533                .endif
1534        i = (i+1)
1535        j = (j+1)
1536        setreg
1537.endr
1538        #######################################################################
1539
1540
1541        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
1542        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
1543        vpxor   \T3, \T7, \T7
1544        vpxor   \T4, \T6, \T6                           # accumulate the results in T6:T7
1545
1546
1547
1548        #######################################################################
1549        #first phase of the reduction
1550        #######################################################################
1551        vpslld  $31, \T7, \T2                           # packed left shifting << 31
1552        vpslld  $30, \T7, \T3                           # packed left shifting << 30
1553        vpslld  $25, \T7, \T4                           # packed left shifting << 25
1554
1555        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1556        vpxor   \T4, \T2, \T2
1557
1558        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1559
1560        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1561        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1562        #######################################################################
1563                .if \ENC_DEC == ENC
1564                vmovdqu  \XMM1, 16*0(arg3,%r11)         # Write to the Ciphertext buffer
1565                vmovdqu  \XMM2, 16*1(arg3,%r11)         # Write to the Ciphertext buffer
1566                vmovdqu  \XMM3, 16*2(arg3,%r11)         # Write to the Ciphertext buffer
1567                vmovdqu  \XMM4, 16*3(arg3,%r11)         # Write to the Ciphertext buffer
1568                vmovdqu  \XMM5, 16*4(arg3,%r11)         # Write to the Ciphertext buffer
1569                vmovdqu  \XMM6, 16*5(arg3,%r11)         # Write to the Ciphertext buffer
1570                vmovdqu  \XMM7, 16*6(arg3,%r11)         # Write to the Ciphertext buffer
1571                vmovdqu  \XMM8, 16*7(arg3,%r11)         # Write to the Ciphertext buffer
1572                .endif
1573
1574        #######################################################################
1575        #second phase of the reduction
1576        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
1577        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
1578        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
1579        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1580        vpxor   \T4, \T2, \T2
1581
1582        vpxor   \T1, \T2, \T2
1583        vpxor   \T2, \T7, \T7
1584        vpxor   \T7, \T6, \T6                           # the result is in T6
1585        #######################################################################
1586
1587                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
1588                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
1589                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
1590                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
1591                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
1592                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
1593                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
1594                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
1595
1596
1597        vpxor   \T6, \XMM1, \XMM1
1598
1599
1600
1601.endm
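###############################################################################
## The two-phase reduction above folds the 256-bit carry-less product <T6:T7>
## back into 128 bits using per-dword shifts and XORs.  A minimal C/SSE2
## sketch of the same sequence is shown below (hedged: the helper name and the
## hi/lo argument convention are illustrative, not part of this file):
##
#        #include <emmintrin.h>
#
#        /* hi:lo = 256-bit carry-less product, in the bit-reflected GHASH
#         * representation used throughout this file */
#        static __m128i ghash_reduce_shift(__m128i hi, __m128i lo)
#        {
#                __m128i t1, t2, t3, t4;
#
#                /* first phase: lo << 31 ^ lo << 30 ^ lo << 25 (per dword) */
#                t2 = _mm_slli_epi32(lo, 31);
#                t3 = _mm_slli_epi32(lo, 30);
#                t4 = _mm_slli_epi32(lo, 25);
#                t2 = _mm_xor_si128(_mm_xor_si128(t2, t3), t4);
#
#                t1 = _mm_srli_si128(t2, 4);      /* carry dword for phase 2 */
#                t2 = _mm_slli_si128(t2, 12);
#                lo = _mm_xor_si128(lo, t2);
#
#                /* second phase: lo >> 1 ^ lo >> 2 ^ lo >> 7 (per dword) */
#                t2 = _mm_srli_epi32(lo, 1);
#                t3 = _mm_srli_epi32(lo, 2);
#                t4 = _mm_srli_epi32(lo, 7);
#                t2 = _mm_xor_si128(_mm_xor_si128(t2, t3), t4);
#                t2 = _mm_xor_si128(t2, t1);
#                lo = _mm_xor_si128(lo, t2);
#
#                return _mm_xor_si128(hi, lo);    /* reduced 128-bit result */
#        }
###############################################################################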
1602
1603
1604# GHASH the last 8 ciphertext blocks.
1605.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1606
1607        ## Karatsuba Method
1608
1609
1610        vpshufd         $0b01001110, \XMM1, \T2
1611        vpxor           \XMM1, \T2, \T2
1612        vmovdqu         HashKey_8(arg2), \T5
1613        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1614        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1615
1616        vmovdqu         HashKey_8_k(arg2), \T3
1617        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1618
1619        ######################
1620
1621        vpshufd         $0b01001110, \XMM2, \T2
1622        vpxor           \XMM2, \T2, \T2
1623        vmovdqu         HashKey_7(arg2), \T5
1624        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1625        vpxor           \T4, \T6, \T6
1626
1627        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1628        vpxor           \T4, \T7, \T7
1629
1630        vmovdqu         HashKey_7_k(arg2), \T3
1631        vpclmulqdq      $0x00, \T3, \T2, \T2
1632        vpxor           \T2, \XMM1, \XMM1
1633
1634        ######################
1635
1636        vpshufd         $0b01001110, \XMM3, \T2
1637        vpxor           \XMM3, \T2, \T2
1638        vmovdqu         HashKey_6(arg2), \T5
1639        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1640        vpxor           \T4, \T6, \T6
1641
1642        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1643        vpxor           \T4, \T7, \T7
1644
1645        vmovdqu         HashKey_6_k(arg2), \T3
1646        vpclmulqdq      $0x00, \T3, \T2, \T2
1647        vpxor           \T2, \XMM1, \XMM1
1648
1649        ######################
1650
1651        vpshufd         $0b01001110, \XMM4, \T2
1652        vpxor           \XMM4, \T2, \T2
1653        vmovdqu         HashKey_5(arg2), \T5
1654        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1655        vpxor           \T4, \T6, \T6
1656
1657        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1658        vpxor           \T4, \T7, \T7
1659
1660        vmovdqu         HashKey_5_k(arg2), \T3
1661        vpclmulqdq      $0x00, \T3, \T2, \T2
1662        vpxor           \T2, \XMM1, \XMM1
1663
1664        ######################
1665
1666        vpshufd         $0b01001110, \XMM5, \T2
1667        vpxor           \XMM5, \T2, \T2
1668        vmovdqu         HashKey_4(arg2), \T5
1669        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1670        vpxor           \T4, \T6, \T6
1671
1672        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1673        vpxor           \T4, \T7, \T7
1674
1675        vmovdqu         HashKey_4_k(arg2), \T3
1676        vpclmulqdq      $0x00, \T3, \T2, \T2
1677        vpxor           \T2, \XMM1, \XMM1
1678
1679        ######################
1680
1681        vpshufd         $0b01001110, \XMM6, \T2
1682        vpxor           \XMM6, \T2, \T2
1683        vmovdqu         HashKey_3(arg2), \T5
1684        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1685        vpxor           \T4, \T6, \T6
1686
1687        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1688        vpxor           \T4, \T7, \T7
1689
1690        vmovdqu         HashKey_3_k(arg2), \T3
1691        vpclmulqdq      $0x00, \T3, \T2, \T2
1692        vpxor           \T2, \XMM1, \XMM1
1693
1694        ######################
1695
1696        vpshufd         $0b01001110, \XMM7, \T2
1697        vpxor           \XMM7, \T2, \T2
1698        vmovdqu         HashKey_2(arg2), \T5
1699        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1700        vpxor           \T4, \T6, \T6
1701
1702        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1703        vpxor           \T4, \T7, \T7
1704
1705        vmovdqu         HashKey_2_k(arg2), \T3
1706        vpclmulqdq      $0x00, \T3, \T2, \T2
1707        vpxor           \T2, \XMM1, \XMM1
1708
1709        ######################
1710
1711        vpshufd         $0b01001110, \XMM8, \T2
1712        vpxor           \XMM8, \T2, \T2
1713        vmovdqu         HashKey(arg2), \T5
1714        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1715        vpxor           \T4, \T6, \T6
1716
1717        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1718        vpxor           \T4, \T7, \T7
1719
1720        vmovdqu         HashKey_k(arg2), \T3
1721        vpclmulqdq      $0x00, \T3, \T2, \T2
1722
1723        vpxor           \T2, \XMM1, \XMM1
1724        vpxor           \T6, \XMM1, \XMM1
1725        vpxor           \T7, \XMM1, \T2
1726
1727
1728
1729
1730        vpslldq $8, \T2, \T4
1731        vpsrldq $8, \T2, \T2
1732
1733        vpxor   \T4, \T7, \T7
1734        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1735                                # the accumulated carry-less multiplications
1736
1737        #######################################################################
1738        #first phase of the reduction
1739        vpslld  $31, \T7, \T2   # packed left shifting << 31
1740        vpslld  $30, \T7, \T3   # packed left shifting << 30
1741        vpslld  $25, \T7, \T4   # packed left shifting << 25
1742
1743        vpxor   \T3, \T2, \T2   # xor the shifted versions
1744        vpxor   \T4, \T2, \T2
1745
1746        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1747
1748        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1749        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1750        #######################################################################
1751
1752
1753        #second phase of the reduction
1754        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
1755        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
1756        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
1757        vpxor   \T3, \T2, \T2   # xor the shifted versions
1758        vpxor   \T4, \T2, \T2
1759
1760        vpxor   \T1, \T2, \T2
1761        vpxor   \T2, \T7, \T7
1762        vpxor   \T7, \T6, \T6   # the result is in T6
1763
1764.endm
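###############################################################################
## Each block above contributes three carry-less multiplications (Karatsuba):
## xi.hi*hk.hi, xi.lo*hk.lo, and (xi.hi^xi.lo)*(hk.hi^hk.lo), where the
## second factor of the middle term is the precomputed HashKey_i_k value.
## A hedged C/PCLMUL sketch of one such accumulation step (the helper name
## and pointer-based accumulators are illustrative):
##
#        #include <wmmintrin.h>          /* _mm_clmulepi64_si128 */
#        #include <emmintrin.h>
#
#        static void karatsuba_step(__m128i xi, __m128i hk, __m128i hk_k,
#                                   __m128i *hi, __m128i *lo, __m128i *mid)
#        {
#                __m128i t;
#
#                /* hi += xi.hi * hk.hi,  lo += xi.lo * hk.lo */
#                *hi = _mm_xor_si128(*hi, _mm_clmulepi64_si128(xi, hk, 0x11));
#                *lo = _mm_xor_si128(*lo, _mm_clmulepi64_si128(xi, hk, 0x00));
#
#                /* mid += (xi.hi ^ xi.lo) * (hk.hi ^ hk.lo); the second
#                 * factor is the stored HashKey_i_k value */
#                t    = _mm_xor_si128(xi, _mm_shuffle_epi32(xi, 0x4e));
#                *mid = _mm_xor_si128(*mid, _mm_clmulepi64_si128(t, hk_k, 0x00));
#        }
##
## After the eight steps, mid is XORed with hi and lo, split into its two
## 64-bit halves, and folded into <hi:lo> before the same two-phase reduction.
###############################################################################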
1765
1766#############################################################
1767#void   aesni_gcm_init_avx_gen2
1768#        (gcm_data     *my_ctx_data,
1769#         gcm_context_data *data,
1770#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1771#                       (from Security Association) concatenated with 8 byte
1772#                       Initialisation Vector (from IPSec ESP Payload)
1773#                       concatenated with 0x00000001. 16-byte aligned pointer. */
1774#        u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1775#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1776#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1777#############################################################
1778ENTRY(aesni_gcm_init_avx_gen2)
1779        FUNC_SAVE
1780        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1781        FUNC_RESTORE
1782        ret
1783ENDPROC(aesni_gcm_init_avx_gen2)
1784
1785###############################################################################
1786#void   aesni_gcm_enc_update_avx_gen2(
1787#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1788#        gcm_context_data *data,
1789#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1790#        const   u8 *in, /* Plaintext input */
1791#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1792###############################################################################
1793ENTRY(aesni_gcm_enc_update_avx_gen2)
1794        FUNC_SAVE
1795        mov     keysize, %eax
1796        cmp     $32, %eax
1797        je      key_256_enc_update
1798        cmp     $16, %eax
1799        je      key_128_enc_update
1800        # must be 192
1801        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1802        FUNC_RESTORE
1803        ret
1804key_128_enc_update:
1805        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1806        FUNC_RESTORE
1807        ret
1808key_256_enc_update:
1809        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1810        FUNC_RESTORE
1811        ret
1812ENDPROC(aesni_gcm_enc_update_avx_gen2)
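###############################################################################
## The keysize dispatch above selects the REP macro argument, i.e. the number
## of full AES rounds performed with vaesenc before the final vaesenclast.
## A hedged C equivalent (the function name is illustrative):
##
#        static int aes_rounds_before_last(unsigned int key_bytes)
#        {
#                switch (key_bytes) {
#                case 16: return 9;      /* AES-128: 10 rounds total */
#                case 32: return 13;     /* AES-256: 14 rounds total */
#                default: return 11;     /* AES-192: 12 rounds total */
#                }
#        }
###############################################################################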
1813
1814###############################################################################
1815#void   aesni_gcm_dec_update_avx_gen2(
1816#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1817#        gcm_context_data *data,
1818#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1819#        const   u8 *in, /* Ciphertext input */
1820#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1821###############################################################################
1822ENTRY(aesni_gcm_dec_update_avx_gen2)
1823        FUNC_SAVE
1824        mov     keysize,%eax
1825        cmp     $32, %eax
1826        je      key_256_dec_update
1827        cmp     $16, %eax
1828        je      key_128_dec_update
1829        # must be 192
1830        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1831        FUNC_RESTORE
1832        ret
1833key_128_dec_update:
1834        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1835        FUNC_RESTORE
1836        ret
1837key_256_dec_update:
1838        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1839        FUNC_RESTORE
1840        ret
1841ENDPROC(aesni_gcm_dec_update_avx_gen2)
1842
1843###############################################################################
1844#void   aesni_gcm_finalize_avx_gen2(
1845#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1846#        gcm_context_data *data,
1847#        u8      *auth_tag, /* Authenticated Tag output. */
1848#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1849#                               Valid values are 16 (most likely), 12 or 8. */
1850###############################################################################
1851ENTRY(aesni_gcm_finalize_avx_gen2)
1852        FUNC_SAVE
1853        mov     keysize,%eax
1854        cmp     $32, %eax
1855        je      key_256_finalize
1856        cmp     $16, %eax
1857        je      key_128_finalize
1858        # must be 192
1859        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1860        FUNC_RESTORE
1861        ret
1862key_128_finalize:
1863        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1864        FUNC_RESTORE
1865        ret
1866key_256_finalize:
1867        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1868        FUNC_RESTORE
1869        ret
1870ENDPROC(aesni_gcm_finalize_avx_gen2)
1871
1872#endif /* CONFIG_AS_AVX */
1873
1874#ifdef CONFIG_AS_AVX2
1875###############################################################################
1876# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1877# Input: A and B (128-bits each, bit-reflected)
1878# Output: C = A*B*x mod poly, (i.e. >>1 )
1879# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1880# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1881###############################################################################
1882.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1883
1884        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1885        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1886        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1887        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1888        vpxor           \T3, \GH, \GH
1889
1890
1891        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1892        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1893
1894        vpxor           \T3, \T1, \T1
1895        vpxor           \T2, \GH, \GH
1896
1897        #######################################################################
1898        #first phase of the reduction
1899        vmovdqa         POLY2(%rip), \T3
1900
1901        vpclmulqdq      $0x01, \GH, \T3, \T2
1902        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1903
1904        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1905        #######################################################################
1906        #second phase of the reduction
1907        vpclmulqdq      $0x00, \GH, \T3, \T2
1908        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1909
1910        vpclmulqdq      $0x10, \GH, \T3, \GH
1911        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1912
1913        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1914        #######################################################################
1915        vpxor           \T1, \GH, \GH          # the result is in GH
1916
1917
1918.endm
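###############################################################################
## GHASH_MUL_AVX2 forms the full 256-bit product with four vpclmulqdq
## operations (schoolbook, not Karatsuba) and then reduces it with further
## PCLMULs against the POLY2 constant loaded above.  A hedged C/PCLMUL sketch
## of the product step only (helper name and hi/lo outputs are illustrative):
##
#        #include <wmmintrin.h>
#        #include <emmintrin.h>
#
#        static void clmul_256(__m128i gh, __m128i hk, __m128i *hi, __m128i *lo)
#        {
#                __m128i a1b1 = _mm_clmulepi64_si128(gh, hk, 0x11);
#                __m128i a0b0 = _mm_clmulepi64_si128(gh, hk, 0x00);
#                __m128i mid  = _mm_xor_si128(_mm_clmulepi64_si128(gh, hk, 0x01),
#                                             _mm_clmulepi64_si128(gh, hk, 0x10));
#
#                /* fold the middle term into the high and low halves */
#                *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));
#                *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));
#        }
###############################################################################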
1919
1920.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1921
1922        # Precompute HashKey^2 through HashKey^8 (each H^i<<1 mod poly) for the 8-block loop
1923        vmovdqa  \HK, \T5
1924        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1925        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1926
1927        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1928        vmovdqu  \T5, HashKey_3(arg2)
1929
1930        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1931        vmovdqu  \T5, HashKey_4(arg2)
1932
1933        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1934        vmovdqu  \T5, HashKey_5(arg2)
1935
1936        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1937        vmovdqu  \T5, HashKey_6(arg2)
1938
1939        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1940        vmovdqu  \T5, HashKey_7(arg2)
1941
1942        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1943        vmovdqu  \T5, HashKey_8(arg2)
1944
1945.endm
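###############################################################################
## PRECOMPUTE_AVX2 builds the HashKey_2 .. HashKey_8 table by repeatedly
## multiplying the running value by H.  A minimal C sketch, assuming a
## ghash_mul() helper with the semantics of GHASH_MUL_AVX2 (both the helper
## and the table layout are illustrative):
##
#        #include <emmintrin.h>
#
#        extern __m128i ghash_mul(__m128i a, __m128i h);   /* assumed helper */
#
#        static void precompute_hashkeys(__m128i h, __m128i table[8])
#        {
#                __m128i t = h;
#                int i;
#
#                table[0] = h;                   /* HashKey   = H<<1 mod poly */
#                for (i = 1; i < 8; i++) {
#                        t = ghash_mul(t, h);    /* H^(i+1)<<1 mod poly       */
#                        table[i] = t;           /* HashKey_(i+1)             */
#                }
#        }
###############################################################################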
1946
1947## if a = number of total plaintext bytes
1948## b = floor(a/16)
1949## num_initial_blocks = b mod 8
1950## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1951## r10, r11, r12, rax are clobbered
1952## arg1, arg3, arg4, r14 are used as pointers only, not modified
1953
1954.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1955        i = (8-\num_initial_blocks)
1956        setreg
1957        vmovdqu AadHash(arg2), reg_i
1958
1959        # start AES for num_initial_blocks blocks
1960        vmovdqu CurCount(arg2), \CTR
1961
1962        i = (9-\num_initial_blocks)
1963        setreg
1964.rep \num_initial_blocks
1965                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1966                vmovdqa \CTR, reg_i
1967                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1968        i = (i+1)
1969        setreg
1970.endr
1971
1972        vmovdqa  (arg1), \T_key
1973        i = (9-\num_initial_blocks)
1974        setreg
1975.rep \num_initial_blocks
1976                vpxor   \T_key, reg_i, reg_i
1977        i = (i+1)
1978        setreg
1979.endr
1980
1981        j = 1
1982        setreg
1983.rep \REP
1984        vmovdqa  16*j(arg1), \T_key
1985        i = (9-\num_initial_blocks)
1986        setreg
1987.rep \num_initial_blocks
1988        vaesenc \T_key, reg_i, reg_i
1989        i = (i+1)
1990        setreg
1991.endr
1992
1993        j = (j+1)
1994        setreg
1995.endr
1996
1997
1998        vmovdqa  16*j(arg1), \T_key
1999        i = (9-\num_initial_blocks)
2000        setreg
2001.rep \num_initial_blocks
2002        vaesenclast      \T_key, reg_i, reg_i
2003        i = (i+1)
2004        setreg
2005.endr
2006
2007        i = (9-\num_initial_blocks)
2008        setreg
2009.rep \num_initial_blocks
2010                vmovdqu (arg4, %r11), \T1
2011                vpxor   \T1, reg_i, reg_i
2012                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
2013                                                       # num_initial_blocks blocks
2014                add     $16, %r11
2015.if  \ENC_DEC == DEC
2016                vmovdqa \T1, reg_i
2017.endif
2018                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2019        i = (i+1)
2020        setreg
2021.endr
2022
2023
2024        i = (8-\num_initial_blocks)
2025        j = (9-\num_initial_blocks)
2026        setreg
2027
2028.rep \num_initial_blocks
2029        vpxor    reg_i, reg_j, reg_j
2030        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2031        i = (i+1)
2032        j = (j+1)
2033        setreg
2034.endr
2035        # XMM8 has the combined result here
2036
2037        vmovdqa  \XMM8, TMP1(%rsp)
2038        vmovdqa  \XMM8, \T3
2039
2040        cmp     $128, %r13
2041        jl      _initial_blocks_done\@                  # fewer than 8 full blocks remain: skip the 8-block setup
2042
2043###############################################################################
2044# Encrypt the first 8 full blocks up front; their GHASH is folded in by the first pass of the main loop
2045                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2046                vmovdqa  \CTR, \XMM1
2047                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2048
2049                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2050                vmovdqa  \CTR, \XMM2
2051                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2052
2053                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2054                vmovdqa  \CTR, \XMM3
2055                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2056
2057                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2058                vmovdqa  \CTR, \XMM4
2059                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2060
2061                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2062                vmovdqa  \CTR, \XMM5
2063                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2064
2065                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2066                vmovdqa  \CTR, \XMM6
2067                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2068
2069                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2070                vmovdqa  \CTR, \XMM7
2071                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2072
2073                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2074                vmovdqa  \CTR, \XMM8
2075                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2076
2077                vmovdqa  (arg1), \T_key
2078                vpxor    \T_key, \XMM1, \XMM1
2079                vpxor    \T_key, \XMM2, \XMM2
2080                vpxor    \T_key, \XMM3, \XMM3
2081                vpxor    \T_key, \XMM4, \XMM4
2082                vpxor    \T_key, \XMM5, \XMM5
2083                vpxor    \T_key, \XMM6, \XMM6
2084                vpxor    \T_key, \XMM7, \XMM7
2085                vpxor    \T_key, \XMM8, \XMM8
2086
2087                i = 1
2088                setreg
2089.rep    \REP       # do REP rounds
2090                vmovdqa  16*i(arg1), \T_key
2091                vaesenc  \T_key, \XMM1, \XMM1
2092                vaesenc  \T_key, \XMM2, \XMM2
2093                vaesenc  \T_key, \XMM3, \XMM3
2094                vaesenc  \T_key, \XMM4, \XMM4
2095                vaesenc  \T_key, \XMM5, \XMM5
2096                vaesenc  \T_key, \XMM6, \XMM6
2097                vaesenc  \T_key, \XMM7, \XMM7
2098                vaesenc  \T_key, \XMM8, \XMM8
2099                i = (i+1)
2100                setreg
2101.endr
2102
2103
2104                vmovdqa  16*i(arg1), \T_key
2105                vaesenclast  \T_key, \XMM1, \XMM1
2106                vaesenclast  \T_key, \XMM2, \XMM2
2107                vaesenclast  \T_key, \XMM3, \XMM3
2108                vaesenclast  \T_key, \XMM4, \XMM4
2109                vaesenclast  \T_key, \XMM5, \XMM5
2110                vaesenclast  \T_key, \XMM6, \XMM6
2111                vaesenclast  \T_key, \XMM7, \XMM7
2112                vaesenclast  \T_key, \XMM8, \XMM8
2113
2114                vmovdqu  (arg4, %r11), \T1
2115                vpxor    \T1, \XMM1, \XMM1
2116                vmovdqu  \XMM1, (arg3 , %r11)
2117                .if   \ENC_DEC == DEC
2118                vmovdqa  \T1, \XMM1
2119                .endif
2120
2121                vmovdqu  16*1(arg4, %r11), \T1
2122                vpxor    \T1, \XMM2, \XMM2
2123                vmovdqu  \XMM2, 16*1(arg3 , %r11)
2124                .if   \ENC_DEC == DEC
2125                vmovdqa  \T1, \XMM2
2126                .endif
2127
2128                vmovdqu  16*2(arg4, %r11), \T1
2129                vpxor    \T1, \XMM3, \XMM3
2130                vmovdqu  \XMM3, 16*2(arg3 , %r11)
2131                .if   \ENC_DEC == DEC
2132                vmovdqa  \T1, \XMM3
2133                .endif
2134
2135                vmovdqu  16*3(arg4, %r11), \T1
2136                vpxor    \T1, \XMM4, \XMM4
2137                vmovdqu  \XMM4, 16*3(arg3 , %r11)
2138                .if   \ENC_DEC == DEC
2139                vmovdqa  \T1, \XMM4
2140                .endif
2141
2142                vmovdqu  16*4(arg4, %r11), \T1
2143                vpxor    \T1, \XMM5, \XMM5
2144                vmovdqu  \XMM5, 16*4(arg3 , %r11)
2145                .if   \ENC_DEC == DEC
2146                vmovdqa  \T1, \XMM5
2147                .endif
2148
2149                vmovdqu  16*5(arg4, %r11), \T1
2150                vpxor    \T1, \XMM6, \XMM6
2151                vmovdqu  \XMM6, 16*5(arg3 , %r11)
2152                .if   \ENC_DEC == DEC
2153                vmovdqa  \T1, \XMM6
2154                .endif
2155
2156                vmovdqu  16*6(arg4, %r11), \T1
2157                vpxor    \T1, \XMM7, \XMM7
2158                vmovdqu  \XMM7, 16*6(arg3 , %r11)
2159                .if   \ENC_DEC == DEC
2160                vmovdqa  \T1, \XMM7
2161                .endif
2162
2163                vmovdqu  16*7(arg4, %r11), \T1
2164                vpxor    \T1, \XMM8, \XMM8
2165                vmovdqu  \XMM8, 16*7(arg3 , %r11)
2166                .if   \ENC_DEC == DEC
2167                vmovdqa  \T1, \XMM8
2168                .endif
2169
2170                add     $128, %r11
2171
2172                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2173                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2174                                                           # the corresponding ciphertext
2175                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2176                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2177                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2178                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2179                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2180                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2181                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2182
2183###############################################################################
2184
2185_initial_blocks_done\@:
2186
2187
2188.endm
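###############################################################################
## Each of the leading num_initial_blocks blocks above is handled one at a
## time: increment the counter, AES-encrypt the byte-swapped counter, XOR with
## the input, write the result out, and absorb the byte-swapped ciphertext
## into the running GHASH.  A hedged C sketch of that per-block flow for the
## encrypt path (all helpers below are assumptions, not part of this file):
##
#        #include <emmintrin.h>
#
#        /* assumed helpers, not defined in this file */
#        extern __m128i ctr_increment(__m128i ctr);
#        extern __m128i byteswap16(__m128i x);
#        extern __m128i aes_encrypt_block(const void *round_keys, __m128i in);
#        extern __m128i ghash_mul(__m128i a, __m128i h);
#
#        static void initial_block_enc(const void *round_keys, __m128i *ctr,
#                                      __m128i *aad_hash, __m128i hashkey,
#                                      const unsigned char *in,
#                                      unsigned char *out)
#        {
#                __m128i ks, ct;
#
#                *ctr = ctr_increment(*ctr);                     /* INCR Y0 */
#                ks = aes_encrypt_block(round_keys, byteswap16(*ctr));
#                ct = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in), ks);
#                _mm_storeu_si128((__m128i *)out, ct);
#
#                /* GHASH absorbs the byte-swapped ciphertext; the decrypt
#                 * path absorbs the byte-swapped input block instead */
#                *aad_hash = ghash_mul(_mm_xor_si128(*aad_hash, byteswap16(ct)),
#                                      hashkey);
#        }
###############################################################################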
2189
2190
2191
2192# encrypt 8 blocks at a time
2193# ghash the 8 previously encrypted ciphertext blocks
2194# arg1, arg3, arg4 are used as pointers only, not modified
2195# r11 is the data offset value
2196.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2197
2198        vmovdqa \XMM1, \T2
2199        vmovdqa \XMM2, TMP2(%rsp)
2200        vmovdqa \XMM3, TMP3(%rsp)
2201        vmovdqa \XMM4, TMP4(%rsp)
2202        vmovdqa \XMM5, TMP5(%rsp)
2203        vmovdqa \XMM6, TMP6(%rsp)
2204        vmovdqa \XMM7, TMP7(%rsp)
2205        vmovdqa \XMM8, TMP8(%rsp)
2206
2207.if \loop_idx == in_order
2208                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2209                vpaddd  ONE(%rip), \XMM1, \XMM2
2210                vpaddd  ONE(%rip), \XMM2, \XMM3
2211                vpaddd  ONE(%rip), \XMM3, \XMM4
2212                vpaddd  ONE(%rip), \XMM4, \XMM5
2213                vpaddd  ONE(%rip), \XMM5, \XMM6
2214                vpaddd  ONE(%rip), \XMM6, \XMM7
2215                vpaddd  ONE(%rip), \XMM7, \XMM8
2216                vmovdqa \XMM8, \CTR
2217
2218                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2219                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2220                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2221                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2222                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2223                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2224                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2225                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2226.else
2227                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2228                vpaddd  ONEf(%rip), \XMM1, \XMM2
2229                vpaddd  ONEf(%rip), \XMM2, \XMM3
2230                vpaddd  ONEf(%rip), \XMM3, \XMM4
2231                vpaddd  ONEf(%rip), \XMM4, \XMM5
2232                vpaddd  ONEf(%rip), \XMM5, \XMM6
2233                vpaddd  ONEf(%rip), \XMM6, \XMM7
2234                vpaddd  ONEf(%rip), \XMM7, \XMM8
2235                vmovdqa \XMM8, \CTR
2236.endif
2237
2238
2239        #######################################################################
2240
2241                vmovdqu (arg1), \T1
2242                vpxor   \T1, \XMM1, \XMM1
2243                vpxor   \T1, \XMM2, \XMM2
2244                vpxor   \T1, \XMM3, \XMM3
2245                vpxor   \T1, \XMM4, \XMM4
2246                vpxor   \T1, \XMM5, \XMM5
2247                vpxor   \T1, \XMM6, \XMM6
2248                vpxor   \T1, \XMM7, \XMM7
2249                vpxor   \T1, \XMM8, \XMM8
2250
2251        #######################################################################
2252
2253
2254
2255
2256
2257                vmovdqu 16*1(arg1), \T1
2258                vaesenc \T1, \XMM1, \XMM1
2259                vaesenc \T1, \XMM2, \XMM2
2260                vaesenc \T1, \XMM3, \XMM3
2261                vaesenc \T1, \XMM4, \XMM4
2262                vaesenc \T1, \XMM5, \XMM5
2263                vaesenc \T1, \XMM6, \XMM6
2264                vaesenc \T1, \XMM7, \XMM7
2265                vaesenc \T1, \XMM8, \XMM8
2266
2267                vmovdqu 16*2(arg1), \T1
2268                vaesenc \T1, \XMM1, \XMM1
2269                vaesenc \T1, \XMM2, \XMM2
2270                vaesenc \T1, \XMM3, \XMM3
2271                vaesenc \T1, \XMM4, \XMM4
2272                vaesenc \T1, \XMM5, \XMM5
2273                vaesenc \T1, \XMM6, \XMM6
2274                vaesenc \T1, \XMM7, \XMM7
2275                vaesenc \T1, \XMM8, \XMM8
2276
2277
2278        #######################################################################
2279
2280        vmovdqu         HashKey_8(arg2), \T5
2281        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2282        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2283        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2284        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2285        vpxor           \T5, \T6, \T6
2286
2287                vmovdqu 16*3(arg1), \T1
2288                vaesenc \T1, \XMM1, \XMM1
2289                vaesenc \T1, \XMM2, \XMM2
2290                vaesenc \T1, \XMM3, \XMM3
2291                vaesenc \T1, \XMM4, \XMM4
2292                vaesenc \T1, \XMM5, \XMM5
2293                vaesenc \T1, \XMM6, \XMM6
2294                vaesenc \T1, \XMM7, \XMM7
2295                vaesenc \T1, \XMM8, \XMM8
2296
2297        vmovdqa         TMP2(%rsp), \T1
2298        vmovdqu         HashKey_7(arg2), \T5
2299        vpclmulqdq      $0x11, \T5, \T1, \T3
2300        vpxor           \T3, \T4, \T4
2301
2302        vpclmulqdq      $0x00, \T5, \T1, \T3
2303        vpxor           \T3, \T7, \T7
2304
2305        vpclmulqdq      $0x01, \T5, \T1, \T3
2306        vpxor           \T3, \T6, \T6
2307
2308        vpclmulqdq      $0x10, \T5, \T1, \T3
2309        vpxor           \T3, \T6, \T6
2310
2311                vmovdqu 16*4(arg1), \T1
2312                vaesenc \T1, \XMM1, \XMM1
2313                vaesenc \T1, \XMM2, \XMM2
2314                vaesenc \T1, \XMM3, \XMM3
2315                vaesenc \T1, \XMM4, \XMM4
2316                vaesenc \T1, \XMM5, \XMM5
2317                vaesenc \T1, \XMM6, \XMM6
2318                vaesenc \T1, \XMM7, \XMM7
2319                vaesenc \T1, \XMM8, \XMM8
2320
2321        #######################################################################
2322
2323        vmovdqa         TMP3(%rsp), \T1
2324        vmovdqu         HashKey_6(arg2), \T5
2325        vpclmulqdq      $0x11, \T5, \T1, \T3
2326        vpxor           \T3, \T4, \T4
2327
2328        vpclmulqdq      $0x00, \T5, \T1, \T3
2329        vpxor           \T3, \T7, \T7
2330
2331        vpclmulqdq      $0x01, \T5, \T1, \T3
2332        vpxor           \T3, \T6, \T6
2333
2334        vpclmulqdq      $0x10, \T5, \T1, \T3
2335        vpxor           \T3, \T6, \T6
2336
2337                vmovdqu 16*5(arg1), \T1
2338                vaesenc \T1, \XMM1, \XMM1
2339                vaesenc \T1, \XMM2, \XMM2
2340                vaesenc \T1, \XMM3, \XMM3
2341                vaesenc \T1, \XMM4, \XMM4
2342                vaesenc \T1, \XMM5, \XMM5
2343                vaesenc \T1, \XMM6, \XMM6
2344                vaesenc \T1, \XMM7, \XMM7
2345                vaesenc \T1, \XMM8, \XMM8
2346
2347        vmovdqa         TMP4(%rsp), \T1
2348        vmovdqu         HashKey_5(arg2), \T5
2349        vpclmulqdq      $0x11, \T5, \T1, \T3
2350        vpxor           \T3, \T4, \T4
2351
2352        vpclmulqdq      $0x00, \T5, \T1, \T3
2353        vpxor           \T3, \T7, \T7
2354
2355        vpclmulqdq      $0x01, \T5, \T1, \T3
2356        vpxor           \T3, \T6, \T6
2357
2358        vpclmulqdq      $0x10, \T5, \T1, \T3
2359        vpxor           \T3, \T6, \T6
2360
2361                vmovdqu 16*6(arg1), \T1
2362                vaesenc \T1, \XMM1, \XMM1
2363                vaesenc \T1, \XMM2, \XMM2
2364                vaesenc \T1, \XMM3, \XMM3
2365                vaesenc \T1, \XMM4, \XMM4
2366                vaesenc \T1, \XMM5, \XMM5
2367                vaesenc \T1, \XMM6, \XMM6
2368                vaesenc \T1, \XMM7, \XMM7
2369                vaesenc \T1, \XMM8, \XMM8
2370
2371
2372        vmovdqa         TMP5(%rsp), \T1
2373        vmovdqu         HashKey_4(arg2), \T5
2374        vpclmulqdq      $0x11, \T5, \T1, \T3
2375        vpxor           \T3, \T4, \T4
2376
2377        vpclmulqdq      $0x00, \T5, \T1, \T3
2378        vpxor           \T3, \T7, \T7
2379
2380        vpclmulqdq      $0x01, \T5, \T1, \T3
2381        vpxor           \T3, \T6, \T6
2382
2383        vpclmulqdq      $0x10, \T5, \T1, \T3
2384        vpxor           \T3, \T6, \T6
2385
2386                vmovdqu 16*7(arg1), \T1
2387                vaesenc \T1, \XMM1, \XMM1
2388                vaesenc \T1, \XMM2, \XMM2
2389                vaesenc \T1, \XMM3, \XMM3
2390                vaesenc \T1, \XMM4, \XMM4
2391                vaesenc \T1, \XMM5, \XMM5
2392                vaesenc \T1, \XMM6, \XMM6
2393                vaesenc \T1, \XMM7, \XMM7
2394                vaesenc \T1, \XMM8, \XMM8
2395
2396        vmovdqa         TMP6(%rsp), \T1
2397        vmovdqu         HashKey_3(arg2), \T5
2398        vpclmulqdq      $0x11, \T5, \T1, \T3
2399        vpxor           \T3, \T4, \T4
2400
2401        vpclmulqdq      $0x00, \T5, \T1, \T3
2402        vpxor           \T3, \T7, \T7
2403
2404        vpclmulqdq      $0x01, \T5, \T1, \T3
2405        vpxor           \T3, \T6, \T6
2406
2407        vpclmulqdq      $0x10, \T5, \T1, \T3
2408        vpxor           \T3, \T6, \T6
2409
2410                vmovdqu 16*8(arg1), \T1
2411                vaesenc \T1, \XMM1, \XMM1
2412                vaesenc \T1, \XMM2, \XMM2
2413                vaesenc \T1, \XMM3, \XMM3
2414                vaesenc \T1, \XMM4, \XMM4
2415                vaesenc \T1, \XMM5, \XMM5
2416                vaesenc \T1, \XMM6, \XMM6
2417                vaesenc \T1, \XMM7, \XMM7
2418                vaesenc \T1, \XMM8, \XMM8
2419
2420        vmovdqa         TMP7(%rsp), \T1
2421        vmovdqu         HashKey_2(arg2), \T5
2422        vpclmulqdq      $0x11, \T5, \T1, \T3
2423        vpxor           \T3, \T4, \T4
2424
2425        vpclmulqdq      $0x00, \T5, \T1, \T3
2426        vpxor           \T3, \T7, \T7
2427
2428        vpclmulqdq      $0x01, \T5, \T1, \T3
2429        vpxor           \T3, \T6, \T6
2430
2431        vpclmulqdq      $0x10, \T5, \T1, \T3
2432        vpxor           \T3, \T6, \T6
2433
2434
2435        #######################################################################
2436
2437                vmovdqu 16*9(arg1), \T5
2438                vaesenc \T5, \XMM1, \XMM1
2439                vaesenc \T5, \XMM2, \XMM2
2440                vaesenc \T5, \XMM3, \XMM3
2441                vaesenc \T5, \XMM4, \XMM4
2442                vaesenc \T5, \XMM5, \XMM5
2443                vaesenc \T5, \XMM6, \XMM6
2444                vaesenc \T5, \XMM7, \XMM7
2445                vaesenc \T5, \XMM8, \XMM8
2446
2447        vmovdqa         TMP8(%rsp), \T1
2448        vmovdqu         HashKey(arg2), \T5
2449
2450        vpclmulqdq      $0x00, \T5, \T1, \T3
2451        vpxor           \T3, \T7, \T7
2452
2453        vpclmulqdq      $0x01, \T5, \T1, \T3
2454        vpxor           \T3, \T6, \T6
2455
2456        vpclmulqdq      $0x10, \T5, \T1, \T3
2457        vpxor           \T3, \T6, \T6
2458
2459        vpclmulqdq      $0x11, \T5, \T1, \T3
2460        vpxor           \T3, \T4, \T1
2461
2462
2463                vmovdqu 16*10(arg1), \T5
2464
2465        i = 11
2466        setreg
2467.rep (\REP-9)
2468        vaesenc \T5, \XMM1, \XMM1
2469        vaesenc \T5, \XMM2, \XMM2
2470        vaesenc \T5, \XMM3, \XMM3
2471        vaesenc \T5, \XMM4, \XMM4
2472        vaesenc \T5, \XMM5, \XMM5
2473        vaesenc \T5, \XMM6, \XMM6
2474        vaesenc \T5, \XMM7, \XMM7
2475        vaesenc \T5, \XMM8, \XMM8
2476
2477        vmovdqu 16*i(arg1), \T5
2478        i = i + 1
2479        setreg
2480.endr
2481
2482        i = 0
2483        j = 1
2484        setreg
2485.rep 8
2486                vpxor   16*i(arg4, %r11), \T5, \T2
2487                .if \ENC_DEC == ENC
2488                vaesenclast     \T2, reg_j, reg_j
2489                .else
2490                vaesenclast     \T2, reg_j, \T3
2491                vmovdqu 16*i(arg4, %r11), reg_j
2492                vmovdqu \T3, 16*i(arg3, %r11)
2493                .endif
2494        i = (i+1)
2495        j = (j+1)
2496        setreg
2497.endr
2498        #######################################################################
2499
2500
2501        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
2502        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
2503        vpxor   \T3, \T7, \T7
2504        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7
2505
2506
2507
2508        #######################################################################
2509        #first phase of the reduction
2510        vmovdqa         POLY2(%rip), \T3
2511
2512        vpclmulqdq      $0x01, \T7, \T3, \T2
2513        vpslldq         $8, \T2, \T2                    # shift-L xmm2 2 DWs
2514
2515        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
2516        #######################################################################
2517                .if \ENC_DEC == ENC
2518                vmovdqu  \XMM1, 16*0(arg3,%r11)         # Write to the Ciphertext buffer
2519                vmovdqu  \XMM2, 16*1(arg3,%r11)         # Write to the Ciphertext buffer
2520                vmovdqu  \XMM3, 16*2(arg3,%r11)         # Write to the Ciphertext buffer
2521                vmovdqu  \XMM4, 16*3(arg3,%r11)         # Write to the Ciphertext buffer
2522                vmovdqu  \XMM5, 16*4(arg3,%r11)         # Write to the Ciphertext buffer
2523                vmovdqu  \XMM6, 16*5(arg3,%r11)         # Write to the Ciphertext buffer
2524                vmovdqu  \XMM7, 16*6(arg3,%r11)         # Write to the Ciphertext buffer
2525                vmovdqu  \XMM8, 16*7(arg3,%r11)         # Write to the Ciphertext buffer
2526                .endif
2527
2528        #######################################################################
2529        #second phase of the reduction
2530        vpclmulqdq      $0x00, \T7, \T3, \T2
2531        vpsrldq         $4, \T2, \T2                    # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2532
2533        vpclmulqdq      $0x10, \T7, \T3, \T4
2534        vpslldq         $4, \T4, \T4                    # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2535
2536        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
2537        #######################################################################
2538        vpxor           \T4, \T1, \T1                   # the result is in T1
2539
2540                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
2541                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
2542                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
2543                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
2544                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
2545                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
2546                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
2547                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
2548
2549
2550        vpxor   \T1, \XMM1, \XMM1
2551
2552
2553
2554.endm
2555
2556
2557# GHASH the last 8 ciphertext blocks.
2558.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2559
2560        ## Karatsuba Method
2561
2562        vmovdqu         HashKey_8(arg2), \T5
2563
2564        vpshufd         $0b01001110, \XMM1, \T2
2565        vpshufd         $0b01001110, \T5, \T3
2566        vpxor           \XMM1, \T2, \T2
2567        vpxor           \T5, \T3, \T3
2568
2569        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2570        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2571
2572        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2573
2574        ######################
2575
2576        vmovdqu         HashKey_7(arg2), \T5
2577        vpshufd         $0b01001110, \XMM2, \T2
2578        vpshufd         $0b01001110, \T5, \T3
2579        vpxor           \XMM2, \T2, \T2
2580        vpxor           \T5, \T3, \T3
2581
2582        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2583        vpxor           \T4, \T6, \T6
2584
2585        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2586        vpxor           \T4, \T7, \T7
2587
2588        vpclmulqdq      $0x00, \T3, \T2, \T2
2589
2590        vpxor           \T2, \XMM1, \XMM1
2591
2592        ######################
2593
2594        vmovdqu         HashKey_6(arg2), \T5
2595        vpshufd         $0b01001110, \XMM3, \T2
2596        vpshufd         $0b01001110, \T5, \T3
2597        vpxor           \XMM3, \T2, \T2
2598        vpxor           \T5, \T3, \T3
2599
2600        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2601        vpxor           \T4, \T6, \T6
2602
2603        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2604        vpxor           \T4, \T7, \T7
2605
2606        vpclmulqdq      $0x00, \T3, \T2, \T2
2607
2608        vpxor           \T2, \XMM1, \XMM1
2609
2610        ######################
2611
2612        vmovdqu         HashKey_5(arg2), \T5
2613        vpshufd         $0b01001110, \XMM4, \T2
2614        vpshufd         $0b01001110, \T5, \T3
2615        vpxor           \XMM4, \T2, \T2
2616        vpxor           \T5, \T3, \T3
2617
2618        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2619        vpxor           \T4, \T6, \T6
2620
2621        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2622        vpxor           \T4, \T7, \T7
2623
2624        vpclmulqdq      $0x00, \T3, \T2, \T2
2625
2626        vpxor           \T2, \XMM1, \XMM1
2627
2628        ######################
2629
2630        vmovdqu         HashKey_4(arg2), \T5
2631        vpshufd         $0b01001110, \XMM5, \T2
2632        vpshufd         $0b01001110, \T5, \T3
2633        vpxor           \XMM5, \T2, \T2
2634        vpxor           \T5, \T3, \T3
2635
2636        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2637        vpxor           \T4, \T6, \T6
2638
2639        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2640        vpxor           \T4, \T7, \T7
2641
2642        vpclmulqdq      $0x00, \T3, \T2, \T2
2643
2644        vpxor           \T2, \XMM1, \XMM1
2645
2646        ######################
2647
2648        vmovdqu         HashKey_3(arg2), \T5
2649        vpshufd         $0b01001110, \XMM6, \T2
2650        vpshufd         $0b01001110, \T5, \T3
2651        vpxor           \XMM6, \T2, \T2
2652        vpxor           \T5, \T3, \T3
2653
2654        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2655        vpxor           \T4, \T6, \T6
2656
2657        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2658        vpxor           \T4, \T7, \T7
2659
2660        vpclmulqdq      $0x00, \T3, \T2, \T2
2661
2662        vpxor           \T2, \XMM1, \XMM1
2663
2664        ######################
2665
2666        vmovdqu         HashKey_2(arg2), \T5
2667        vpshufd         $0b01001110, \XMM7, \T2
2668        vpshufd         $0b01001110, \T5, \T3
2669        vpxor           \XMM7, \T2, \T2
2670        vpxor           \T5, \T3, \T3
2671
2672        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2673        vpxor           \T4, \T6, \T6
2674
2675        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2676        vpxor           \T4, \T7, \T7
2677
2678        vpclmulqdq      $0x00, \T3, \T2, \T2
2679
2680        vpxor           \T2, \XMM1, \XMM1
2681
2682        ######################
2683
2684        vmovdqu         HashKey(arg2), \T5
2685        vpshufd         $0b01001110, \XMM8, \T2
2686        vpshufd         $0b01001110, \T5, \T3
2687        vpxor           \XMM8, \T2, \T2
2688        vpxor           \T5, \T3, \T3
2689
2690        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2691        vpxor           \T4, \T6, \T6
2692
2693        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2694        vpxor           \T4, \T7, \T7
2695
2696        vpclmulqdq      $0x00, \T3, \T2, \T2
2697
2698        vpxor           \T2, \XMM1, \XMM1
2699        vpxor           \T6, \XMM1, \XMM1
2700        vpxor           \T7, \XMM1, \T2
2701
2702
2703
2704
2705        vpslldq $8, \T2, \T4
2706        vpsrldq $8, \T2, \T2
2707
2708        vpxor   \T4, \T7, \T7
2709        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2710                                                   # accumulated carry-less multiplications
2711
2712        #######################################################################
2713        #first phase of the reduction
2714        vmovdqa         POLY2(%rip), \T3
2715
2716        vpclmulqdq      $0x01, \T7, \T3, \T2
2717        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2718
2719        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2720        #######################################################################
2721
2722
2723        #second phase of the reduction
2724        vpclmulqdq      $0x00, \T7, \T3, \T2
2725        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2726
2727        vpclmulqdq      $0x10, \T7, \T3, \T4
2728        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2729
2730        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2731        #######################################################################
2732        vpxor           \T4, \T6, \T6              # the result is in T6
2733.endm
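###############################################################################
## Unlike GHASH_LAST_8_AVX, the AVX2 variant above does not load the stored
## HashKey_i_k values; it derives hk.hi ^ hk.lo on the fly with a vpshufd and
## a vpxor.  In terms of the hedged karatsuba_step() sketch given after
## GHASH_LAST_8_AVX, the equivalent (illustrative) step is:
##
#        static void karatsuba_step_avx2(__m128i xi, __m128i hk,
#                                        __m128i *hi, __m128i *lo, __m128i *mid)
#        {
#                __m128i tx = _mm_xor_si128(xi, _mm_shuffle_epi32(xi, 0x4e));
#                __m128i th = _mm_xor_si128(hk, _mm_shuffle_epi32(hk, 0x4e));
#
#                *hi  = _mm_xor_si128(*hi, _mm_clmulepi64_si128(xi, hk, 0x11));
#                *lo  = _mm_xor_si128(*lo, _mm_clmulepi64_si128(xi, hk, 0x00));
#                *mid = _mm_xor_si128(*mid, _mm_clmulepi64_si128(tx, th, 0x00));
#        }
###############################################################################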
2734
2735
2736
2737#############################################################
2738#void   aesni_gcm_init_avx_gen4
2739#        (gcm_data     *my_ctx_data,
2740#         gcm_context_data *data,
2741#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2742#                       (from Security Association) concatenated with 8 byte
2743#                       Initialisation Vector (from IPSec ESP Payload)
2744#                       concatenated with 0x00000001. 16-byte aligned pointer. */
2745#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2746#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2747#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2748#############################################################
2749ENTRY(aesni_gcm_init_avx_gen4)
2750        FUNC_SAVE
2751        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2752        FUNC_RESTORE
2753        ret
2754ENDPROC(aesni_gcm_init_avx_gen4)
2755
2756###############################################################################
2757#void   aesni_gcm_enc_update_avx_gen4(
2758#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2759#        gcm_context_data *data,
2760#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2761#        const   u8 *in, /* Plaintext input */
2762#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2763###############################################################################
2764ENTRY(aesni_gcm_enc_update_avx_gen4)
2765        FUNC_SAVE
2766        mov     keysize,%eax
2767        cmp     $32, %eax
2768        je      key_256_enc_update4
2769        cmp     $16, %eax
2770        je      key_128_enc_update4
2771        # must be 192
2772        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2773        FUNC_RESTORE
2774        ret
2775key_128_enc_update4:
2776        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2777        FUNC_RESTORE
2778        ret
2779key_256_enc_update4:
2780        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2781        FUNC_RESTORE
2782        ret
2783ENDPROC(aesni_gcm_enc_update_avx_gen4)
2784
2785###############################################################################
2786#void   aesni_gcm_dec_update_avx_gen4(
2787#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2788#        gcm_context_data *data,
2789#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2790#        const   u8 *in, /* Ciphertext input */
2791#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2792###############################################################################
2793ENTRY(aesni_gcm_dec_update_avx_gen4)
2794        FUNC_SAVE
2795        mov     keysize,%eax
2796        cmp     $32, %eax
2797        je      key_256_dec_update4
2798        cmp     $16, %eax
2799        je      key_128_dec_update4
2800        # must be 192
2801        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2802        FUNC_RESTORE
2803        ret
2804key_128_dec_update4:
2805        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2806        FUNC_RESTORE
2807        ret
2808key_256_dec_update4:
2809        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2810        FUNC_RESTORE
2811        ret
2812ENDPROC(aesni_gcm_dec_update_avx_gen4)
2813
2814###############################################################################
2815#void   aesni_gcm_finalize_avx_gen4(
2816#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2817#        gcm_context_data *data,
2818#        u8      *auth_tag, /* Authenticated Tag output. */
2819#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2820#                              Valid values are 16 (most likely), 12 or 8. */
2821###############################################################################
2822ENTRY(aesni_gcm_finalize_avx_gen4)
2823        FUNC_SAVE
2824        mov     keysize,%eax
2825        cmp     $32, %eax
2826        je      key_256_finalize4
2827        cmp     $16, %eax
2828        je      key_128_finalize4
2829        # must be 192
2830        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2831        FUNC_RESTORE
2832        ret
2833key_128_finalize4:
2834        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2835        FUNC_RESTORE
2836        ret
2837key_256_finalize4:
2838        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2839        FUNC_RESTORE
2840        ret
2841ENDPROC(aesni_gcm_finalize_avx_gen4)
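###############################################################################
## Typical call sequence for the gen4 entry points above, as a hedged C sketch
## based only on the prototypes documented in this file: the gcm_data and
## gcm_context_data layouts are opaque here, the wrapper name is illustrative,
## and in-kernel callers must additionally bracket these calls with
## kernel_fpu_begin()/kernel_fpu_end().
##
#        typedef unsigned char u8;
#        typedef unsigned long long u64;
#
#        extern void aesni_gcm_init_avx_gen4(void *ctx, void *data, u8 *iv,
#                                            u8 *hash_subkey, const u8 *aad,
#                                            u64 aad_len);
#        extern void aesni_gcm_enc_update_avx_gen4(void *ctx, void *data,
#                                                  u8 *out, const u8 *in,
#                                                  u64 plaintext_len);
#        extern void aesni_gcm_finalize_avx_gen4(void *ctx, void *data,
#                                                u8 *auth_tag, u64 auth_tag_len);
#
#        static void gcm_avx2_encrypt_one_shot(void *ctx, void *data, u8 *iv,
#                                              u8 *hash_subkey, const u8 *aad,
#                                              u64 aad_len, u8 *out,
#                                              const u8 *in, u64 len,
#                                              u8 tag[16])
#        {
#                aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey,
#                                        aad, aad_len);
#                aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, len);
#                aesni_gcm_finalize_avx_gen4(ctx, data, tag, 16);
#        }
###############################################################################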
2842
2843#endif /* CONFIG_AS_AVX2 */
2844