linux/arch/x86/crypto/aesni-intel_asm.S
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Implement AES algorithm in Intel AES-NI instructions.
   4 *
   5 * The white paper of AES-NI instructions can be downloaded from:
   6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7 *
   8 * Copyright (C) 2008, Intel Corp.
   9 *    Author: Huang Ying <ying.huang@intel.com>
  10 *            Vinodh Gopal <vinodh.gopal@intel.com>
  11 *            Kahraman Akdemir
  12 *
  13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14 * interface for 64-bit kernels.
  15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17 *             Adrian Hoban <adrian.hoban@intel.com>
  18 *             James Guilford (james.guilford@intel.com)
  19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20 *             Tadeusz Struk (tadeusz.struk@intel.com)
  21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22 *    Copyright (c) 2010, Intel Corporation.
  23 *
  24 * Ported x86_64 version to x86:
  25 *    Author: Mathias Krause <minipli@googlemail.com>
  26 */
  27
  28#include <linux/linkage.h>
  29#include <asm/frame.h>
  30#include <asm/nospec-branch.h>
  31
  32/*
  33 * The following macros are used to move an (un)aligned 16 byte value to/from
   34 * an XMM register.  This can be done for either FP or integer values, for FP use
  35 * movaps (move aligned packed single) or integer use movdqa (move double quad
  36 * aligned).  It doesn't make a performance difference which instruction is used
  37 * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  38 * shorter, so that is the one we'll use for now. (same for unaligned).
  39 */
  40#define MOVADQ  movaps
  41#define MOVUDQ  movups
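#
# For illustration: a use such as "MOVADQ SHUF_MASK(%rip), %xmm14" simply
# expands to "movaps SHUF_MASK(%rip), %xmm14", and MOVUDQ likewise substitutes
# movups for the unaligned case.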
  42
  43#ifdef __x86_64__
  44
  45# constants in mergeable sections, linker can reorder and merge
  46.section        .rodata.cst16.POLY, "aM", @progbits, 16
  47.align 16
  48POLY:   .octa 0xC2000000000000000000000000000001
  49.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  50.align 16
  51TWOONE: .octa 0x00000001000000000000000000000001
  52
  53.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54.align 16
  55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  56.section        .rodata.cst16.MASK1, "aM", @progbits, 16
  57.align 16
  58MASK1:      .octa 0x0000000000000000ffffffffffffffff
  59.section        .rodata.cst16.MASK2, "aM", @progbits, 16
  60.align 16
  61MASK2:      .octa 0xffffffffffffffff0000000000000000
  62.section        .rodata.cst16.ONE, "aM", @progbits, 16
  63.align 16
  64ONE:        .octa 0x00000000000000000000000000000001
  65.section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66.align 16
  67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  68.section        .rodata.cst16.dec, "aM", @progbits, 16
  69.align 16
  70dec:        .octa 0x1
  71.section        .rodata.cst16.enc, "aM", @progbits, 16
  72.align 16
  73enc:        .octa 0x2
  74
  75# order of these constants should not change.
  76# more specifically, ALL_F should follow SHIFT_MASK,
  77# and zero should follow ALL_F
  78.section        .rodata, "a", @progbits
  79.align 16
  80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  82            .octa 0x00000000000000000000000000000000
  83
  84.text
  85
  86
  87#define STACK_OFFSET    8*3
  88
  89#define AadHash 16*0
  90#define AadLen 16*1
  91#define InLen (16*1)+8
  92#define PBlockEncKey 16*2
  93#define OrigIV 16*3
  94#define CurCount 16*4
  95#define PBlockLen 16*5
  96#define HashKey         16*6    // store HashKey <<1 mod poly here
  97#define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
  98#define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
  99#define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 100#define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 101                                // bits of  HashKey <<1 mod poly here
 102                                //(for Karatsuba purposes)
 103#define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 104                                // bits of  HashKey^2 <<1 mod poly here
 105                                // (for Karatsuba purposes)
 106#define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 107                                // bits of  HashKey^3 <<1 mod poly here
 108                                // (for Karatsuba purposes)
 109#define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 110                                // bits of  HashKey^4 <<1 mod poly here
 111                                // (for Karatsuba purposes)
 112
 113#define arg1 rdi
 114#define arg2 rsi
 115#define arg3 rdx
 116#define arg4 rcx
 117#define arg5 r8
 118#define arg6 r9
 119#define arg7 STACK_OFFSET+8(%rsp)
 120#define arg8 STACK_OFFSET+16(%rsp)
 121#define arg9 STACK_OFFSET+24(%rsp)
 122#define arg10 STACK_OFFSET+32(%rsp)
 123#define arg11 STACK_OFFSET+40(%rsp)
 124#define keysize 2*15*16(%arg1)
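# The stack-argument offsets above follow from the System V AMD64 calling
# convention combined with FUNC_SAVE below: arguments 1-6 arrive in
# rdi/rsi/rdx/rcx/r8/r9, FUNC_SAVE pushes three callee-saved registers
# (STACK_OFFSET = 8*3), and the return address adds another 8 bytes, so for
# example arg7 sits at STACK_OFFSET+8(%rsp) = 32(%rsp) once those pushes have
# been made.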
 125#endif
 126
 127
 128#define STATE1  %xmm0
 129#define STATE2  %xmm4
 130#define STATE3  %xmm5
 131#define STATE4  %xmm6
 132#define STATE   STATE1
 133#define IN1     %xmm1
 134#define IN2     %xmm7
 135#define IN3     %xmm8
 136#define IN4     %xmm9
 137#define IN      IN1
 138#define KEY     %xmm2
 139#define IV      %xmm3
 140
 141#define BSWAP_MASK %xmm10
 142#define CTR     %xmm11
 143#define INC     %xmm12
 144
 145#define GF128MUL_MASK %xmm7
 146
 147#ifdef __x86_64__
 148#define AREG    %rax
 149#define KEYP    %rdi
 150#define OUTP    %rsi
 151#define UKEYP   OUTP
 152#define INP     %rdx
 153#define LEN     %rcx
 154#define IVP     %r8
 155#define KLEN    %r9d
 156#define T1      %r10
 157#define TKEYP   T1
 158#define T2      %r11
 159#define TCTR_LOW T2
 160#else
 161#define AREG    %eax
 162#define KEYP    %edi
 163#define OUTP    AREG
 164#define UKEYP   OUTP
 165#define INP     %edx
 166#define LEN     %esi
 167#define IVP     %ebp
 168#define KLEN    %ebx
 169#define T1      %ecx
 170#define TKEYP   T1
 171#endif
 172
 173.macro FUNC_SAVE
 174        push    %r12
 175        push    %r13
 176        push    %r14
 177#
 178# states of %xmm registers %xmm6:%xmm15 not saved
 179# all %xmm registers are clobbered
 180#
 181.endm
 182
 183
 184.macro FUNC_RESTORE
 185        pop     %r14
 186        pop     %r13
 187        pop     %r12
 188.endm
 189
 190# Precompute hashkeys.
 191# Input: Hash subkey.
 192# Output: HashKeys stored in gcm_context_data.  Only needs to be called
 193# once per key.
 194# clobbers r12, and tmp xmm registers.
 195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 196        mov     \SUBKEY, %r12
 197        movdqu  (%r12), \TMP3
 198        movdqa  SHUF_MASK(%rip), \TMP2
 199        pshufb  \TMP2, \TMP3
 200
 201        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 202
 203        movdqa  \TMP3, \TMP2
 204        psllq   $1, \TMP3
 205        psrlq   $63, \TMP2
 206        movdqa  \TMP2, \TMP1
 207        pslldq  $8, \TMP2
 208        psrldq  $8, \TMP1
 209        por     \TMP2, \TMP3
 210
 211        # reduce HashKey<<1
 212
 213        pshufd  $0x24, \TMP1, \TMP2
 214        pcmpeqd TWOONE(%rip), \TMP2
 215        pand    POLY(%rip), \TMP2
 216        pxor    \TMP2, \TMP3
 217        movdqu  \TMP3, HashKey(%arg2)
 218
 219        movdqa     \TMP3, \TMP5
 220        pshufd     $78, \TMP3, \TMP1
 221        pxor       \TMP3, \TMP1
 222        movdqu     \TMP1, HashKey_k(%arg2)
 223
 224        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225# TMP5 = HashKey^2<<1 (mod poly)
 226        movdqu     \TMP5, HashKey_2(%arg2)
 227# HashKey_2 = HashKey^2<<1 (mod poly)
 228        pshufd     $78, \TMP5, \TMP1
 229        pxor       \TMP5, \TMP1
 230        movdqu     \TMP1, HashKey_2_k(%arg2)
 231
 232        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 233# TMP5 = HashKey^3<<1 (mod poly)
 234        movdqu     \TMP5, HashKey_3(%arg2)
 235        pshufd     $78, \TMP5, \TMP1
 236        pxor       \TMP5, \TMP1
 237        movdqu     \TMP1, HashKey_3_k(%arg2)
 238
 239        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  240# TMP5 = HashKey^4<<1 (mod poly)
 241        movdqu     \TMP5, HashKey_4(%arg2)
 242        pshufd     $78, \TMP5, \TMP1
 243        pxor       \TMP5, \TMP1
 244        movdqu     \TMP1, HashKey_4_k(%arg2)
 245.endm
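
/*
 * In outline, PRECOMPUTE does the following (a C-like sketch only;
 * ghash_mul(), byte_reflect() and the ctx field names are illustrative
 * stand-ins for the macros and offsets above, not kernel APIs):
 *
 *	u128 h = shift_left_1_mod_poly(byte_reflect(load16(subkey)));
 *	ctx->hash_key[0]   = h;				// HashKey
 *	ctx->hash_key_k[0] = high64(h) ^ low64(h);	// HashKey_k (Karatsuba),
 *							// stored in both halves
 *	u128 hp = h;
 *	for (int i = 1; i < 4; i++) {
 *		hp = ghash_mul(hp, h);			// HashKey^(i+1) <<1 mod poly
 *		ctx->hash_key[i]   = hp;		// HashKey_2 .. HashKey_4
 *		ctx->hash_key_k[i] = high64(hp) ^ low64(hp);
 *	}
 */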
 246
 247# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 248# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
 249.macro GCM_INIT Iv SUBKEY AAD AADLEN
 250        mov \AADLEN, %r11
 251        mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 252        xor %r11d, %r11d
 253        mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 254        mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 255        mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 256        mov \Iv, %rax
 257        movdqu (%rax), %xmm0
 258        movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 259
 260        movdqa  SHUF_MASK(%rip), %xmm2
 261        pshufb %xmm2, %xmm0
 262        movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 263
 264        PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 265        movdqu HashKey(%arg2), %xmm13
 266
 267        CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 268        %xmm4, %xmm5, %xmm6
 269.endm
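
/*
 * GCM_INIT in C-like pseudocode (a sketch; field and helper names mirror the
 * offset #defines and macros above rather than any C declaration):
 *
 *	ctx->aad_length            = aad_len;
 *	ctx->in_length             = 0;
 *	ctx->partial_block_len     = 0;
 *	ctx->partial_block_enc_key = 0;
 *	ctx->orig_iv               = load16(iv);	// pre-counter block J0
 *	ctx->current_counter       = byte_reflect(ctx->orig_iv);
 *	precompute(ctx, subkey);			// PRECOMPUTE above
 *	ctx->aad_hash = calc_aad_hash(ctx->hash_key, aad, aad_len);
 */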
 270
 271# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 272# struct has been initialized by GCM_INIT.
 273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 274# Clobbers rax, r10-r13, and xmm0-xmm15
 275.macro GCM_ENC_DEC operation
 276        movdqu AadHash(%arg2), %xmm8
 277        movdqu HashKey(%arg2), %xmm13
 278        add %arg5, InLen(%arg2)
 279
 280        xor %r11d, %r11d # initialise the data pointer offset as zero
 281        PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 282
 283        sub %r11, %arg5         # sub partial block data used
 284        mov %arg5, %r13         # save the number of bytes
 285
 286        and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 287        mov %r13, %r12
 288        # Encrypt/Decrypt first few blocks
 289
 290        and     $(3<<4), %r12
 291        jz      _initial_num_blocks_is_0_\@
 292        cmp     $(2<<4), %r12
 293        jb      _initial_num_blocks_is_1_\@
 294        je      _initial_num_blocks_is_2_\@
 295_initial_num_blocks_is_3_\@:
 296        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 298        sub     $48, %r13
 299        jmp     _initial_blocks_\@
 300_initial_num_blocks_is_2_\@:
 301        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 303        sub     $32, %r13
 304        jmp     _initial_blocks_\@
 305_initial_num_blocks_is_1_\@:
 306        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 308        sub     $16, %r13
 309        jmp     _initial_blocks_\@
 310_initial_num_blocks_is_0_\@:
 311        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 313_initial_blocks_\@:
 314
 315        # Main loop - Encrypt/Decrypt remaining blocks
 316
 317        test    %r13, %r13
 318        je      _zero_cipher_left_\@
 319        sub     $64, %r13
 320        je      _four_cipher_left_\@
 321_crypt_by_4_\@:
 322        GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 323        %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 324        %xmm7, %xmm8, enc
 325        add     $64, %r11
 326        sub     $64, %r13
 327        jne     _crypt_by_4_\@
 328_four_cipher_left_\@:
 329        GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 331_zero_cipher_left_\@:
 332        movdqu %xmm8, AadHash(%arg2)
 333        movdqu %xmm0, CurCount(%arg2)
 334
 335        mov     %arg5, %r13
 336        and     $15, %r13                       # %r13 = arg5 (mod 16)
 337        je      _multiple_of_16_bytes_\@
 338
 339        mov %r13, PBlockLen(%arg2)
 340
 341        # Handle the last <16 Byte block separately
 342        paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 343        movdqu %xmm0, CurCount(%arg2)
 344        movdqa SHUF_MASK(%rip), %xmm10
 345        pshufb %xmm10, %xmm0
 346
 347        ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 348        movdqu %xmm0, PBlockEncKey(%arg2)
 349
 350        cmp     $16, %arg5
 351        jge _large_enough_update_\@
 352
 353        lea (%arg4,%r11,1), %r10
 354        mov %r13, %r12
 355        READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 356        jmp _data_read_\@
 357
 358_large_enough_update_\@:
 359        sub     $16, %r11
 360        add     %r13, %r11
 361
 362        # receive the last <16 Byte block
 363        movdqu  (%arg4, %r11, 1), %xmm1
 364
 365        sub     %r13, %r11
 366        add     $16, %r11
 367
 368        lea     SHIFT_MASK+16(%rip), %r12
 369        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 370        # (r13 is the number of bytes in plaintext mod 16)
 371        sub     %r13, %r12
 372        # get the appropriate shuffle mask
 373        movdqu  (%r12), %xmm2
 374        # shift right 16-r13 bytes
 375        pshufb  %xmm2, %xmm1
 376
 377_data_read_\@:
 378        lea ALL_F+16(%rip), %r12
 379        sub %r13, %r12
 380
 381.ifc \operation, dec
 382        movdqa  %xmm1, %xmm2
 383.endif
 384        pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 385        movdqu  (%r12), %xmm1
 386        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 387        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 388.ifc \operation, dec
 389        pand    %xmm1, %xmm2
 390        movdqa SHUF_MASK(%rip), %xmm10
 391        pshufb %xmm10 ,%xmm2
 392
 393        pxor %xmm2, %xmm8
 394.else
 395        movdqa SHUF_MASK(%rip), %xmm10
 396        pshufb %xmm10,%xmm0
 397
 398        pxor    %xmm0, %xmm8
 399.endif
 400
 401        movdqu %xmm8, AadHash(%arg2)
 402.ifc \operation, enc
 403        # GHASH computation for the last <16 byte block
 404        movdqa SHUF_MASK(%rip), %xmm10
 405        # shuffle xmm0 back to output as ciphertext
 406        pshufb %xmm10, %xmm0
 407.endif
 408
 409        # Output %r13 bytes
 410        movq %xmm0, %rax
 411        cmp $8, %r13
 412        jle _less_than_8_bytes_left_\@
 413        mov %rax, (%arg3 , %r11, 1)
 414        add $8, %r11
 415        psrldq $8, %xmm0
 416        movq %xmm0, %rax
 417        sub $8, %r13
 418_less_than_8_bytes_left_\@:
 419        mov %al,  (%arg3, %r11, 1)
 420        add $1, %r11
 421        shr $8, %rax
 422        sub $1, %r13
 423        jne _less_than_8_bytes_left_\@
 424_multiple_of_16_bytes_\@:
 425.endm
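
/*
 * The control flow of GCM_ENC_DEC, as a sketch (helper names below are
 * illustrative stand-ins for the macros invoked above):
 *
 *	partial_block(ctx, ...);	// complete a block left over from the
 *					// previous update call, if any
 *	initial_blocks(len/16 % 4);	// 0-3 blocks GHASHed serially; also
 *					// encrypts the next 4 blocks when >= 64
 *					// bytes remain (software pipelining)
 *	while (more than 4 full blocks remain)
 *		ghash_4_encrypt_4_parallel();	// encrypt 4 new counter blocks
 *						// while GHASHing the previous 4
 *	ghash_last_4();			// fold the last 4 ciphertext blocks
 *	if (len % 16)
 *		encrypt_and_hash_partial_tail();	// final <16-byte block
 */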
 426
  427# GCM_COMPLETE finishes the GHASH of any remaining partial block and computes
  428# the final authentication tag.
  429# Output: Authentication Tag (AUTH_TAG)
  430# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 431        movdqu AadHash(%arg2), %xmm8
 432        movdqu HashKey(%arg2), %xmm13
 433
 434        mov PBlockLen(%arg2), %r12
 435
 436        test %r12, %r12
 437        je _partial_done\@
 438
 439        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 440
 441_partial_done\@:
  442        mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 443        shl     $3, %r12                  # convert into number of bits
 444        movd    %r12d, %xmm15             # len(A) in %xmm15
 445        mov InLen(%arg2), %r12
  446        shl     $3, %r12                  # len(C) in bits (*8)
 447        movq    %r12, %xmm1
 448
 449        pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 450        pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 451        pxor    %xmm15, %xmm8
 452        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 453        # final GHASH computation
 454        movdqa SHUF_MASK(%rip), %xmm10
 455        pshufb %xmm10, %xmm8
 456
 457        movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 458        ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 459        pxor    %xmm8, %xmm0
 460_return_T_\@:
 461        mov     \AUTHTAG, %r10                     # %r10 = authTag
 462        mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 463        cmp     $16, %r11
 464        je      _T_16_\@
 465        cmp     $8, %r11
 466        jl      _T_4_\@
 467_T_8_\@:
 468        movq    %xmm0, %rax
 469        mov     %rax, (%r10)
 470        add     $8, %r10
 471        sub     $8, %r11
 472        psrldq  $8, %xmm0
 473        test    %r11, %r11
 474        je      _return_T_done_\@
 475_T_4_\@:
 476        movd    %xmm0, %eax
 477        mov     %eax, (%r10)
 478        add     $4, %r10
 479        sub     $4, %r11
 480        psrldq  $4, %xmm0
 481        test    %r11, %r11
 482        je      _return_T_done_\@
 483_T_123_\@:
 484        movd    %xmm0, %eax
 485        cmp     $2, %r11
 486        jl      _T_1_\@
 487        mov     %ax, (%r10)
 488        cmp     $2, %r11
 489        je      _return_T_done_\@
 490        add     $2, %r10
 491        sar     $16, %eax
 492_T_1_\@:
 493        mov     %al, (%r10)
 494        jmp     _return_T_done_\@
 495_T_16_\@:
 496        movdqu  %xmm0, (%r10)
 497_return_T_done_\@:
 498.endm
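
/*
 * The tag produced above is the standard GCM closing computation, roughly:
 *
 *	S = GHASH_H(AAD padded to 16B || C padded to 16B || len64(AAD) || len64(C))
 *	T = E(K, Y0) ^ S		// Y0 = OrigIV, the pre-counter block
 *
 * with T then written out truncated to auth_tag_len bytes through the
 * _T_16/_T_8/_T_4/_T_123 paths.
 */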
 499
 500#ifdef __x86_64__
 501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 502*
 503*
 504* Input: A and B (128-bits each, bit-reflected)
 505* Output: C = A*B*x mod poly, (i.e. >>1 )
 506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 508*
 509*/
 510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 511        movdqa    \GH, \TMP1
 512        pshufd    $78, \GH, \TMP2
 513        pshufd    $78, \HK, \TMP3
 514        pxor      \GH, \TMP2            # TMP2 = a1+a0
 515        pxor      \HK, \TMP3            # TMP3 = b1+b0
 516        pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 517        pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 518        pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 519        pxor      \GH, \TMP2
  520        pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 521        movdqa    \TMP2, \TMP3
 522        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 523        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 524        pxor      \TMP3, \GH
  525        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 526
 527        # first phase of the reduction
 528
 529        movdqa    \GH, \TMP2
 530        movdqa    \GH, \TMP3
 531        movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
  532                                        # in order to perform
 533                                        # independent shifts
  534        pslld     $31, \TMP2            # packed left shift <<31
  535        pslld     $30, \TMP3            # packed left shift <<30
  536        pslld     $25, \TMP4            # packed left shift <<25
 537        pxor      \TMP3, \TMP2          # xor the shifted versions
 538        pxor      \TMP4, \TMP2
 539        movdqa    \TMP2, \TMP5
 540        psrldq    $4, \TMP5             # right shift TMP5 1 DW
 541        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 542        pxor      \TMP2, \GH
 543
 544        # second phase of the reduction
 545
 546        movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
  547                                        # in order to perform
 548                                        # independent shifts
 549        movdqa    \GH,\TMP3
 550        movdqa    \GH,\TMP4
  551        psrld     $1,\TMP2              # packed right shift >>1
  552        psrld     $2,\TMP3              # packed right shift >>2
  553        psrld     $7,\TMP4              # packed right shift >>7
 554        pxor      \TMP3,\TMP2           # xor the shifted versions
 555        pxor      \TMP4,\TMP2
 556        pxor      \TMP5, \TMP2
 557        pxor      \TMP2, \GH
  558        pxor      \TMP1, \GH            # result is in GH
 559.endm
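
/*
 * The identity behind the three pclmulqdq instructions above, with the
 * 128-bit inputs split into 64-bit halves A = a1:a0 and B = b1:b0
 * (carry-less arithmetic, so addition and subtraction are both XOR):
 *
 *	A*B = a1*b1*x^128 + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 + a0*b0
 *
 * i.e. Karatsuba recovers the middle term from one extra multiply instead of
 * two, and the two shift/xor phases then reduce the 256-bit product modulo
 * the GHASH polynomial (written as (128,127,126,121,0) in the bit-reflected
 * form used here, per the header comment).
 */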
 560
 561# Reads DLEN bytes starting at DPTR and stores in XMMDst
 562# where 0 < DLEN < 16
 563# Clobbers %rax, DLEN and XMM1
 564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 565        cmp $8, \DLEN
 566        jl _read_lt8_\@
 567        mov (\DPTR), %rax
 568        movq %rax, \XMMDst
 569        sub $8, \DLEN
 570        jz _done_read_partial_block_\@
 571        xor %eax, %eax
 572_read_next_byte_\@:
 573        shl $8, %rax
 574        mov 7(\DPTR, \DLEN, 1), %al
 575        dec \DLEN
 576        jnz _read_next_byte_\@
 577        movq %rax, \XMM1
 578        pslldq $8, \XMM1
 579        por \XMM1, \XMMDst
 580        jmp _done_read_partial_block_\@
 581_read_lt8_\@:
 582        xor %eax, %eax
 583_read_next_byte_lt8_\@:
 584        shl $8, %rax
 585        mov -1(\DPTR, \DLEN, 1), %al
 586        dec \DLEN
 587        jnz _read_next_byte_lt8_\@
 588        movq %rax, \XMMDst
 589_done_read_partial_block_\@:
 590.endm
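
/*
 * Functionally this is equivalent to the following C sketch (illustrative
 * only), which like the macro never touches bytes past DPTR[DLEN-1]:
 *
 *	static inline void read_partial_block(const u8 *dptr, unsigned int dlen,
 *					      u8 dst[16])
 *	{
 *		memset(dst, 0, 16);		// zero the destination block
 *		memcpy(dst, dptr, dlen);	// 0 < dlen < 16
 *	}
 *
 * The assembly builds the same value with one 8-byte load (when DLEN >= 8)
 * plus single-byte loads, so no access can run past the end of the buffer.
 */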
 591
 592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 593# clobbers r10-11, xmm14
 594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 595        TMP6 TMP7
 596        MOVADQ     SHUF_MASK(%rip), %xmm14
 597        mov        \AAD, %r10           # %r10 = AAD
 598        mov        \AADLEN, %r11                # %r11 = aadLen
 599        pxor       \TMP7, \TMP7
 600        pxor       \TMP6, \TMP6
 601
 602        cmp        $16, %r11
 603        jl         _get_AAD_rest\@
 604_get_AAD_blocks\@:
 605        movdqu     (%r10), \TMP7
 606        pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 607        pxor       \TMP7, \TMP6
 608        GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 609        add        $16, %r10
 610        sub        $16, %r11
 611        cmp        $16, %r11
 612        jge        _get_AAD_blocks\@
 613
 614        movdqu     \TMP6, \TMP7
 615
 616        /* read the last <16B of AAD */
 617_get_AAD_rest\@:
 618        test       %r11, %r11
 619        je         _get_AAD_done\@
 620
 621        READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 622        pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 623        pxor       \TMP6, \TMP7
 624        GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 625        movdqu \TMP7, \TMP6
 626
 627_get_AAD_done\@:
 628        movdqu \TMP6, AadHash(%arg2)
 629.endm
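
/*
 * A C-like sketch of the AAD absorption performed above (ghash_mul(),
 * byte_reflect() and read_partial() are illustrative stand-ins for the
 * macros in this file):
 *
 *	u128 hash = 0;
 *	while (aad_len >= 16) {
 *		hash = ghash_mul(hash ^ byte_reflect(load16(aad)), hash_key);
 *		aad += 16;
 *		aad_len -= 16;
 *	}
 *	if (aad_len)			// trailing <16 bytes, zero padded
 *		hash = ghash_mul(hash ^ byte_reflect(read_partial(aad, aad_len)),
 *				 hash_key);
 *	ctx->aad_hash = hash;
 */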
 630
  631# PARTIAL_BLOCK: Handles the encryption/decryption and GHASH of partial blocks
  632# carried over between update calls.
  633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
  634# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
 635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
 636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 637        AAD_HASH operation
 638        mov     PBlockLen(%arg2), %r13
 639        test    %r13, %r13
 640        je      _partial_block_done_\@  # Leave Macro if no partial blocks
 641        # Read in input data without over reading
 642        cmp     $16, \PLAIN_CYPH_LEN
 643        jl      _fewer_than_16_bytes_\@
 644        movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 645        jmp     _data_read_\@
 646
 647_fewer_than_16_bytes_\@:
 648        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 649        mov     \PLAIN_CYPH_LEN, %r12
 650        READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 651
 652        mov PBlockLen(%arg2), %r13
 653
 654_data_read_\@:                          # Finished reading in data
 655
 656        movdqu  PBlockEncKey(%arg2), %xmm9
 657        movdqu  HashKey(%arg2), %xmm13
 658
 659        lea     SHIFT_MASK(%rip), %r12
 660
  661        # adjust the shuffle mask pointer to be able to shift r13 bytes
  662        # (r13 is the number of bytes already filled in the partial block)
 663        add     %r13, %r12
 664        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 665        pshufb  %xmm2, %xmm9            # shift right r13 bytes
 666
 667.ifc \operation, dec
 668        movdqa  %xmm1, %xmm3
 669        pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 670
 671        mov     \PLAIN_CYPH_LEN, %r10
 672        add     %r13, %r10
  673        # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
 674        sub     $16, %r10
  675        # Determine if the partial block is not being filled and
 676        # shift mask accordingly
 677        jge     _no_extra_mask_1_\@
 678        sub     %r10, %r12
 679_no_extra_mask_1_\@:
 680
 681        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 682        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 683        pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
 684
 685        pand    %xmm1, %xmm3
 686        movdqa  SHUF_MASK(%rip), %xmm10
 687        pshufb  %xmm10, %xmm3
 688        pshufb  %xmm2, %xmm3
 689        pxor    %xmm3, \AAD_HASH
 690
 691        test    %r10, %r10
 692        jl      _partial_incomplete_1_\@
 693
 694        # GHASH computation for the last <16 Byte block
 695        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 696        xor     %eax, %eax
 697
 698        mov     %rax, PBlockLen(%arg2)
 699        jmp     _dec_done_\@
 700_partial_incomplete_1_\@:
 701        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 702_dec_done_\@:
 703        movdqu  \AAD_HASH, AadHash(%arg2)
 704.else
 705        pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 706
 707        mov     \PLAIN_CYPH_LEN, %r10
 708        add     %r13, %r10
  709        # Set r10 to be the amount of data left in PLAIN_CYPH_IN after filling
 710        sub     $16, %r10
  711        # Determine if the partial block is not being filled and
 712        # shift mask accordingly
 713        jge     _no_extra_mask_2_\@
 714        sub     %r10, %r12
 715_no_extra_mask_2_\@:
 716
 717        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 718        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 719        pand    %xmm1, %xmm9
 720
 721        movdqa  SHUF_MASK(%rip), %xmm1
 722        pshufb  %xmm1, %xmm9
 723        pshufb  %xmm2, %xmm9
 724        pxor    %xmm9, \AAD_HASH
 725
 726        test    %r10, %r10
 727        jl      _partial_incomplete_2_\@
 728
 729        # GHASH computation for the last <16 Byte block
 730        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 731        xor     %eax, %eax
 732
 733        mov     %rax, PBlockLen(%arg2)
 734        jmp     _encode_done_\@
 735_partial_incomplete_2_\@:
 736        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 737_encode_done_\@:
 738        movdqu  \AAD_HASH, AadHash(%arg2)
 739
 740        movdqa  SHUF_MASK(%rip), %xmm10
 741        # shuffle xmm9 back to output as ciphertext
 742        pshufb  %xmm10, %xmm9
 743        pshufb  %xmm2, %xmm9
 744.endif
 745        # output encrypted Bytes
 746        test    %r10, %r10
 747        jl      _partial_fill_\@
 748        mov     %r13, %r12
 749        mov     $16, %r13
 750        # Set r13 to be the number of bytes to write out
 751        sub     %r12, %r13
 752        jmp     _count_set_\@
 753_partial_fill_\@:
 754        mov     \PLAIN_CYPH_LEN, %r13
 755_count_set_\@:
 756        movdqa  %xmm9, %xmm0
 757        movq    %xmm0, %rax
 758        cmp     $8, %r13
 759        jle     _less_than_8_bytes_left_\@
 760
 761        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 762        add     $8, \DATA_OFFSET
 763        psrldq  $8, %xmm0
 764        movq    %xmm0, %rax
 765        sub     $8, %r13
 766_less_than_8_bytes_left_\@:
 767        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 768        add     $1, \DATA_OFFSET
 769        shr     $8, %rax
 770        sub     $1, %r13
 771        jne     _less_than_8_bytes_left_\@
 772_partial_block_done_\@:
 773.endm # PARTIAL_BLOCK
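
/*
 * The idea of PARTIAL_BLOCK, in rough pseudocode (field names mirror the
 * offset #defines; this is an outline, not a line-for-line translation):
 *
 *	if (ctx->partial_block_len) {
 *		n = min(16 - ctx->partial_block_len, len);
 *		// XOR the new bytes against the unused tail of the keystream
 *		// block E(K, Yn) saved in PBlockEncKey by the previous call
 *		out[0..n) = in[0..n) ^ keystream[ctx->partial_block_len..);
 *		ctx->partial_block_len = (ctx->partial_block_len + n) % 16;
 *		if (ctx->partial_block_len == 0)
 *			ghash_update(ctx);	// block completed: fold it in
 *	}
 */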
 774
 775/*
 776* if a = number of total plaintext bytes
 777* b = floor(a/16)
 778* num_initial_blocks = b mod 4
 779* encrypt the initial num_initial_blocks blocks and apply ghash on
 780* the ciphertext
 781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 782* are clobbered
 783* arg1, %arg2, %arg3 are used as a pointer only, not modified
 784*/
 785
 786
 787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 788        XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 789        MOVADQ          SHUF_MASK(%rip), %xmm14
 790
  791        movdqu AadHash(%arg2), %xmm\i               # load the current hash value
 792
 793        # start AES for num_initial_blocks blocks
 794
 795        movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 796
 797.if (\i == 5) || (\i == 6) || (\i == 7)
 798
 799        MOVADQ          ONE(%RIP),\TMP1
 800        MOVADQ          0(%arg1),\TMP2
 801.irpc index, \i_seq
 802        paddd           \TMP1, \XMM0                 # INCR Y0
 803.ifc \operation, dec
 804        movdqa     \XMM0, %xmm\index
 805.else
 806        MOVADQ          \XMM0, %xmm\index
 807.endif
 808        pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 809        pxor            \TMP2, %xmm\index
 810.endr
 811        lea     0x10(%arg1),%r10
 812        mov     keysize,%eax
 813        shr     $2,%eax                         # 128->4, 192->6, 256->8
 814        add     $5,%eax                       # 128->9, 192->11, 256->13
 815
 816aes_loop_initial_\@:
 817        MOVADQ  (%r10),\TMP1
 818.irpc   index, \i_seq
 819        aesenc  \TMP1, %xmm\index
 820.endr
 821        add     $16,%r10
 822        sub     $1,%eax
 823        jnz     aes_loop_initial_\@
 824
 825        MOVADQ  (%r10), \TMP1
 826.irpc index, \i_seq
 827        aesenclast \TMP1, %xmm\index         # Last Round
 828.endr
 829.irpc index, \i_seq
 830        movdqu     (%arg4 , %r11, 1), \TMP1
 831        pxor       \TMP1, %xmm\index
 832        movdqu     %xmm\index, (%arg3 , %r11, 1)
 833        # write back plaintext/ciphertext for num_initial_blocks
 834        add        $16, %r11
 835
 836.ifc \operation, dec
 837        movdqa     \TMP1, %xmm\index
 838.endif
 839        pshufb     %xmm14, %xmm\index
 840
 841                # prepare plaintext/ciphertext for GHASH computation
 842.endr
 843.endif
 844
 845        # apply GHASH on num_initial_blocks blocks
 846
 847.if \i == 5
 848        pxor       %xmm5, %xmm6
 849        GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 850        pxor       %xmm6, %xmm7
 851        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 852        pxor       %xmm7, %xmm8
 853        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854.elseif \i == 6
 855        pxor       %xmm6, %xmm7
 856        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857        pxor       %xmm7, %xmm8
 858        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859.elseif \i == 7
 860        pxor       %xmm7, %xmm8
 861        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862.endif
 863        cmp        $64, %r13
 864        jl      _initial_blocks_done\@
 865        # no need for precomputed values
 866/*
 867*
 868* Precomputations for HashKey parallel with encryption of first 4 blocks.
  869* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 870*/
 871        MOVADQ     ONE(%RIP),\TMP1
 872        paddd      \TMP1, \XMM0              # INCR Y0
 873        MOVADQ     \XMM0, \XMM1
 874        pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 875
 876        paddd      \TMP1, \XMM0              # INCR Y0
 877        MOVADQ     \XMM0, \XMM2
 878        pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 879
 880        paddd      \TMP1, \XMM0              # INCR Y0
 881        MOVADQ     \XMM0, \XMM3
 882        pshufb %xmm14, \XMM3        # perform a 16 byte swap
 883
 884        paddd      \TMP1, \XMM0              # INCR Y0
 885        MOVADQ     \XMM0, \XMM4
 886        pshufb %xmm14, \XMM4        # perform a 16 byte swap
 887
 888        MOVADQ     0(%arg1),\TMP1
 889        pxor       \TMP1, \XMM1
 890        pxor       \TMP1, \XMM2
 891        pxor       \TMP1, \XMM3
 892        pxor       \TMP1, \XMM4
 893.irpc index, 1234 # do 4 rounds
 894        movaps 0x10*\index(%arg1), \TMP1
 895        aesenc     \TMP1, \XMM1
 896        aesenc     \TMP1, \XMM2
 897        aesenc     \TMP1, \XMM3
 898        aesenc     \TMP1, \XMM4
 899.endr
 900.irpc index, 56789 # do next 5 rounds
 901        movaps 0x10*\index(%arg1), \TMP1
 902        aesenc     \TMP1, \XMM1
 903        aesenc     \TMP1, \XMM2
 904        aesenc     \TMP1, \XMM3
 905        aesenc     \TMP1, \XMM4
 906.endr
 907        lea        0xa0(%arg1),%r10
 908        mov        keysize,%eax
 909        shr        $2,%eax                      # 128->4, 192->6, 256->8
 910        sub        $4,%eax                      # 128->0, 192->2, 256->4
 911        jz         aes_loop_pre_done\@
 912
 913aes_loop_pre_\@:
 914        MOVADQ     (%r10),\TMP2
 915.irpc   index, 1234
 916        aesenc     \TMP2, %xmm\index
 917.endr
 918        add        $16,%r10
 919        sub        $1,%eax
 920        jnz        aes_loop_pre_\@
 921
 922aes_loop_pre_done\@:
 923        MOVADQ     (%r10), \TMP2
 924        aesenclast \TMP2, \XMM1
 925        aesenclast \TMP2, \XMM2
 926        aesenclast \TMP2, \XMM3
 927        aesenclast \TMP2, \XMM4
 928        movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 929        pxor       \TMP1, \XMM1
 930.ifc \operation, dec
 931        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 932        movdqa     \TMP1, \XMM1
 933.endif
 934        movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 935        pxor       \TMP1, \XMM2
 936.ifc \operation, dec
 937        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 938        movdqa     \TMP1, \XMM2
 939.endif
 940        movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 941        pxor       \TMP1, \XMM3
 942.ifc \operation, dec
 943        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 944        movdqa     \TMP1, \XMM3
 945.endif
 946        movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 947        pxor       \TMP1, \XMM4
 948.ifc \operation, dec
 949        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 950        movdqa     \TMP1, \XMM4
 951.else
 952        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 953        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 954        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 955        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 956.endif
 957
 958        add        $64, %r11
 959        pshufb %xmm14, \XMM1 # perform a 16 byte swap
 960        pxor       \XMMDst, \XMM1
 961# combine GHASHed value with the corresponding ciphertext
 962        pshufb %xmm14, \XMM2 # perform a 16 byte swap
 963        pshufb %xmm14, \XMM3 # perform a 16 byte swap
 964        pshufb %xmm14, \XMM4 # perform a 16 byte swap
 965
 966_initial_blocks_done\@:
 967
 968.endm
 969
 970/*
 971* encrypt 4 blocks at a time
 972* ghash the 4 previously encrypted ciphertext blocks
 973* arg1, %arg3, %arg4 are used as pointers only, not modified
 974* %r11 is the data offset value
 975*/
 976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 978
 979        movdqa    \XMM1, \XMM5
 980        movdqa    \XMM2, \XMM6
 981        movdqa    \XMM3, \XMM7
 982        movdqa    \XMM4, \XMM8
 983
 984        movdqa    SHUF_MASK(%rip), %xmm15
 985        # multiply TMP5 * HashKey using karatsuba
 986
 987        movdqa    \XMM5, \TMP4
 988        pshufd    $78, \XMM5, \TMP6
 989        pxor      \XMM5, \TMP6
 990        paddd     ONE(%rip), \XMM0              # INCR CNT
 991        movdqu    HashKey_4(%arg2), \TMP5
 992        pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 993        movdqa    \XMM0, \XMM1
 994        paddd     ONE(%rip), \XMM0              # INCR CNT
 995        movdqa    \XMM0, \XMM2
 996        paddd     ONE(%rip), \XMM0              # INCR CNT
 997        movdqa    \XMM0, \XMM3
 998        paddd     ONE(%rip), \XMM0              # INCR CNT
 999        movdqa    \XMM0, \XMM4
1000        pshufb %xmm15, \XMM1    # perform a 16 byte swap
1001        pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1003        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1004        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1005
1006        pxor      (%arg1), \XMM1
1007        pxor      (%arg1), \XMM2
1008        pxor      (%arg1), \XMM3
1009        pxor      (%arg1), \XMM4
1010        movdqu    HashKey_4_k(%arg2), \TMP5
1011        pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012        movaps 0x10(%arg1), \TMP1
1013        aesenc    \TMP1, \XMM1              # Round 1
1014        aesenc    \TMP1, \XMM2
1015        aesenc    \TMP1, \XMM3
1016        aesenc    \TMP1, \XMM4
1017        movaps 0x20(%arg1), \TMP1
1018        aesenc    \TMP1, \XMM1              # Round 2
1019        aesenc    \TMP1, \XMM2
1020        aesenc    \TMP1, \XMM3
1021        aesenc    \TMP1, \XMM4
1022        movdqa    \XMM6, \TMP1
1023        pshufd    $78, \XMM6, \TMP2
1024        pxor      \XMM6, \TMP2
1025        movdqu    HashKey_3(%arg2), \TMP5
1026        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027        movaps 0x30(%arg1), \TMP3
1028        aesenc    \TMP3, \XMM1              # Round 3
1029        aesenc    \TMP3, \XMM2
1030        aesenc    \TMP3, \XMM3
1031        aesenc    \TMP3, \XMM4
1032        pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033        movaps 0x40(%arg1), \TMP3
1034        aesenc    \TMP3, \XMM1              # Round 4
1035        aesenc    \TMP3, \XMM2
1036        aesenc    \TMP3, \XMM3
1037        aesenc    \TMP3, \XMM4
1038        movdqu    HashKey_3_k(%arg2), \TMP5
1039        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040        movaps 0x50(%arg1), \TMP3
1041        aesenc    \TMP3, \XMM1              # Round 5
1042        aesenc    \TMP3, \XMM2
1043        aesenc    \TMP3, \XMM3
1044        aesenc    \TMP3, \XMM4
1045        pxor      \TMP1, \TMP4
1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047        pxor      \XMM6, \XMM5
1048        pxor      \TMP2, \TMP6
1049        movdqa    \XMM7, \TMP1
1050        pshufd    $78, \XMM7, \TMP2
1051        pxor      \XMM7, \TMP2
1052        movdqu    HashKey_2(%arg2), \TMP5
1053
1054        # Multiply TMP5 * HashKey using karatsuba
1055
1056        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057        movaps 0x60(%arg1), \TMP3
1058        aesenc    \TMP3, \XMM1              # Round 6
1059        aesenc    \TMP3, \XMM2
1060        aesenc    \TMP3, \XMM3
1061        aesenc    \TMP3, \XMM4
1062        pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063        movaps 0x70(%arg1), \TMP3
1064        aesenc    \TMP3, \XMM1              # Round 7
1065        aesenc    \TMP3, \XMM2
1066        aesenc    \TMP3, \XMM3
1067        aesenc    \TMP3, \XMM4
1068        movdqu    HashKey_2_k(%arg2), \TMP5
1069        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070        movaps 0x80(%arg1), \TMP3
1071        aesenc    \TMP3, \XMM1              # Round 8
1072        aesenc    \TMP3, \XMM2
1073        aesenc    \TMP3, \XMM3
1074        aesenc    \TMP3, \XMM4
1075        pxor      \TMP1, \TMP4
1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077        pxor      \XMM7, \XMM5
1078        pxor      \TMP2, \TMP6
1079
1080        # Multiply XMM8 * HashKey
1081        # XMM8 and TMP5 hold the values for the two operands
1082
1083        movdqa    \XMM8, \TMP1
1084        pshufd    $78, \XMM8, \TMP2
1085        pxor      \XMM8, \TMP2
1086        movdqu    HashKey(%arg2), \TMP5
1087        pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088        movaps 0x90(%arg1), \TMP3
1089        aesenc    \TMP3, \XMM1             # Round 9
1090        aesenc    \TMP3, \XMM2
1091        aesenc    \TMP3, \XMM3
1092        aesenc    \TMP3, \XMM4
1093        pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094        lea       0xa0(%arg1),%r10
1095        mov       keysize,%eax
1096        shr       $2,%eax                       # 128->4, 192->6, 256->8
1097        sub       $4,%eax                       # 128->0, 192->2, 256->4
1098        jz        aes_loop_par_enc_done\@
1099
1100aes_loop_par_enc\@:
1101        MOVADQ    (%r10),\TMP3
1102.irpc   index, 1234
1103        aesenc    \TMP3, %xmm\index
1104.endr
1105        add       $16,%r10
1106        sub       $1,%eax
1107        jnz       aes_loop_par_enc\@
1108
1109aes_loop_par_enc_done\@:
1110        MOVADQ    (%r10), \TMP3
1111        aesenclast \TMP3, \XMM1           # Round 10
1112        aesenclast \TMP3, \XMM2
1113        aesenclast \TMP3, \XMM3
1114        aesenclast \TMP3, \XMM4
1115        movdqu    HashKey_k(%arg2), \TMP5
1116        pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117        movdqu    (%arg4,%r11,1), \TMP3
1118        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119        movdqu    16(%arg4,%r11,1), \TMP3
1120        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121        movdqu    32(%arg4,%r11,1), \TMP3
1122        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123        movdqu    48(%arg4,%r11,1), \TMP3
1124        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129        pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1131        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1132        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1133
1134        pxor      \TMP4, \TMP1
1135        pxor      \XMM8, \XMM5
1136        pxor      \TMP6, \TMP2
1137        pxor      \TMP1, \TMP2
1138        pxor      \XMM5, \TMP2
1139        movdqa    \TMP2, \TMP3
1140        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1141        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1142        pxor      \TMP3, \XMM5
1143        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1144
1145        # first phase of reduction
1146
1147        movdqa    \XMM5, \TMP2
1148        movdqa    \XMM5, \TMP3
1149        movdqa    \XMM5, \TMP4
1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1151        pslld     $31, \TMP2                   # packed left shift << 31
 1152        pslld     $30, \TMP3                   # packed left shift << 30
 1153        pslld     $25, \TMP4                   # packed left shift << 25
1154        pxor      \TMP3, \TMP2                 # xor the shifted versions
1155        pxor      \TMP4, \TMP2
1156        movdqa    \TMP2, \TMP5
1157        psrldq    $4, \TMP5                    # right shift T5 1 DW
1158        pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159        pxor      \TMP2, \XMM5
1160
1161        # second phase of reduction
1162
1163        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164        movdqa    \XMM5,\TMP3
1165        movdqa    \XMM5,\TMP4
 1166        psrld     $1, \TMP2                    # packed right shift >>1
 1167        psrld     $2, \TMP3                    # packed right shift >>2
 1168        psrld     $7, \TMP4                    # packed right shift >>7
1169        pxor      \TMP3,\TMP2                  # xor the shifted versions
1170        pxor      \TMP4,\TMP2
1171        pxor      \TMP5, \TMP2
1172        pxor      \TMP2, \XMM5
 1173        pxor      \TMP1, \XMM5                 # result is in XMM5
1174
1175        pxor      \XMM5, \XMM1
1176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
1181* arg1, %arg3, %arg4 are used as pointers only, not modified
1182* %r11 is the data offset value
1183*/
1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187        movdqa    \XMM1, \XMM5
1188        movdqa    \XMM2, \XMM6
1189        movdqa    \XMM3, \XMM7
1190        movdqa    \XMM4, \XMM8
1191
1192        movdqa    SHUF_MASK(%rip), %xmm15
1193        # multiply TMP5 * HashKey using karatsuba
1194
1195        movdqa    \XMM5, \TMP4
1196        pshufd    $78, \XMM5, \TMP6
1197        pxor      \XMM5, \TMP6
1198        paddd     ONE(%rip), \XMM0              # INCR CNT
1199        movdqu    HashKey_4(%arg2), \TMP5
1200        pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201        movdqa    \XMM0, \XMM1
1202        paddd     ONE(%rip), \XMM0              # INCR CNT
1203        movdqa    \XMM0, \XMM2
1204        paddd     ONE(%rip), \XMM0              # INCR CNT
1205        movdqa    \XMM0, \XMM3
1206        paddd     ONE(%rip), \XMM0              # INCR CNT
1207        movdqa    \XMM0, \XMM4
1208        pshufb %xmm15, \XMM1    # perform a 16 byte swap
1209        pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1211        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1212        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1213
1214        pxor      (%arg1), \XMM1
1215        pxor      (%arg1), \XMM2
1216        pxor      (%arg1), \XMM3
1217        pxor      (%arg1), \XMM4
1218        movdqu    HashKey_4_k(%arg2), \TMP5
1219        pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220        movaps 0x10(%arg1), \TMP1
1221        aesenc    \TMP1, \XMM1              # Round 1
1222        aesenc    \TMP1, \XMM2
1223        aesenc    \TMP1, \XMM3
1224        aesenc    \TMP1, \XMM4
1225        movaps 0x20(%arg1), \TMP1
1226        aesenc    \TMP1, \XMM1              # Round 2
1227        aesenc    \TMP1, \XMM2
1228        aesenc    \TMP1, \XMM3
1229        aesenc    \TMP1, \XMM4
1230        movdqa    \XMM6, \TMP1
1231        pshufd    $78, \XMM6, \TMP2
1232        pxor      \XMM6, \TMP2
1233        movdqu    HashKey_3(%arg2), \TMP5
1234        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235        movaps 0x30(%arg1), \TMP3
1236        aesenc    \TMP3, \XMM1              # Round 3
1237        aesenc    \TMP3, \XMM2
1238        aesenc    \TMP3, \XMM3
1239        aesenc    \TMP3, \XMM4
1240        pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241        movaps 0x40(%arg1), \TMP3
1242        aesenc    \TMP3, \XMM1              # Round 4
1243        aesenc    \TMP3, \XMM2
1244        aesenc    \TMP3, \XMM3
1245        aesenc    \TMP3, \XMM4
1246        movdqu    HashKey_3_k(%arg2), \TMP5
1247        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248        movaps 0x50(%arg1), \TMP3
1249        aesenc    \TMP3, \XMM1              # Round 5
1250        aesenc    \TMP3, \XMM2
1251        aesenc    \TMP3, \XMM3
1252        aesenc    \TMP3, \XMM4
1253        pxor      \TMP1, \TMP4
1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255        pxor      \XMM6, \XMM5
1256        pxor      \TMP2, \TMP6
1257        movdqa    \XMM7, \TMP1
1258        pshufd    $78, \XMM7, \TMP2
1259        pxor      \XMM7, \TMP2
1260        movdqu    HashKey_2(%arg2), \TMP5
1261
1262        # Multiply TMP5 * HashKey using karatsuba
1263
1264        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265        movaps 0x60(%arg1), \TMP3
1266        aesenc    \TMP3, \XMM1              # Round 6
1267        aesenc    \TMP3, \XMM2
1268        aesenc    \TMP3, \XMM3
1269        aesenc    \TMP3, \XMM4
1270        pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271        movaps 0x70(%arg1), \TMP3
1272        aesenc    \TMP3, \XMM1              # Round 7
1273        aesenc    \TMP3, \XMM2
1274        aesenc    \TMP3, \XMM3
1275        aesenc    \TMP3, \XMM4
1276        movdqu    HashKey_2_k(%arg2), \TMP5
1277        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278        movaps 0x80(%arg1), \TMP3
1279        aesenc    \TMP3, \XMM1              # Round 8
1280        aesenc    \TMP3, \XMM2
1281        aesenc    \TMP3, \XMM3
1282        aesenc    \TMP3, \XMM4
1283        pxor      \TMP1, \TMP4
1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285        pxor      \XMM7, \XMM5
1286        pxor      \TMP2, \TMP6
1287
1288        # Multiply XMM8 * HashKey
1289        # XMM8 and TMP5 hold the values for the two operands
1290
1291        movdqa    \XMM8, \TMP1
1292        pshufd    $78, \XMM8, \TMP2
1293        pxor      \XMM8, \TMP2
1294        movdqu    HashKey(%arg2), \TMP5
1295        pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296        movaps 0x90(%arg1), \TMP3
1297        aesenc    \TMP3, \XMM1             # Round 9
1298        aesenc    \TMP3, \XMM2
1299        aesenc    \TMP3, \XMM3
1300        aesenc    \TMP3, \XMM4
1301        pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302        lea       0xa0(%arg1),%r10
1303        mov       keysize,%eax
1304        shr       $2,%eax                       # 128->4, 192->6, 256->8
1305        sub       $4,%eax                       # 128->0, 192->2, 256->4
1306        jz        aes_loop_par_dec_done\@
1307
1308aes_loop_par_dec\@:
1309        MOVADQ    (%r10),\TMP3
1310.irpc   index, 1234
1311        aesenc    \TMP3, %xmm\index
1312.endr
1313        add       $16,%r10
1314        sub       $1,%eax
1315        jnz       aes_loop_par_dec\@
1316
1317aes_loop_par_dec_done\@:
1318        MOVADQ    (%r10), \TMP3
1319        aesenclast \TMP3, \XMM1           # last round
1320        aesenclast \TMP3, \XMM2
1321        aesenclast \TMP3, \XMM3
1322        aesenclast \TMP3, \XMM4
1323        movdqu    HashKey_k(%arg2), \TMP5
1324        pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325        movdqu    (%arg4,%r11,1), \TMP3
1326        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327        movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328        movdqa    \TMP3, \XMM1
1329        movdqu    16(%arg4,%r11,1), \TMP3
1330        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332        movdqa    \TMP3, \XMM2
1333        movdqu    32(%arg4,%r11,1), \TMP3
1334        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336        movdqa    \TMP3, \XMM3
1337        movdqu    48(%arg4,%r11,1), \TMP3
1338        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340        movdqa    \TMP3, \XMM4
1341        pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1343        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1344        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1345
1346        pxor      \TMP4, \TMP1
1347        pxor      \XMM8, \XMM5
1348        pxor      \TMP6, \TMP2
1349        pxor      \TMP1, \TMP2
1350        pxor      \XMM5, \TMP2
1351        movdqa    \TMP2, \TMP3
1352        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1353        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1354        pxor      \TMP3, \XMM5
1355        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1356
1357        # first phase of reduction
1358
1359        movdqa    \XMM5, \TMP2
1360        movdqa    \XMM5, \TMP3
1361        movdqa    \XMM5, \TMP4
1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
 1363        pslld     $31, \TMP2                   # packed left shift << 31
 1364        pslld     $30, \TMP3                   # packed left shift << 30
 1365        pslld     $25, \TMP4                   # packed left shift << 25
1366        pxor      \TMP3, \TMP2                 # xor the shifted versions
1367        pxor      \TMP4, \TMP2
1368        movdqa    \TMP2, \TMP5
1369        psrldq    $4, \TMP5                    # right shift T5 1 DW
1370        pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371        pxor      \TMP2, \XMM5
1372
1373        # second phase of reduction
1374
1375        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376        movdqa    \XMM5,\TMP3
1377        movdqa    \XMM5,\TMP4
 1378        psrld     $1, \TMP2                    # packed right shift >>1
 1379        psrld     $2, \TMP3                    # packed right shift >>2
 1380        psrld     $7, \TMP4                    # packed right shift >>7
1381        pxor      \TMP3,\TMP2                  # xor the shifted versions
1382        pxor      \TMP4,\TMP2
1383        pxor      \TMP5, \TMP2
1384        pxor      \TMP2, \XMM5
 1385        pxor      \TMP1, \XMM5                 # result is in XMM5
1386
1387        pxor      \XMM5, \XMM1
1388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394        # Multiply TMP6 * HashKey (using Karatsuba)
1395
1396        movdqa    \XMM1, \TMP6
1397        pshufd    $78, \XMM1, \TMP2
1398        pxor      \XMM1, \TMP2
1399        movdqu    HashKey_4(%arg2), \TMP5
1400        pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401        pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402        movdqu    HashKey_4_k(%arg2), \TMP4
1403        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404        movdqa    \XMM1, \XMMDst
1405        movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407        # Multiply TMP1 * HashKey (using Karatsuba)
1408
1409        movdqa    \XMM2, \TMP1
1410        pshufd    $78, \XMM2, \TMP2
1411        pxor      \XMM2, \TMP2
1412        movdqu    HashKey_3(%arg2), \TMP5
1413        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414        pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415        movdqu    HashKey_3_k(%arg2), \TMP4
1416        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417        pxor      \TMP1, \TMP6
1418        pxor      \XMM2, \XMMDst
1419        pxor      \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
1422        # Multiply TMP1 * HashKey (using Karatsuba)
1423
1424        movdqa    \XMM3, \TMP1
1425        pshufd    $78, \XMM3, \TMP2
1426        pxor      \XMM3, \TMP2
1427        movdqu    HashKey_2(%arg2), \TMP5
1428        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429        pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430        movdqu    HashKey_2_k(%arg2), \TMP4
1431        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432        pxor      \TMP1, \TMP6
1433        pxor      \XMM3, \XMMDst
1434        pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436        # Multiply TMP1 * HashKey (using Karatsuba)
1437        movdqa    \XMM4, \TMP1
1438        pshufd    $78, \XMM4, \TMP2
1439        pxor      \XMM4, \TMP2
1440        movdqu    HashKey(%arg2), \TMP5
1441        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1442        pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443        movdqu    HashKey_k(%arg2), \TMP4
1444        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445        pxor      \TMP1, \TMP6
1446        pxor      \XMM4, \XMMDst
1447        pxor      \XMM1, \TMP2
1448        pxor      \TMP6, \TMP2
1449        pxor      \XMMDst, \TMP2
1450        # middle section of the temp results combined as in the Karatsuba algorithm
1451        movdqa    \TMP2, \TMP4
1452        pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1453        psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1454        pxor      \TMP4, \XMMDst
1455        pxor      \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457        # first phase of the reduction
1458        movdqa    \XMMDst, \TMP2
1459        movdqa    \XMMDst, \TMP3
1460        movdqa    \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462        pslld     $31, \TMP2                # packed left shifting << 31
1463        pslld     $30, \TMP3                # packed left shifting << 30
1464        pslld     $25, \TMP4                # packed left shifting << 25
1465        pxor      \TMP3, \TMP2              # xor the shifted versions
1466        pxor      \TMP4, \TMP2
1467        movdqa    \TMP2, \TMP7
1468        psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469        pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470        pxor      \TMP2, \XMMDst
1471
1472        # second phase of the reduction
1473        movdqa    \XMMDst, \TMP2
1474        # make 3 copies of XMMDst for doing 3 shift operations
1475        movdqa    \XMMDst, \TMP3
1476        movdqa    \XMMDst, \TMP4
1477        psrld     $1, \TMP2                 # packed right shift >> 1
1478        psrld     $2, \TMP3                 # packed right shift >> 2
1479        psrld     $7, \TMP4                 # packed right shift >> 7
1480        pxor      \TMP3, \TMP2              # xor the shifted versions
1481        pxor      \TMP4, \TMP2
1482        pxor      \TMP7, \TMP2
1483        pxor      \TMP2, \XMMDst
1484        pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485.endm
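/*
 * A worked note on the reduction constants used above (this macro and the
 * per-block GHASH macro share them): with the field polynomial documented in
 * the function headers below, poly = x^128 + x^127 + x^126 + x^121 + 1, the
 * three non-trivial terms sit 128-127 = 1, 128-126 = 2 and 128-121 = 7 bits
 * below the top. SSE has no 128-bit wide bit shift, so each term is handled
 * as a pair of 32-bit lane shifts: pslld by 32-1 = 31, 32-2 = 30 and
 * 32-7 = 25 in the first phase, and psrld by 1, 2 and 7 in the second phase.
 */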
1486
1487
1488/* Encryption of a single block
1489* uses eax & r10
1490*/
1491
1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494        pxor            (%arg1), \XMM0
1495        mov             keysize,%eax
1496        shr             $2,%eax                 # 128->4, 192->6, 256->8
1497        add             $5,%eax                 # 128->9, 192->11, 256->13
1498        lea             16(%arg1), %r10   # get first expanded key address
1499
1500_esb_loop_\@:
1501        MOVADQ          (%r10),\TMP1
1502        aesenc          \TMP1,\XMM0
1503        add             $16,%r10
1504        sub             $1,%eax
1505        jnz             _esb_loop_\@
1506
1507        MOVADQ          (%r10),\TMP1
1508        aesenclast      \TMP1,\XMM0
1509.endm
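/*
 * The loop count above is derived from the key length in bytes stored by
 * aesni_set_key: keysize/4 + 5 aesenc rounds followed by one aesenclast,
 * i.e. 10, 12 or 14 rounds in total. A minimal C sketch of the same
 * arithmetic (the helper name is illustrative only):
 *
 *      static int aesni_full_rounds(unsigned int key_length)
 *      {
 *              // 16 -> 9, 24 -> 11, 32 -> 13; the final aesenclast makes
 *              // it 10, 12 or 14 AES rounds
 *              return key_length / 4 + 5;
 *      }
 */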
1510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512*                   struct gcm_context_data *data
1513*                                      // Context data
1514*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1515*                   const u8 *in,      // Ciphertext input
1516*                   u64 plaintext_len, // Length of data in bytes for decryption.
1517*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521*                   const u8 *aad,     // Additional Authentication Data (AAD)
1522*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524*                                      // given authentication tag and only return the plaintext if they match.
1525*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526*                                      // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531*       keys are pre-expanded and aligned to 16 bytes. We are using the first
1532*       set of 11/13/15 round keys (for AES-128/192/256) in the data structure void *aes_ctx
1533*
1534* iv:
1535*       0                   1                   2                   3
1536*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538*       |                             Salt  (From the SA)               |
1539*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540*       |                     Initialization Vector                     |
1541*       |         (This is the sequence number from IPSec header)       |
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                              0x1                              |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549*       AAD padded to 128 bits with 0
1550*       for example, assume AAD is a u32 vector
1551*
1552*       if AAD is 8 bytes:
1553*       AAD[3] = {A0, A1};
1554*       padded AAD in xmm register = {A1 A0 0 0}
1555*
1556*       0                   1                   2                   3
1557*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559*       |                               SPI (A1)                        |
1560*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561*       |                     32-bit Sequence Number (A0)               |
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                              0x0                              |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566*                                       AAD Format with 32-bit Sequence Number
1567*
1568*       if AAD is 12 bytes:
1569*       AAD[3] = {A0, A1, A2};
1570*       padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572*       0                   1                   2                   3
1573*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577*       |                               SPI (A2)                        |
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579*       |                 64-bit Extended Sequence Number {A1,A0}       |
1580*       |                                                               |
1581*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                              0x0                              |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585*                        AAD Format with 64-bit Extended Sequence Number
1586*
1587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
1590SYM_FUNC_START(aesni_gcm_dec)
1591        FUNC_SAVE
1592
1593        GCM_INIT %arg6, arg7, arg8, arg9
1594        GCM_ENC_DEC dec
1595        GCM_COMPLETE arg10, arg11
1596        FUNC_RESTORE
1597        RET
1598SYM_FUNC_END(aesni_gcm_dec)
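/*
 * A minimal sketch (not taken from the kernel glue code) of how a caller
 * could assemble the 16-byte pre-counter block j0 described in the header
 * above from the RFC4106 salt and the per-packet IV; the helper and
 * parameter names are illustrative only:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *                           const uint8_t iv[8])
 *      {
 *              memcpy(j0, salt, 4);      // 4-byte salt from the SA
 *              memcpy(j0 + 4, iv, 8);    // 8-byte IV from the ESP payload
 *              j0[12] = 0;               // trailing big-endian 0x00000001
 *              j0[13] = 0;
 *              j0[14] = 0;
 *              j0[15] = 1;
 *      }
 */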
1599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603*                    struct gcm_context_data *data
1604*                                        // Context data
1605*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606*                    const u8 *in,       // Plaintext input
1607*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612*                    const u8 *aad,      // Additional Authentication Data (AAD)
1613*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614*                    u8 *auth_tag,       // Authenticated Tag output.
1615*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616*                                        // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621*       keys are pre-expanded and aligned to 16 bytes. We are using the
1622*       first set of 11/13/15 round keys (for AES-128/192/256) in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626*       0                   1                   2                   3
1627*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629*       |                             Salt  (From the SA)               |
1630*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631*       |                     Initialization Vector                     |
1632*       |         (This is the sequence number from IPSec header)       |
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                              0x1                              |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640*       AAD padded to 128 bits with 0
1641*       for example, assume AAD is a u32 vector
1642*
1643*       if AAD is 8 bytes:
1644*       AAD[3] = {A0, A1};
1645*       padded AAD in xmm register = {A1 A0 0 0}
1646*
1647*       0                   1                   2                   3
1648*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650*       |                               SPI (A1)                        |
1651*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652*       |                     32-bit Sequence Number (A0)               |
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                              0x0                              |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657*                                 AAD Format with 32-bit Sequence Number
1658*
1659*       if AAD is 12 bytes:
1660*       AAD[3] = {A0, A1, A2};
1661*       padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663*       0                   1                   2                   3
1664*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666*       |                               SPI (A2)                        |
1667*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668*       |                 64-bit Extended Sequence Number {A1,A0}       |
1669*       |                                                               |
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                              0x0                              |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674*                         AAD Format with 64-bit Extended Sequence Number
1675*
1676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
1678SYM_FUNC_START(aesni_gcm_enc)
1679        FUNC_SAVE
1680
1681        GCM_INIT %arg6, arg7, arg8, arg9
1682        GCM_ENC_DEC enc
1683
1684        GCM_COMPLETE arg10, arg11
1685        FUNC_RESTORE
1686        RET
1687SYM_FUNC_END(aesni_gcm_enc)
1688
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691*                     struct gcm_context_data *data,
1692*                                         // context data
1693*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697*                     const u8 *aad,      // Additional Authentication Data (AAD)
1698*                     u64 aad_len)        // Length of AAD in bytes.
1699*/
1700SYM_FUNC_START(aesni_gcm_init)
1701        FUNC_SAVE
1702        GCM_INIT %arg3, %arg4, %arg5, %arg6
1703        FUNC_RESTORE
1704        RET
1705SYM_FUNC_END(aesni_gcm_init)
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709*                    struct gcm_context_data *data,
1710*                                        // context data
1711*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712*                    const u8 *in,       // Plaintext input
1713*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1714*/
1715SYM_FUNC_START(aesni_gcm_enc_update)
1716        FUNC_SAVE
1717        GCM_ENC_DEC enc
1718        FUNC_RESTORE
1719        RET
1720SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724*                    struct gcm_context_data *data,
1725*                                        // context data
1726*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1727*                    const u8 *in,       // Plaintext input
1728*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1729*/
1730SYM_FUNC_START(aesni_gcm_dec_update)
1731        FUNC_SAVE
1732        GCM_ENC_DEC dec
1733        FUNC_RESTORE
1734        RET
1735SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739*                    struct gcm_context_data *data,
1740*                                        // context data
1741*                    u8 *auth_tag,       // Authenticated Tag output.
1742*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743*                                        // 12 or 8.
1744*/
1745SYM_FUNC_START(aesni_gcm_finalize)
1746        FUNC_SAVE
1747        GCM_COMPLETE %arg3, %arg4
1748        FUNC_RESTORE
1749        RET
1750SYM_FUNC_END(aesni_gcm_finalize)
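/*
 * Taken together, the three entry points above form the scatter-gather GCM
 * interface: aesni_gcm_init() once per request, then aesni_gcm_enc_update()
 * or aesni_gcm_dec_update() once per contiguous chunk, then
 * aesni_gcm_finalize() to produce the tag. A minimal call-order sketch,
 * assuming the caller has already expanded the key into aes_ctx, derived
 * hash_subkey, and bracketed the sequence with kernel_fpu_begin()/
 * kernel_fpu_end() as the C glue code does:
 *
 *      aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *      // for each contiguous chunk of input:
 *      aesni_gcm_enc_update(aes_ctx, &data, dst, src, chunk_len);
 *      aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 */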
1751
1752#endif
1753
1754SYM_FUNC_START_LOCAL(_key_expansion_256a)
1755        pshufd $0b11111111, %xmm1, %xmm1
1756        shufps $0b00010000, %xmm0, %xmm4
1757        pxor %xmm4, %xmm0
1758        shufps $0b10001100, %xmm0, %xmm4
1759        pxor %xmm4, %xmm0
1760        pxor %xmm1, %xmm0
1761        movaps %xmm0, (TKEYP)
1762        add $0x10, TKEYP
1763        RET
1764SYM_FUNC_END(_key_expansion_256a)
1765SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767SYM_FUNC_START_LOCAL(_key_expansion_192a)
1768        pshufd $0b01010101, %xmm1, %xmm1
1769        shufps $0b00010000, %xmm0, %xmm4
1770        pxor %xmm4, %xmm0
1771        shufps $0b10001100, %xmm0, %xmm4
1772        pxor %xmm4, %xmm0
1773        pxor %xmm1, %xmm0
1774
1775        movaps %xmm2, %xmm5
1776        movaps %xmm2, %xmm6
1777        pslldq $4, %xmm5
1778        pshufd $0b11111111, %xmm0, %xmm3
1779        pxor %xmm3, %xmm2
1780        pxor %xmm5, %xmm2
1781
1782        movaps %xmm0, %xmm1
1783        shufps $0b01000100, %xmm0, %xmm6
1784        movaps %xmm6, (TKEYP)
1785        shufps $0b01001110, %xmm2, %xmm1
1786        movaps %xmm1, 0x10(TKEYP)
1787        add $0x20, TKEYP
1788        RET
1789SYM_FUNC_END(_key_expansion_192a)
1790
1791SYM_FUNC_START_LOCAL(_key_expansion_192b)
1792        pshufd $0b01010101, %xmm1, %xmm1
1793        shufps $0b00010000, %xmm0, %xmm4
1794        pxor %xmm4, %xmm0
1795        shufps $0b10001100, %xmm0, %xmm4
1796        pxor %xmm4, %xmm0
1797        pxor %xmm1, %xmm0
1798
1799        movaps %xmm2, %xmm5
1800        pslldq $4, %xmm5
1801        pshufd $0b11111111, %xmm0, %xmm3
1802        pxor %xmm3, %xmm2
1803        pxor %xmm5, %xmm2
1804
1805        movaps %xmm0, (TKEYP)
1806        add $0x10, TKEYP
1807        RET
1808SYM_FUNC_END(_key_expansion_192b)
1809
1810SYM_FUNC_START_LOCAL(_key_expansion_256b)
1811        pshufd $0b10101010, %xmm1, %xmm1
1812        shufps $0b00010000, %xmm2, %xmm4
1813        pxor %xmm4, %xmm2
1814        shufps $0b10001100, %xmm2, %xmm4
1815        pxor %xmm4, %xmm2
1816        pxor %xmm1, %xmm2
1817        movaps %xmm2, (TKEYP)
1818        add $0x10, TKEYP
1819        RET
1820SYM_FUNC_END(_key_expansion_256b)
1821
1822/*
1823 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824 *                   unsigned int key_len)
1825 */
1826SYM_FUNC_START(aesni_set_key)
1827        FRAME_BEGIN
1828#ifndef __x86_64__
1829        pushl KEYP
1830        movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1831        movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1832        movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1833#endif
1834        movups (UKEYP), %xmm0           # user key (first 16 bytes)
1835        movaps %xmm0, (KEYP)
1836        lea 0x10(KEYP), TKEYP           # key addr
1837        movl %edx, 480(KEYP)
1838        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1839        cmp $24, %dl
1840        jb .Lenc_key128
1841        je .Lenc_key192
1842        movups 0x10(UKEYP), %xmm2       # other user key
1843        movaps %xmm2, (TKEYP)
1844        add $0x10, TKEYP
1845        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1846        call _key_expansion_256a
1847        aeskeygenassist $0x1, %xmm0, %xmm1
1848        call _key_expansion_256b
1849        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1850        call _key_expansion_256a
1851        aeskeygenassist $0x2, %xmm0, %xmm1
1852        call _key_expansion_256b
1853        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1854        call _key_expansion_256a
1855        aeskeygenassist $0x4, %xmm0, %xmm1
1856        call _key_expansion_256b
1857        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1858        call _key_expansion_256a
1859        aeskeygenassist $0x8, %xmm0, %xmm1
1860        call _key_expansion_256b
1861        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1862        call _key_expansion_256a
1863        aeskeygenassist $0x10, %xmm0, %xmm1
1864        call _key_expansion_256b
1865        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1866        call _key_expansion_256a
1867        aeskeygenassist $0x20, %xmm0, %xmm1
1868        call _key_expansion_256b
1869        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1870        call _key_expansion_256a
1871        jmp .Ldec_key
1872.Lenc_key192:
1873        movq 0x10(UKEYP), %xmm2         # other user key
1874        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1875        call _key_expansion_192a
1876        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1877        call _key_expansion_192b
1878        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1879        call _key_expansion_192a
1880        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1881        call _key_expansion_192b
1882        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1883        call _key_expansion_192a
1884        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1885        call _key_expansion_192b
1886        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1887        call _key_expansion_192a
1888        aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1889        call _key_expansion_192b
1890        jmp .Ldec_key
1891.Lenc_key128:
1892        aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1893        call _key_expansion_128
1894        aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1895        call _key_expansion_128
1896        aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1897        call _key_expansion_128
1898        aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1899        call _key_expansion_128
1900        aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1901        call _key_expansion_128
1902        aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1903        call _key_expansion_128
1904        aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1905        call _key_expansion_128
1906        aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1907        call _key_expansion_128
1908        aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1909        call _key_expansion_128
1910        aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1911        call _key_expansion_128
1912.Ldec_key:
1913        sub $0x10, TKEYP
1914        movaps (KEYP), %xmm0
1915        movaps (TKEYP), %xmm1
1916        movaps %xmm0, 240(TKEYP)
1917        movaps %xmm1, 240(KEYP)
1918        add $0x10, KEYP
1919        lea 240-16(TKEYP), UKEYP
1920.align 4
1921.Ldec_key_loop:
1922        movaps (KEYP), %xmm0
1923        aesimc %xmm0, %xmm1
1924        movaps %xmm1, (UKEYP)
1925        add $0x10, KEYP
1926        sub $0x10, UKEYP
1927        cmp TKEYP, KEYP
1928        jb .Ldec_key_loop
1929        xor AREG, AREG
1930#ifndef __x86_64__
1931        popl KEYP
1932#endif
1933        FRAME_END
1934        RET
1935SYM_FUNC_END(aesni_set_key)
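/*
 * The aeskeygenassist immediates used above for AES-128 (0x01, 0x02, ...,
 * 0x80, 0x1b, 0x36) are the AES round constants: each one is the previous
 * value doubled in GF(2^8) modulo the AES polynomial 0x11b. A small,
 * self-contained C sketch that reproduces the sequence (illustrative only,
 * not part of this file):
 *
 *      #include <stdint.h>
 *      #include <stdio.h>
 *
 *      int main(void)
 *      {
 *              uint8_t rcon = 0x01;
 *
 *              for (int round = 1; round <= 10; round++) {
 *                      printf("round %2d: 0x%02x\n", round, rcon);
 *                      // multiply by x in GF(2^8), reducing by 0x11b
 *                      rcon = (rcon << 1) ^ ((rcon & 0x80) ? 0x1b : 0x00);
 *              }
 *              return 0;
 *      }
 */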
1936
1937/*
1938 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1939 */
1940SYM_FUNC_START(aesni_enc)
1941        FRAME_BEGIN
1942#ifndef __x86_64__
1943        pushl KEYP
1944        pushl KLEN
1945        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1946        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1947        movl (FRAME_OFFSET+20)(%esp), INP       # src
1948#endif
1949        movl 480(KEYP), KLEN            # key length
1950        movups (INP), STATE             # input
1951        call _aesni_enc1
1952        movups STATE, (OUTP)            # output
1953#ifndef __x86_64__
1954        popl KLEN
1955        popl KEYP
1956#endif
1957        FRAME_END
1958        RET
1959SYM_FUNC_END(aesni_enc)
1960
1961/*
1962 * _aesni_enc1:         internal ABI
1963 * input:
1964 *      KEYP:           key struct pointer
1965 *      KLEN:           key length
1966 *      STATE:          initial state (input)
1967 * output:
1968 *      STATE:          final state (output)
1969 * changed:
1970 *      KEY
1971 *      TKEYP (T1)
1972 */
1973SYM_FUNC_START_LOCAL(_aesni_enc1)
1974        movaps (KEYP), KEY              # key
1975        mov KEYP, TKEYP
1976        pxor KEY, STATE         # round 0
1977        add $0x30, TKEYP
1978        cmp $24, KLEN
1979        jb .Lenc128
1980        lea 0x20(TKEYP), TKEYP
1981        je .Lenc192
1982        add $0x20, TKEYP
1983        movaps -0x60(TKEYP), KEY
1984        aesenc KEY, STATE
1985        movaps -0x50(TKEYP), KEY
1986        aesenc KEY, STATE
1987.align 4
1988.Lenc192:
1989        movaps -0x40(TKEYP), KEY
1990        aesenc KEY, STATE
1991        movaps -0x30(TKEYP), KEY
1992        aesenc KEY, STATE
1993.align 4
1994.Lenc128:
1995        movaps -0x20(TKEYP), KEY
1996        aesenc KEY, STATE
1997        movaps -0x10(TKEYP), KEY
1998        aesenc KEY, STATE
1999        movaps (TKEYP), KEY
2000        aesenc KEY, STATE
2001        movaps 0x10(TKEYP), KEY
2002        aesenc KEY, STATE
2003        movaps 0x20(TKEYP), KEY
2004        aesenc KEY, STATE
2005        movaps 0x30(TKEYP), KEY
2006        aesenc KEY, STATE
2007        movaps 0x40(TKEYP), KEY
2008        aesenc KEY, STATE
2009        movaps 0x50(TKEYP), KEY
2010        aesenc KEY, STATE
2011        movaps 0x60(TKEYP), KEY
2012        aesenc KEY, STATE
2013        movaps 0x70(TKEYP), KEY
2014        aesenclast KEY, STATE
2015        RET
2016SYM_FUNC_END(_aesni_enc1)
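/*
 * A worked note on the TKEYP offsets above: TKEYP starts at KEYP + 0x30 and
 * is advanced by a further 0x20 for 192-bit keys or 0x40 for 256-bit keys,
 * so the aesenclast key read from 0x70(TKEYP) lies at
 *
 *      KEYP + 0xa0 = 16 * 10   (AES-128)
 *      KEYP + 0xc0 = 16 * 12   (AES-192)
 *      KEYP + 0xe0 = 16 * 14   (AES-256)
 *
 * i.e. round key 10, 12 or 14 of the schedule laid out by aesni_set_key.
 * The other single/four block helpers below use the same offset pattern.
 */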
2017
2018/*
2019 * _aesni_enc4: internal ABI
2020 * input:
2021 *      KEYP:           key struct pointer
2022 *      KLEN:           key length
2023 *      STATE1:         initial state (input)
2024 *      STATE2
2025 *      STATE3
2026 *      STATE4
2027 * output:
2028 *      STATE1:         final state (output)
2029 *      STATE2
2030 *      STATE3
2031 *      STATE4
2032 * changed:
2033 *      KEY
2034 *      TKEYP (T1)
2035 */
2036SYM_FUNC_START_LOCAL(_aesni_enc4)
2037        movaps (KEYP), KEY              # key
2038        mov KEYP, TKEYP
2039        pxor KEY, STATE1                # round 0
2040        pxor KEY, STATE2
2041        pxor KEY, STATE3
2042        pxor KEY, STATE4
2043        add $0x30, TKEYP
2044        cmp $24, KLEN
2045        jb .L4enc128
2046        lea 0x20(TKEYP), TKEYP
2047        je .L4enc192
2048        add $0x20, TKEYP
2049        movaps -0x60(TKEYP), KEY
2050        aesenc KEY, STATE1
2051        aesenc KEY, STATE2
2052        aesenc KEY, STATE3
2053        aesenc KEY, STATE4
2054        movaps -0x50(TKEYP), KEY
2055        aesenc KEY, STATE1
2056        aesenc KEY, STATE2
2057        aesenc KEY, STATE3
2058        aesenc KEY, STATE4
2059#.align 4
2060.L4enc192:
2061        movaps -0x40(TKEYP), KEY
2062        aesenc KEY, STATE1
2063        aesenc KEY, STATE2
2064        aesenc KEY, STATE3
2065        aesenc KEY, STATE4
2066        movaps -0x30(TKEYP), KEY
2067        aesenc KEY, STATE1
2068        aesenc KEY, STATE2
2069        aesenc KEY, STATE3
2070        aesenc KEY, STATE4
2071#.align 4
2072.L4enc128:
2073        movaps -0x20(TKEYP), KEY
2074        aesenc KEY, STATE1
2075        aesenc KEY, STATE2
2076        aesenc KEY, STATE3
2077        aesenc KEY, STATE4
2078        movaps -0x10(TKEYP), KEY
2079        aesenc KEY, STATE1
2080        aesenc KEY, STATE2
2081        aesenc KEY, STATE3
2082        aesenc KEY, STATE4
2083        movaps (TKEYP), KEY
2084        aesenc KEY, STATE1
2085        aesenc KEY, STATE2
2086        aesenc KEY, STATE3
2087        aesenc KEY, STATE4
2088        movaps 0x10(TKEYP), KEY
2089        aesenc KEY, STATE1
2090        aesenc KEY, STATE2
2091        aesenc KEY, STATE3
2092        aesenc KEY, STATE4
2093        movaps 0x20(TKEYP), KEY
2094        aesenc KEY, STATE1
2095        aesenc KEY, STATE2
2096        aesenc KEY, STATE3
2097        aesenc KEY, STATE4
2098        movaps 0x30(TKEYP), KEY
2099        aesenc KEY, STATE1
2100        aesenc KEY, STATE2
2101        aesenc KEY, STATE3
2102        aesenc KEY, STATE4
2103        movaps 0x40(TKEYP), KEY
2104        aesenc KEY, STATE1
2105        aesenc KEY, STATE2
2106        aesenc KEY, STATE3
2107        aesenc KEY, STATE4
2108        movaps 0x50(TKEYP), KEY
2109        aesenc KEY, STATE1
2110        aesenc KEY, STATE2
2111        aesenc KEY, STATE3
2112        aesenc KEY, STATE4
2113        movaps 0x60(TKEYP), KEY
2114        aesenc KEY, STATE1
2115        aesenc KEY, STATE2
2116        aesenc KEY, STATE3
2117        aesenc KEY, STATE4
2118        movaps 0x70(TKEYP), KEY
2119        aesenclast KEY, STATE1          # last round
2120        aesenclast KEY, STATE2
2121        aesenclast KEY, STATE3
2122        aesenclast KEY, STATE4
2123        RET
2124SYM_FUNC_END(_aesni_enc4)
2125
2126/*
2127 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
2128 */
2129SYM_FUNC_START(aesni_dec)
2130        FRAME_BEGIN
2131#ifndef __x86_64__
2132        pushl KEYP
2133        pushl KLEN
2134        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2135        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2136        movl (FRAME_OFFSET+20)(%esp), INP       # src
2137#endif
2138        mov 480(KEYP), KLEN             # key length
2139        add $240, KEYP
2140        movups (INP), STATE             # input
2141        call _aesni_dec1
2142        movups STATE, (OUTP)            # output
2143#ifndef __x86_64__
2144        popl KLEN
2145        popl KEYP
2146#endif
2147        FRAME_END
2148        RET
2149SYM_FUNC_END(aesni_dec)
2150
2151/*
2152 * _aesni_dec1:         internal ABI
2153 * input:
2154 *      KEYP:           key struct pointer
2155 *      KLEN:           key length
2156 *      STATE:          initial state (input)
2157 * output:
2158 *      STATE:          final state (output)
2159 * changed:
2160 *      KEY
2161 *      TKEYP (T1)
2162 */
2163SYM_FUNC_START_LOCAL(_aesni_dec1)
2164        movaps (KEYP), KEY              # key
2165        mov KEYP, TKEYP
2166        pxor KEY, STATE         # round 0
2167        add $0x30, TKEYP
2168        cmp $24, KLEN
2169        jb .Ldec128
2170        lea 0x20(TKEYP), TKEYP
2171        je .Ldec192
2172        add $0x20, TKEYP
2173        movaps -0x60(TKEYP), KEY
2174        aesdec KEY, STATE
2175        movaps -0x50(TKEYP), KEY
2176        aesdec KEY, STATE
2177.align 4
2178.Ldec192:
2179        movaps -0x40(TKEYP), KEY
2180        aesdec KEY, STATE
2181        movaps -0x30(TKEYP), KEY
2182        aesdec KEY, STATE
2183.align 4
2184.Ldec128:
2185        movaps -0x20(TKEYP), KEY
2186        aesdec KEY, STATE
2187        movaps -0x10(TKEYP), KEY
2188        aesdec KEY, STATE
2189        movaps (TKEYP), KEY
2190        aesdec KEY, STATE
2191        movaps 0x10(TKEYP), KEY
2192        aesdec KEY, STATE
2193        movaps 0x20(TKEYP), KEY
2194        aesdec KEY, STATE
2195        movaps 0x30(TKEYP), KEY
2196        aesdec KEY, STATE
2197        movaps 0x40(TKEYP), KEY
2198        aesdec KEY, STATE
2199        movaps 0x50(TKEYP), KEY
2200        aesdec KEY, STATE
2201        movaps 0x60(TKEYP), KEY
2202        aesdec KEY, STATE
2203        movaps 0x70(TKEYP), KEY
2204        aesdeclast KEY, STATE
2205        RET
2206SYM_FUNC_END(_aesni_dec1)
2207
2208/*
2209 * _aesni_dec4: internal ABI
2210 * input:
2211 *      KEYP:           key struct pointer
2212 *      KLEN:           key length
2213 *      STATE1:         initial state (input)
2214 *      STATE2
2215 *      STATE3
2216 *      STATE4
2217 * output:
2218 *      STATE1:         final state (output)
2219 *      STATE2
2220 *      STATE3
2221 *      STATE4
2222 * changed:
2223 *      KEY
2224 *      TKEYP (T1)
2225 */
2226SYM_FUNC_START_LOCAL(_aesni_dec4)
2227        movaps (KEYP), KEY              # key
2228        mov KEYP, TKEYP
2229        pxor KEY, STATE1                # round 0
2230        pxor KEY, STATE2
2231        pxor KEY, STATE3
2232        pxor KEY, STATE4
2233        add $0x30, TKEYP
2234        cmp $24, KLEN
2235        jb .L4dec128
2236        lea 0x20(TKEYP), TKEYP
2237        je .L4dec192
2238        add $0x20, TKEYP
2239        movaps -0x60(TKEYP), KEY
2240        aesdec KEY, STATE1
2241        aesdec KEY, STATE2
2242        aesdec KEY, STATE3
2243        aesdec KEY, STATE4
2244        movaps -0x50(TKEYP), KEY
2245        aesdec KEY, STATE1
2246        aesdec KEY, STATE2
2247        aesdec KEY, STATE3
2248        aesdec KEY, STATE4
2249.align 4
2250.L4dec192:
2251        movaps -0x40(TKEYP), KEY
2252        aesdec KEY, STATE1
2253        aesdec KEY, STATE2
2254        aesdec KEY, STATE3
2255        aesdec KEY, STATE4
2256        movaps -0x30(TKEYP), KEY
2257        aesdec KEY, STATE1
2258        aesdec KEY, STATE2
2259        aesdec KEY, STATE3
2260        aesdec KEY, STATE4
2261.align 4
2262.L4dec128:
2263        movaps -0x20(TKEYP), KEY
2264        aesdec KEY, STATE1
2265        aesdec KEY, STATE2
2266        aesdec KEY, STATE3
2267        aesdec KEY, STATE4
2268        movaps -0x10(TKEYP), KEY
2269        aesdec KEY, STATE1
2270        aesdec KEY, STATE2
2271        aesdec KEY, STATE3
2272        aesdec KEY, STATE4
2273        movaps (TKEYP), KEY
2274        aesdec KEY, STATE1
2275        aesdec KEY, STATE2
2276        aesdec KEY, STATE3
2277        aesdec KEY, STATE4
2278        movaps 0x10(TKEYP), KEY
2279        aesdec KEY, STATE1
2280        aesdec KEY, STATE2
2281        aesdec KEY, STATE3
2282        aesdec KEY, STATE4
2283        movaps 0x20(TKEYP), KEY
2284        aesdec KEY, STATE1
2285        aesdec KEY, STATE2
2286        aesdec KEY, STATE3
2287        aesdec KEY, STATE4
2288        movaps 0x30(TKEYP), KEY
2289        aesdec KEY, STATE1
2290        aesdec KEY, STATE2
2291        aesdec KEY, STATE3
2292        aesdec KEY, STATE4
2293        movaps 0x40(TKEYP), KEY
2294        aesdec KEY, STATE1
2295        aesdec KEY, STATE2
2296        aesdec KEY, STATE3
2297        aesdec KEY, STATE4
2298        movaps 0x50(TKEYP), KEY
2299        aesdec KEY, STATE1
2300        aesdec KEY, STATE2
2301        aesdec KEY, STATE3
2302        aesdec KEY, STATE4
2303        movaps 0x60(TKEYP), KEY
2304        aesdec KEY, STATE1
2305        aesdec KEY, STATE2
2306        aesdec KEY, STATE3
2307        aesdec KEY, STATE4
2308        movaps 0x70(TKEYP), KEY
2309        aesdeclast KEY, STATE1          # last round
2310        aesdeclast KEY, STATE2
2311        aesdeclast KEY, STATE3
2312        aesdeclast KEY, STATE4
2313        RET
2314SYM_FUNC_END(_aesni_dec4)
2315
2316/*
2317 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318 *                    size_t len)
2319 */
2320SYM_FUNC_START(aesni_ecb_enc)
2321        FRAME_BEGIN
2322#ifndef __x86_64__
2323        pushl LEN
2324        pushl KEYP
2325        pushl KLEN
2326        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2327        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2328        movl (FRAME_OFFSET+24)(%esp), INP       # src
2329        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2330#endif
2331        test LEN, LEN           # check length
2332        jz .Lecb_enc_ret
2333        mov 480(KEYP), KLEN
2334        cmp $16, LEN
2335        jb .Lecb_enc_ret
2336        cmp $64, LEN
2337        jb .Lecb_enc_loop1
2338.align 4
2339.Lecb_enc_loop4:
2340        movups (INP), STATE1
2341        movups 0x10(INP), STATE2
2342        movups 0x20(INP), STATE3
2343        movups 0x30(INP), STATE4
2344        call _aesni_enc4
2345        movups STATE1, (OUTP)
2346        movups STATE2, 0x10(OUTP)
2347        movups STATE3, 0x20(OUTP)
2348        movups STATE4, 0x30(OUTP)
2349        sub $64, LEN
2350        add $64, INP
2351        add $64, OUTP
2352        cmp $64, LEN
2353        jge .Lecb_enc_loop4
2354        cmp $16, LEN
2355        jb .Lecb_enc_ret
2356.align 4
2357.Lecb_enc_loop1:
2358        movups (INP), STATE1
2359        call _aesni_enc1
2360        movups STATE1, (OUTP)
2361        sub $16, LEN
2362        add $16, INP
2363        add $16, OUTP
2364        cmp $16, LEN
2365        jge .Lecb_enc_loop1
2366.Lecb_enc_ret:
2367#ifndef __x86_64__
2368        popl KLEN
2369        popl KEYP
2370        popl LEN
2371#endif
2372        FRAME_END
2373        RET
2374SYM_FUNC_END(aesni_ecb_enc)
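/*
 * The loop structure above (shared by the other multi-block loops below)
 * handles four blocks per iteration while at least 64 bytes remain, then
 * falls back to single blocks; any trailing fragment shorter than 16 bytes
 * is left untouched. A minimal C sketch of the same chunking, with
 * enc_block()/enc_4blocks() standing in for _aesni_enc1/_aesni_enc4:
 *
 *      while (len >= 64) {
 *              enc_4blocks(ctx, dst, src);     // 4 x 16 bytes per call
 *              src += 64; dst += 64; len -= 64;
 *      }
 *      while (len >= 16) {
 *              enc_block(ctx, dst, src);       // one block at a time
 *              src += 16; dst += 16; len -= 16;
 *      }
 */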
2375
2376/*
2377 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2378 *                    size_t len);
2379 */
2380SYM_FUNC_START(aesni_ecb_dec)
2381        FRAME_BEGIN
2382#ifndef __x86_64__
2383        pushl LEN
2384        pushl KEYP
2385        pushl KLEN
2386        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2387        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2388        movl (FRAME_OFFSET+24)(%esp), INP       # src
2389        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2390#endif
2391        test LEN, LEN
2392        jz .Lecb_dec_ret
2393        mov 480(KEYP), KLEN
2394        add $240, KEYP
2395        cmp $16, LEN
2396        jb .Lecb_dec_ret
2397        cmp $64, LEN
2398        jb .Lecb_dec_loop1
2399.align 4
2400.Lecb_dec_loop4:
2401        movups (INP), STATE1
2402        movups 0x10(INP), STATE2
2403        movups 0x20(INP), STATE3
2404        movups 0x30(INP), STATE4
2405        call _aesni_dec4
2406        movups STATE1, (OUTP)
2407        movups STATE2, 0x10(OUTP)
2408        movups STATE3, 0x20(OUTP)
2409        movups STATE4, 0x30(OUTP)
2410        sub $64, LEN
2411        add $64, INP
2412        add $64, OUTP
2413        cmp $64, LEN
2414        jge .Lecb_dec_loop4
2415        cmp $16, LEN
2416        jb .Lecb_dec_ret
2417.align 4
2418.Lecb_dec_loop1:
2419        movups (INP), STATE1
2420        call _aesni_dec1
2421        movups STATE1, (OUTP)
2422        sub $16, LEN
2423        add $16, INP
2424        add $16, OUTP
2425        cmp $16, LEN
2426        jge .Lecb_dec_loop1
2427.Lecb_dec_ret:
2428#ifndef __x86_64__
2429        popl KLEN
2430        popl KEYP
2431        popl LEN
2432#endif
2433        FRAME_END
2434        RET
2435SYM_FUNC_END(aesni_ecb_dec)
2436
2437/*
2438 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439 *                    size_t len, u8 *iv)
2440 */
2441SYM_FUNC_START(aesni_cbc_enc)
2442        FRAME_BEGIN
2443#ifndef __x86_64__
2444        pushl IVP
2445        pushl LEN
2446        pushl KEYP
2447        pushl KLEN
2448        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2449        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2450        movl (FRAME_OFFSET+28)(%esp), INP       # src
2451        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2452        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2453#endif
2454        cmp $16, LEN
2455        jb .Lcbc_enc_ret
2456        mov 480(KEYP), KLEN
2457        movups (IVP), STATE     # load iv as initial state
2458.align 4
2459.Lcbc_enc_loop:
2460        movups (INP), IN        # load input
2461        pxor IN, STATE
2462        call _aesni_enc1
2463        movups STATE, (OUTP)    # store output
2464        sub $16, LEN
2465        add $16, INP
2466        add $16, OUTP
2467        cmp $16, LEN
2468        jge .Lcbc_enc_loop
2469        movups STATE, (IVP)
2470.Lcbc_enc_ret:
2471#ifndef __x86_64__
2472        popl KLEN
2473        popl KEYP
2474        popl LEN
2475        popl IVP
2476#endif
2477        FRAME_END
2478        RET
2479SYM_FUNC_END(aesni_cbc_enc)
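/*
 * CBC encryption is inherently serial: every block is XORed with the
 * previous ciphertext block (the IV for the first block) before being
 * encrypted, which is why the loop above is single-block only. A minimal C
 * sketch, with enc_block() and xor_block() as assumed helpers; the asm keeps
 * the running ciphertext in STATE instead of reloading it:
 *
 *      memcpy(prev, iv, 16);
 *      for (i = 0; i + 16 <= len; i += 16) {
 *              xor_block(tmp, src + i, prev);  // P_i xor C_{i-1}
 *              enc_block(ctx, dst + i, tmp);   // C_i
 *              memcpy(prev, dst + i, 16);
 *      }
 *      memcpy(iv, prev, 16);                   // chain into the next call
 */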
2480
2481/*
2482 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2483 *                    size_t len, u8 *iv)
2484 */
2485SYM_FUNC_START(aesni_cbc_dec)
2486        FRAME_BEGIN
2487#ifndef __x86_64__
2488        pushl IVP
2489        pushl LEN
2490        pushl KEYP
2491        pushl KLEN
2492        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2493        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2494        movl (FRAME_OFFSET+28)(%esp), INP       # src
2495        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2496        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2497#endif
2498        cmp $16, LEN
2499        jb .Lcbc_dec_just_ret
2500        mov 480(KEYP), KLEN
2501        add $240, KEYP
2502        movups (IVP), IV
2503        cmp $64, LEN
2504        jb .Lcbc_dec_loop1
2505.align 4
2506.Lcbc_dec_loop4:
2507        movups (INP), IN1
2508        movaps IN1, STATE1
2509        movups 0x10(INP), IN2
2510        movaps IN2, STATE2
2511#ifdef __x86_64__
2512        movups 0x20(INP), IN3
2513        movaps IN3, STATE3
2514        movups 0x30(INP), IN4
2515        movaps IN4, STATE4
2516#else
2517        movups 0x20(INP), IN1
2518        movaps IN1, STATE3
2519        movups 0x30(INP), IN2
2520        movaps IN2, STATE4
2521#endif
2522        call _aesni_dec4
2523        pxor IV, STATE1
2524#ifdef __x86_64__
2525        pxor IN1, STATE2
2526        pxor IN2, STATE3
2527        pxor IN3, STATE4
2528        movaps IN4, IV
2529#else
2530        pxor IN1, STATE4
2531        movaps IN2, IV
2532        movups (INP), IN1
2533        pxor IN1, STATE2
2534        movups 0x10(INP), IN2
2535        pxor IN2, STATE3
2536#endif
2537        movups STATE1, (OUTP)
2538        movups STATE2, 0x10(OUTP)
2539        movups STATE3, 0x20(OUTP)
2540        movups STATE4, 0x30(OUTP)
2541        sub $64, LEN
2542        add $64, INP
2543        add $64, OUTP
2544        cmp $64, LEN
2545        jge .Lcbc_dec_loop4
2546        cmp $16, LEN
2547        jb .Lcbc_dec_ret
2548.align 4
2549.Lcbc_dec_loop1:
2550        movups (INP), IN
2551        movaps IN, STATE
2552        call _aesni_dec1
2553        pxor IV, STATE
2554        movups STATE, (OUTP)
2555        movaps IN, IV
2556        sub $16, LEN
2557        add $16, INP
2558        add $16, OUTP
2559        cmp $16, LEN
2560        jge .Lcbc_dec_loop1
2561.Lcbc_dec_ret:
2562        movups IV, (IVP)
2563.Lcbc_dec_just_ret:
2564#ifndef __x86_64__
2565        popl KLEN
2566        popl KEYP
2567        popl LEN
2568        popl IVP
2569#endif
2570        FRAME_END
2571        RET
2572SYM_FUNC_END(aesni_cbc_dec)
2573
2574/*
2575 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2576 *                        size_t len, u8 *iv)
2577 */
2578SYM_FUNC_START(aesni_cts_cbc_enc)
2579        FRAME_BEGIN
2580#ifndef __x86_64__
2581        pushl IVP
2582        pushl LEN
2583        pushl KEYP
2584        pushl KLEN
2585        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2586        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2587        movl (FRAME_OFFSET+28)(%esp), INP       # src
2588        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2589        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2590        lea .Lcts_permute_table, T1
2591#else
2592        lea .Lcts_permute_table(%rip), T1
2593#endif
2594        mov 480(KEYP), KLEN
2595        movups (IVP), STATE
2596        sub $16, LEN
2597        mov T1, IVP
2598        add $32, IVP
2599        add LEN, T1
2600        sub LEN, IVP
2601        movups (T1), %xmm4
2602        movups (IVP), %xmm5
2603
2604        movups (INP), IN1
2605        add LEN, INP
2606        movups (INP), IN2
2607
2608        pxor IN1, STATE
2609        call _aesni_enc1
2610
2611        pshufb %xmm5, IN2
2612        pxor STATE, IN2
2613        pshufb %xmm4, STATE
2614        add OUTP, LEN
2615        movups STATE, (LEN)
2616
2617        movaps IN2, STATE
2618        call _aesni_enc1
2619        movups STATE, (OUTP)
2620
2621#ifndef __x86_64__
2622        popl KLEN
2623        popl KEYP
2624        popl LEN
2625        popl IVP
2626#endif
2627        FRAME_END
2628        RET
2629SYM_FUNC_END(aesni_cts_cbc_enc)
2630
2631/*
2632 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2633 *                        size_t len, u8 *iv)
2634 */
2635SYM_FUNC_START(aesni_cts_cbc_dec)
2636        FRAME_BEGIN
2637#ifndef __x86_64__
2638        pushl IVP
2639        pushl LEN
2640        pushl KEYP
2641        pushl KLEN
2642        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2643        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2644        movl (FRAME_OFFSET+28)(%esp), INP       # src
2645        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2646        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2647        lea .Lcts_permute_table, T1
2648#else
2649        lea .Lcts_permute_table(%rip), T1
2650#endif
2651        mov 480(KEYP), KLEN
2652        add $240, KEYP
2653        movups (IVP), IV
2654        sub $16, LEN
2655        mov T1, IVP
2656        add $32, IVP
2657        add LEN, T1
2658        sub LEN, IVP
2659        movups (T1), %xmm4
2660
2661        movups (INP), STATE
2662        add LEN, INP
2663        movups (INP), IN1
2664
2665        call _aesni_dec1
2666        movaps STATE, IN2
2667        pshufb %xmm4, STATE
2668        pxor IN1, STATE
2669
2670        add OUTP, LEN
2671        movups STATE, (LEN)
2672
2673        movups (IVP), %xmm0
2674        pshufb %xmm0, IN1
2675        pblendvb IN2, IN1
2676        movaps IN1, STATE
2677        call _aesni_dec1
2678
2679        pxor IV, STATE
2680        movups STATE, (OUTP)
2681
2682#ifndef __x86_64__
2683        popl KLEN
2684        popl KEYP
2685        popl LEN
2686        popl IVP
2687#endif
2688        FRAME_END
2689        RET
2690SYM_FUNC_END(aesni_cts_cbc_dec)
2691
2692.pushsection .rodata
2693.align 16
2694.Lcts_permute_table:
2695        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697        .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698        .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701#ifdef __x86_64__
2702.Lbswap_mask:
2703        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704#endif
2705.popsection
2706
2707#ifdef __x86_64__
2708/*
2709 * _aesni_inc_init:     internal ABI
2710 *      setup registers used by _aesni_inc
2711 * input:
2712 *      IV
2713 * output:
2714 *      CTR:    == IV, in little endian
2715 *      TCTR_LOW: == lower qword of CTR
2716 *      INC:    == 1, in little endian
2717 *      BSWAP_MASK == endian swapping mask
2718 */
2719SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720        movaps .Lbswap_mask, BSWAP_MASK
2721        movaps IV, CTR
2722        pshufb BSWAP_MASK, CTR
2723        mov $1, TCTR_LOW
2724        movq TCTR_LOW, INC
2725        movq CTR, TCTR_LOW
2726        RET
2727SYM_FUNC_END(_aesni_inc_init)
2728
2729/*
2730 * _aesni_inc:          internal ABI
2731 *      Increase IV by 1, IV is in big endian
2732 * input:
2733 *      IV
2734 *      CTR:    == IV, in little endian
2735 *      TCTR_LOW: == lower qword of CTR
2736 *      INC:    == 1, in little endian
2737 *      BSWAP_MASK == endian swapping mask
2738 * output:
2739 *      IV:     Increased by 1
2740 * changed:
2741 *      CTR:    == output IV, in little endian
2742 *      TCTR_LOW: == lower qword of CTR
2743 */
2744SYM_FUNC_START_LOCAL(_aesni_inc)
2745        paddq INC, CTR
2746        add $1, TCTR_LOW
2747        jnc .Linc_low
2748        pslldq $8, INC
2749        paddq INC, CTR
2750        psrldq $8, INC
2751.Linc_low:
2752        movaps CTR, IV
2753        pshufb BSWAP_MASK, IV
2754        RET
2755SYM_FUNC_END(_aesni_inc)
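/*
 * _aesni_inc keeps the counter little endian in CTR/TCTR_LOW so the common
 * case is one paddq plus a carry check, and only byte-swaps the result back
 * into the big-endian IV. A portable C sketch of the equivalent big-endian
 * 128-bit increment:
 *
 *      #include <stdint.h>
 *
 *      static void ctr128_inc_be(uint8_t ctr[16])
 *      {
 *              for (int i = 15; i >= 0; i--)
 *                      if (++ctr[i] != 0)      // stop once a byte does not wrap
 *                              break;
 *      }
 */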
2756
2757/*
2758 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2759 *                    size_t len, u8 *iv)
2760 */
2761SYM_FUNC_START(aesni_ctr_enc)
2762        FRAME_BEGIN
2763        cmp $16, LEN
2764        jb .Lctr_enc_just_ret
2765        mov 480(KEYP), KLEN
2766        movups (IVP), IV
2767        call _aesni_inc_init
2768        cmp $64, LEN
2769        jb .Lctr_enc_loop1
2770.align 4
2771.Lctr_enc_loop4:
2772        movaps IV, STATE1
2773        call _aesni_inc
2774        movups (INP), IN1
2775        movaps IV, STATE2
2776        call _aesni_inc
2777        movups 0x10(INP), IN2
2778        movaps IV, STATE3
2779        call _aesni_inc
2780        movups 0x20(INP), IN3
2781        movaps IV, STATE4
2782        call _aesni_inc
2783        movups 0x30(INP), IN4
2784        call _aesni_enc4
2785        pxor IN1, STATE1
2786        movups STATE1, (OUTP)
2787        pxor IN2, STATE2
2788        movups STATE2, 0x10(OUTP)
2789        pxor IN3, STATE3
2790        movups STATE3, 0x20(OUTP)
2791        pxor IN4, STATE4
2792        movups STATE4, 0x30(OUTP)
2793        sub $64, LEN
2794        add $64, INP
2795        add $64, OUTP
2796        cmp $64, LEN
2797        jge .Lctr_enc_loop4
2798        cmp $16, LEN
2799        jb .Lctr_enc_ret
2800.align 4
2801.Lctr_enc_loop1:
2802        movaps IV, STATE
2803        call _aesni_inc
2804        movups (INP), IN
2805        call _aesni_enc1
2806        pxor IN, STATE
2807        movups STATE, (OUTP)
2808        sub $16, LEN
2809        add $16, INP
2810        add $16, OUTP
2811        cmp $16, LEN
2812        jge .Lctr_enc_loop1
2813.Lctr_enc_ret:
2814        movups IV, (IVP)
2815.Lctr_enc_just_ret:
2816        FRAME_END
2817        RET
2818SYM_FUNC_END(aesni_ctr_enc)
2819
2820#endif
2821
2822.section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823.align 16
2824.Lgf128mul_x_ble_mask:
2825        .octa 0x00000000000000010000000000000087
2826.previous
2827
2828/*
2829 * _aesni_gf128mul_x_ble:               internal ABI
2830 *      Multiply in GF(2^128) for XTS IVs
2831 * input:
2832 *      IV:     current IV
2833 *      GF128MUL_MASK == mask with 0x87 and 0x01
2834 * output:
2835 *      IV:     next IV
2836 * changed:
2837 *      CTR:    == temporary value
2838 */
2839#define _aesni_gf128mul_x_ble() \
2840        pshufd $0x13, IV, KEY; \
2841        paddq IV, IV; \
2842        psrad $31, KEY; \
2843        pand GF128MUL_MASK, KEY; \
2844        pxor KEY, IV;
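/*
 * _aesni_gf128mul_x_ble() multiplies the XTS tweak by x in GF(2^128): paddq
 * shifts each 64-bit lane left by one, and the masked pshufd/psrad value
 * patches in what paddq cannot propagate, namely the carry from the low lane
 * into the high lane (the 0x01 half of the mask) and the reduction of bit
 * 127 by 0x87 = x^7 + x^2 + x + 1. A portable C sketch of the same operation
 * on a little-endian 128-bit tweak, with illustrative names:
 *
 *      #include <stdint.h>
 *
 *      static void xts_mul_x(uint64_t *lo, uint64_t *hi)
 *      {
 *              uint64_t carry = *hi >> 63;             // bit 127 falls off the top
 *
 *              *hi = (*hi << 1) | (*lo >> 63);         // 128-bit shift left by one
 *              *lo = (*lo << 1) ^ (carry ? 0x87 : 0);  // reduce by the XTS polynomial
 *      }
 */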
2845
2846/*
2847 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848 *                        const u8 *src, unsigned int len, le128 *iv)
2849 */
2850SYM_FUNC_START(aesni_xts_encrypt)
2851        FRAME_BEGIN
2852#ifndef __x86_64__
2853        pushl IVP
2854        pushl LEN
2855        pushl KEYP
2856        pushl KLEN
2857        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2858        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2859        movl (FRAME_OFFSET+28)(%esp), INP       # src
2860        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2861        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2862        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863#else
2864        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865#endif
2866        movups (IVP), IV
2867
2868        mov 480(KEYP), KLEN
2869
2870.Lxts_enc_loop4:
2871        sub $64, LEN
2872        jl .Lxts_enc_1x
2873
2874        movdqa IV, STATE1
2875        movdqu 0x00(INP), IN
2876        pxor IN, STATE1
2877        movdqu IV, 0x00(OUTP)
2878
2879        _aesni_gf128mul_x_ble()
2880        movdqa IV, STATE2
2881        movdqu 0x10(INP), IN
2882        pxor IN, STATE2
2883        movdqu IV, 0x10(OUTP)
2884
2885        _aesni_gf128mul_x_ble()
2886        movdqa IV, STATE3
2887        movdqu 0x20(INP), IN
2888        pxor IN, STATE3
2889        movdqu IV, 0x20(OUTP)
2890
2891        _aesni_gf128mul_x_ble()
2892        movdqa IV, STATE4
2893        movdqu 0x30(INP), IN
2894        pxor IN, STATE4
2895        movdqu IV, 0x30(OUTP)
2896
2897        call _aesni_enc4
2898
2899        movdqu 0x00(OUTP), IN
2900        pxor IN, STATE1
2901        movdqu STATE1, 0x00(OUTP)
2902
2903        movdqu 0x10(OUTP), IN
2904        pxor IN, STATE2
2905        movdqu STATE2, 0x10(OUTP)
2906
2907        movdqu 0x20(OUTP), IN
2908        pxor IN, STATE3
2909        movdqu STATE3, 0x20(OUTP)
2910
2911        movdqu 0x30(OUTP), IN
2912        pxor IN, STATE4
2913        movdqu STATE4, 0x30(OUTP)
2914
2915        _aesni_gf128mul_x_ble()
2916
2917        add $64, INP
2918        add $64, OUTP
2919        test LEN, LEN
2920        jnz .Lxts_enc_loop4
2921
2922.Lxts_enc_ret_iv:
2923        movups IV, (IVP)
2924
2925.Lxts_enc_ret:
2926#ifndef __x86_64__
2927        popl KLEN
2928        popl KEYP
2929        popl LEN
2930        popl IVP
2931#endif
2932        FRAME_END
2933        RET
2934
2935.Lxts_enc_1x:
2936        add $64, LEN
2937        jz .Lxts_enc_ret_iv
2938        sub $16, LEN
2939        jl .Lxts_enc_cts4
2940
2941.Lxts_enc_loop1:
2942        movdqu (INP), STATE
2943        pxor IV, STATE
2944        call _aesni_enc1
2945        pxor IV, STATE
2946        _aesni_gf128mul_x_ble()
2947
2948        test LEN, LEN
2949        jz .Lxts_enc_out
2950
2951        add $16, INP
2952        sub $16, LEN
2953        jl .Lxts_enc_cts1
2954
2955        movdqu STATE, (OUTP)
2956        add $16, OUTP
2957        jmp .Lxts_enc_loop1
2958
2959.Lxts_enc_out:
2960        movdqu STATE, (OUTP)
2961        jmp .Lxts_enc_ret_iv
2962
2963.Lxts_enc_cts4:
2964        movdqa STATE4, STATE
2965        sub $16, OUTP
2966
2967.Lxts_enc_cts1:
2968#ifndef __x86_64__
2969        lea .Lcts_permute_table, T1
2970#else
2971        lea .Lcts_permute_table(%rip), T1
2972#endif
2973        add LEN, INP            /* rewind input pointer */
2974        add $16, LEN            /* # bytes in final block */
2975        movups (INP), IN1
2976
2977        mov T1, IVP
2978        add $32, IVP
2979        add LEN, T1
2980        sub LEN, IVP
2981        add OUTP, LEN
2982
2983        movups (T1), %xmm4
2984        movaps STATE, IN2
2985        pshufb %xmm4, STATE
2986        movups STATE, (LEN)
2987
2988        movups (IVP), %xmm0
2989        pshufb %xmm0, IN1
2990        pblendvb IN2, IN1
2991        movaps IN1, STATE
2992
2993        pxor IV, STATE
2994        call _aesni_enc1
2995        pxor IV, STATE
2996
2997        movups STATE, (OUTP)
2998        jmp .Lxts_enc_ret
2999SYM_FUNC_END(aesni_xts_encrypt)
3000
3001/*
3002 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003 *                        const u8 *src, unsigned int len, le128 *iv)
3004 */
3005SYM_FUNC_START(aesni_xts_decrypt)
3006        FRAME_BEGIN
3007#ifndef __x86_64__
3008        pushl IVP
3009        pushl LEN
3010        pushl KEYP
3011        pushl KLEN
3012        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
3013        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
3014        movl (FRAME_OFFSET+28)(%esp), INP       # src
3015        movl (FRAME_OFFSET+32)(%esp), LEN       # len
3016        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
3017        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018#else
3019        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020#endif
3021        movups (IVP), IV
3022
3023        mov 480(KEYP), KLEN
3024        add $240, KEYP
3025
3026        test $15, LEN
3027        jz .Lxts_dec_loop4
3028        sub $16, LEN
3029
3030.Lxts_dec_loop4:
3031        sub $64, LEN
3032        jl .Lxts_dec_1x
3033
3034        movdqa IV, STATE1
3035        movdqu 0x00(INP), IN
3036        pxor IN, STATE1
3037        movdqu IV, 0x00(OUTP)
3038
3039        _aesni_gf128mul_x_ble()
3040        movdqa IV, STATE2
3041        movdqu 0x10(INP), IN
3042        pxor IN, STATE2
3043        movdqu IV, 0x10(OUTP)
3044
3045        _aesni_gf128mul_x_ble()
3046        movdqa IV, STATE3
3047        movdqu 0x20(INP), IN
3048        pxor IN, STATE3
3049        movdqu IV, 0x20(OUTP)
3050
3051        _aesni_gf128mul_x_ble()
3052        movdqa IV, STATE4
3053        movdqu 0x30(INP), IN
3054        pxor IN, STATE4
3055        movdqu IV, 0x30(OUTP)
3056
3057        call _aesni_dec4
3058
3059        movdqu 0x00(OUTP), IN
3060        pxor IN, STATE1
3061        movdqu STATE1, 0x00(OUTP)
3062
3063        movdqu 0x10(OUTP), IN
3064        pxor IN, STATE2
3065        movdqu STATE2, 0x10(OUTP)
3066
3067        movdqu 0x20(OUTP), IN
3068        pxor IN, STATE3
3069        movdqu STATE3, 0x20(OUTP)
3070
3071        movdqu 0x30(OUTP), IN
3072        pxor IN, STATE4
3073        movdqu STATE4, 0x30(OUTP)
3074
3075        _aesni_gf128mul_x_ble()
3076
3077        add $64, INP
3078        add $64, OUTP
3079        test LEN, LEN
3080        jnz .Lxts_dec_loop4
3081
3082.Lxts_dec_ret_iv:
3083        movups IV, (IVP)
3084
3085.Lxts_dec_ret:
3086#ifndef __x86_64__
3087        popl KLEN
3088        popl KEYP
3089        popl LEN
3090        popl IVP
3091#endif
3092        FRAME_END
3093        RET
3094
3095.Lxts_dec_1x:
3096        add $64, LEN
3097        jz .Lxts_dec_ret_iv
3098
3099.Lxts_dec_loop1:
3100        movdqu (INP), STATE
3101
3102        add $16, INP
3103        sub $16, LEN
3104        jl .Lxts_dec_cts1
3105
3106        pxor IV, STATE
3107        call _aesni_dec1
3108        pxor IV, STATE
3109        _aesni_gf128mul_x_ble()
3110
3111        test LEN, LEN
3112        jz .Lxts_dec_out
3113
3114        movdqu STATE, (OUTP)
3115        add $16, OUTP
3116        jmp .Lxts_dec_loop1
3117
3118.Lxts_dec_out:
3119        movdqu STATE, (OUTP)
3120        jmp .Lxts_dec_ret_iv
3121
3122.Lxts_dec_cts1:
3123        movdqa IV, STATE4
3124        _aesni_gf128mul_x_ble()
3125
3126        pxor IV, STATE
3127        call _aesni_dec1
3128        pxor IV, STATE
3129
3130#ifndef __x86_64__
3131        lea .Lcts_permute_table, T1
3132#else
3133        lea .Lcts_permute_table(%rip), T1
3134#endif
3135        add LEN, INP            /* rewind input pointer */
3136        add $16, LEN            /* # bytes in final block */
3137        movups (INP), IN1
3138
3139        mov T1, IVP
3140        add $32, IVP
3141        add LEN, T1
3142        sub LEN, IVP
3143        add OUTP, LEN
3144
3145        movups (T1), %xmm4
3146        movaps STATE, IN2
3147        pshufb %xmm4, STATE
3148        movups STATE, (LEN)
3149
3150        movups (IVP), %xmm0
3151        pshufb %xmm0, IN1
3152        pblendvb IN2, IN1
3153        movaps IN1, STATE
3154
3155        pxor STATE4, STATE
3156        call _aesni_dec1
3157        pxor STATE4, STATE
3158
3159        movups STATE, (OUTP)
3160        jmp .Lxts_dec_ret
3161SYM_FUNC_END(aesni_xts_decrypt)
3162