linux/arch/x86/crypto/aesni-intel_asm.S
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Implement AES algorithm in Intel AES-NI instructions.
   4 *
   5 * The white paper of AES-NI instructions can be downloaded from:
   6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7 *
   8 * Copyright (C) 2008, Intel Corp.
   9 *    Author: Huang Ying <ying.huang@intel.com>
  10 *            Vinodh Gopal <vinodh.gopal@intel.com>
  11 *            Kahraman Akdemir
  12 *
  13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14 * interface for 64-bit kernels.
  15 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17 *             Adrian Hoban <adrian.hoban@intel.com>
  18 *             James Guilford (james.guilford@intel.com)
  19 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20 *             Tadeusz Struk (tadeusz.struk@intel.com)
  21 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22 *    Copyright (c) 2010, Intel Corporation.
  23 *
  24 * Ported x86_64 version to x86:
  25 *    Author: Mathias Krause <minipli@googlemail.com>
  26 */
  27
  28#include <linux/linkage.h>
  29#include <asm/frame.h>
  30#include <asm/nospec-branch.h>
  31
  32/*
  33 * The following macros are used to move an (un)aligned 16 byte value to/from
  34 * an XMM register.  This can be done for either FP or integer values: for FP
  35 * use movaps (move aligned packed single); for integer use movdqa (move double
  36 * quad aligned).  Which one is used makes no performance difference on any CPU
  37 * since Nehalem (the original Core i7).  However, movaps is one byte shorter,
  38 * so that is the one we'll use for now (same for the unaligned variants).
  39 */
  40#define MOVADQ  movaps
  41#define MOVUDQ  movups
  42
  43#ifdef __x86_64__
  44
  45# constants in mergeable sections, linker can reorder and merge
  46.section        .rodata.cst16.POLY, "aM", @progbits, 16
  47.align 16
  48POLY:   .octa 0xC2000000000000000000000000000001
  49.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  50.align 16
  51TWOONE: .octa 0x00000001000000000000000000000001
  52
  53.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54.align 16
  55SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  56.section        .rodata.cst16.MASK1, "aM", @progbits, 16
  57.align 16
  58MASK1:      .octa 0x0000000000000000ffffffffffffffff
  59.section        .rodata.cst16.MASK2, "aM", @progbits, 16
  60.align 16
  61MASK2:      .octa 0xffffffffffffffff0000000000000000
  62.section        .rodata.cst16.ONE, "aM", @progbits, 16
  63.align 16
  64ONE:        .octa 0x00000000000000000000000000000001
  65.section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66.align 16
  67F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  68.section        .rodata.cst16.dec, "aM", @progbits, 16
  69.align 16
  70dec:        .octa 0x1
  71.section        .rodata.cst16.enc, "aM", @progbits, 16
  72.align 16
  73enc:        .octa 0x2
  74
  75# order of these constants should not change.
  76# more specifically, ALL_F should follow SHIFT_MASK,
  77# and zero should follow ALL_F
  78.section        .rodata, "a", @progbits
  79.align 16
  80SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  81ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  82            .octa 0x00000000000000000000000000000000
  83
  84.text
  85
  86
  87#define STACK_OFFSET    8*3
  88
  89#define AadHash 16*0
  90#define AadLen 16*1
  91#define InLen (16*1)+8
  92#define PBlockEncKey 16*2
  93#define OrigIV 16*3
  94#define CurCount 16*4
  95#define PBlockLen 16*5
  96#define HashKey         16*6    // store HashKey <<1 mod poly here
  97#define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
  98#define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
  99#define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 100#define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 101                                // bits of  HashKey <<1 mod poly here
 102                                //(for Karatsuba purposes)
 103#define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 104                                // bits of  HashKey^2 <<1 mod poly here
 105                                // (for Karatsuba purposes)
 106#define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 107                                // bits of  HashKey^3 <<1 mod poly here
 108                                // (for Karatsuba purposes)
 109#define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 110                                // bits of  HashKey^4 <<1 mod poly here
 111                                // (for Karatsuba purposes)
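#
# For illustration only: the offsets above are assumed to mirror
# struct gcm_context_data in aesni-intel_glue.c, roughly:
#
#       struct gcm_context_data {
#               u8  aad_hash[16];               /* AadHash,      16*0   */
#               u64 aad_length;                 /* AadLen,       16*1   */
#               u64 in_length;                  /* InLen,        16*1+8 */
#               u8  partial_block_enc_key[16];  /* PBlockEncKey, 16*2   */
#               u8  orig_IV[16];                /* OrigIV,       16*3   */
#               u8  current_counter[16];        /* CurCount,     16*4   */
#               u64 partial_block_length;       /* PBlockLen,    16*5   */
#               u64 unused;
#               u8  hash_keys[16 * 16];         /* HashKey...,   16*6   */
#       };
#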
 112
 113#define arg1 rdi
 114#define arg2 rsi
 115#define arg3 rdx
 116#define arg4 rcx
 117#define arg5 r8
 118#define arg6 r9
 119#define arg7 STACK_OFFSET+8(%rsp)
 120#define arg8 STACK_OFFSET+16(%rsp)
 121#define arg9 STACK_OFFSET+24(%rsp)
 122#define arg10 STACK_OFFSET+32(%rsp)
 123#define arg11 STACK_OFFSET+40(%rsp)
 124#define keysize 2*15*16(%arg1)
 125#endif
 126
 127
 128#define STATE1  %xmm0
 129#define STATE2  %xmm4
 130#define STATE3  %xmm5
 131#define STATE4  %xmm6
 132#define STATE   STATE1
 133#define IN1     %xmm1
 134#define IN2     %xmm7
 135#define IN3     %xmm8
 136#define IN4     %xmm9
 137#define IN      IN1
 138#define KEY     %xmm2
 139#define IV      %xmm3
 140
 141#define BSWAP_MASK %xmm10
 142#define CTR     %xmm11
 143#define INC     %xmm12
 144
 145#define GF128MUL_MASK %xmm7
 146
 147#ifdef __x86_64__
 148#define AREG    %rax
 149#define KEYP    %rdi
 150#define OUTP    %rsi
 151#define UKEYP   OUTP
 152#define INP     %rdx
 153#define LEN     %rcx
 154#define IVP     %r8
 155#define KLEN    %r9d
 156#define T1      %r10
 157#define TKEYP   T1
 158#define T2      %r11
 159#define TCTR_LOW T2
 160#else
 161#define AREG    %eax
 162#define KEYP    %edi
 163#define OUTP    AREG
 164#define UKEYP   OUTP
 165#define INP     %edx
 166#define LEN     %esi
 167#define IVP     %ebp
 168#define KLEN    %ebx
 169#define T1      %ecx
 170#define TKEYP   T1
 171#endif
 172
 173.macro FUNC_SAVE
 174        push    %r12
 175        push    %r13
 176        push    %r14
 177#
 178# states of %xmm registers %xmm6:%xmm15 not saved
 179# all %xmm registers are clobbered
 180#
 181.endm
 182
 183
 184.macro FUNC_RESTORE
 185        pop     %r14
 186        pop     %r13
 187        pop     %r12
 188.endm
 189
 190# Precompute hashkeys.
 191# Input: Hash subkey.
 192# Output: HashKeys stored in gcm_context_data.  Only needs to be called
 193# once per key.
 194# clobbers r12, and tmp xmm registers.
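#
# Rough C-style sketch of what PRECOMPUTE stores (illustration only; the
# helper names are made up, not kernel APIs).  H is the byte-reflected hash
# subkey loaded from \SUBKEY:
#
#       H1   = gf128_mul_by_x(H);         /* HashKey   = H<<1 mod poly   */
#       H2   = gf128_mul(H1, H1);         /* HashKey_2 = H1^2            */
#       H3   = gf128_mul(H2, H1);         /* HashKey_3 = H1^3            */
#       H4   = gf128_mul(H3, H1);         /* HashKey_4 = H1^4            */
#       Hi_k = high64(Hi) ^ low64(Hi);    /* HashKey_i_k, Karatsuba term */
#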
 195.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 196        mov     \SUBKEY, %r12
 197        movdqu  (%r12), \TMP3
 198        movdqa  SHUF_MASK(%rip), \TMP2
 199        pshufb  \TMP2, \TMP3
 200
 201        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 202
 203        movdqa  \TMP3, \TMP2
 204        psllq   $1, \TMP3
 205        psrlq   $63, \TMP2
 206        movdqa  \TMP2, \TMP1
 207        pslldq  $8, \TMP2
 208        psrldq  $8, \TMP1
 209        por     \TMP2, \TMP3
 210
 211        # reduce HashKey<<1
 212
 213        pshufd  $0x24, \TMP1, \TMP2
 214        pcmpeqd TWOONE(%rip), \TMP2
 215        pand    POLY(%rip), \TMP2
 216        pxor    \TMP2, \TMP3
 217        movdqu  \TMP3, HashKey(%arg2)
 218
 219        movdqa     \TMP3, \TMP5
 220        pshufd     $78, \TMP3, \TMP1
 221        pxor       \TMP3, \TMP1
 222        movdqu     \TMP1, HashKey_k(%arg2)
 223
 224        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225# TMP5 = HashKey^2<<1 (mod poly)
 226        movdqu     \TMP5, HashKey_2(%arg2)
 227# HashKey_2 = HashKey^2<<1 (mod poly)
 228        pshufd     $78, \TMP5, \TMP1
 229        pxor       \TMP5, \TMP1
 230        movdqu     \TMP1, HashKey_2_k(%arg2)
 231
 232        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 233# TMP5 = HashKey^3<<1 (mod poly)
 234        movdqu     \TMP5, HashKey_3(%arg2)
 235        pshufd     $78, \TMP5, \TMP1
 236        pxor       \TMP5, \TMP1
 237        movdqu     \TMP1, HashKey_3_k(%arg2)
 238
 239        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 240# TMP5 = HashKey^4<<1 (mod poly)
 241        movdqu     \TMP5, HashKey_4(%arg2)
 242        pshufd     $78, \TMP5, \TMP1
 243        pxor       \TMP5, \TMP1
 244        movdqu     \TMP1, HashKey_4_k(%arg2)
 245.endm
 246
 247# GCM_INIT initializes a gcm_context_data struct to prepare for encryption/decryption.
 248# Clobbers rax, r10-r13, xmm0-xmm6 and xmm13
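#
# In C terms the initialization amounts to roughly the following (sketch
# only, not the exact glue-code API):
#
#       data->aad_length = aadlen;
#       data->in_length = 0;
#       data->partial_block_length = 0;
#       data->partial_block_enc_key = 0;
#       data->orig_IV = *iv;                     /* pre-counter block J0 */
#       data->current_counter = byteswap(*iv);   /* kept byte-reflected  */
#       precompute_hashkeys(data, subkey);       /* PRECOMPUTE           */
#       data->aad_hash = ghash(HashKey, aad, aadlen);  /* CALC_AAD_HASH  */
#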
 249.macro GCM_INIT Iv SUBKEY AAD AADLEN
 250        mov \AADLEN, %r11
 251        mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 252        xor %r11d, %r11d
 253        mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 254        mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 255        mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 256        mov \Iv, %rax
 257        movdqu (%rax), %xmm0
 258        movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 259
 260        movdqa  SHUF_MASK(%rip), %xmm2
 261        pshufb %xmm2, %xmm0
 262        movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 263
 264        PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 265        movdqu HashKey(%arg2), %xmm13
 266
 267        CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 268        %xmm4, %xmm5, %xmm6
 269.endm
 270
 271# GCM_ENC_DEC encrypts/decrypts the given data. Assumes that the passed
 272# gcm_context_data struct has been initialized by GCM_INIT.
 273# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 274# Clobbers rax, r10-r13, and xmm0-xmm15
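#
# High-level flow, as a sketch:
#
#       finish any partial block left over from a previous update call;
#       crypt 0-3 whole blocks so the remaining whole-block count is a
#           multiple of 4 (INITIAL_BLOCKS_ENC_DEC);
#       crypt the bulk 4 blocks at a time, folding the previous 4
#           ciphertext blocks into GHASH in parallel;
#       handle a trailing block of fewer than 16 bytes, saving E(K, Yn)
#           and its length so a later update/finalize can complete it.
#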
 275.macro GCM_ENC_DEC operation
 276        movdqu AadHash(%arg2), %xmm8
 277        movdqu HashKey(%arg2), %xmm13
 278        add %arg5, InLen(%arg2)
 279
 280        xor %r11d, %r11d # initialise the data pointer offset as zero
 281        PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 282
 283        sub %r11, %arg5         # sub partial block data used
 284        mov %arg5, %r13         # save the number of bytes
 285
 286        and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 287        mov %r13, %r12
 288        # Encrypt/Decrypt first few blocks
 289
 290        and     $(3<<4), %r12
 291        jz      _initial_num_blocks_is_0_\@
 292        cmp     $(2<<4), %r12
 293        jb      _initial_num_blocks_is_1_\@
 294        je      _initial_num_blocks_is_2_\@
 295_initial_num_blocks_is_3_\@:
 296        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 297%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 298        sub     $48, %r13
 299        jmp     _initial_blocks_\@
 300_initial_num_blocks_is_2_\@:
 301        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 303        sub     $32, %r13
 304        jmp     _initial_blocks_\@
 305_initial_num_blocks_is_1_\@:
 306        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 308        sub     $16, %r13
 309        jmp     _initial_blocks_\@
 310_initial_num_blocks_is_0_\@:
 311        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 313_initial_blocks_\@:
 314
 315        # Main loop - Encrypt/Decrypt remaining blocks
 316
 317        test    %r13, %r13
 318        je      _zero_cipher_left_\@
 319        sub     $64, %r13
 320        je      _four_cipher_left_\@
 321_crypt_by_4_\@:
 322        GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 323        %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 324        %xmm7, %xmm8, enc
 325        add     $64, %r11
 326        sub     $64, %r13
 327        jne     _crypt_by_4_\@
 328_four_cipher_left_\@:
 329        GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 330%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 331_zero_cipher_left_\@:
 332        movdqu %xmm8, AadHash(%arg2)
 333        movdqu %xmm0, CurCount(%arg2)
 334
 335        mov     %arg5, %r13
 336        and     $15, %r13                       # %r13 = arg5 (mod 16)
 337        je      _multiple_of_16_bytes_\@
 338
 339        mov %r13, PBlockLen(%arg2)
 340
 341        # Handle the last <16 Byte block separately
 342        paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 343        movdqu %xmm0, CurCount(%arg2)
 344        movdqa SHUF_MASK(%rip), %xmm10
 345        pshufb %xmm10, %xmm0
 346
 347        ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 348        movdqu %xmm0, PBlockEncKey(%arg2)
 349
 350        cmp     $16, %arg5
 351        jge _large_enough_update_\@
 352
 353        lea (%arg4,%r11,1), %r10
 354        mov %r13, %r12
 355        READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 356        jmp _data_read_\@
 357
 358_large_enough_update_\@:
 359        sub     $16, %r11
 360        add     %r13, %r11
 361
 362        # receive the last <16 Byte block
 363        movdqu  (%arg4, %r11, 1), %xmm1
 364
 365        sub     %r13, %r11
 366        add     $16, %r11
 367
 368        lea     SHIFT_MASK+16(%rip), %r12
 369        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 370        # (r13 is the number of bytes in plaintext mod 16)
 371        sub     %r13, %r12
 372        # get the appropriate shuffle mask
 373        movdqu  (%r12), %xmm2
 374        # shift right 16-r13 bytes
 375        pshufb  %xmm2, %xmm1
 376
 377_data_read_\@:
 378        lea ALL_F+16(%rip), %r12
 379        sub %r13, %r12
 380
 381.ifc \operation, dec
 382        movdqa  %xmm1, %xmm2
 383.endif
 384        pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 385        movdqu  (%r12), %xmm1
 386        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 387        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 388.ifc \operation, dec
 389        pand    %xmm1, %xmm2
 390        movdqa SHUF_MASK(%rip), %xmm10
 391        pshufb %xmm10 ,%xmm2
 392
 393        pxor %xmm2, %xmm8
 394.else
 395        movdqa SHUF_MASK(%rip), %xmm10
 396        pshufb %xmm10,%xmm0
 397
 398        pxor    %xmm0, %xmm8
 399.endif
 400
 401        movdqu %xmm8, AadHash(%arg2)
 402.ifc \operation, enc
 403        # GHASH computation for the last <16 byte block
 404        movdqa SHUF_MASK(%rip), %xmm10
 405        # shuffle xmm0 back to output as ciphertext
 406        pshufb %xmm10, %xmm0
 407.endif
 408
 409        # Output %r13 bytes
 410        movq %xmm0, %rax
 411        cmp $8, %r13
 412        jle _less_than_8_bytes_left_\@
 413        mov %rax, (%arg3 , %r11, 1)
 414        add $8, %r11
 415        psrldq $8, %xmm0
 416        movq %xmm0, %rax
 417        sub $8, %r13
 418_less_than_8_bytes_left_\@:
 419        mov %al,  (%arg3, %r11, 1)
 420        add $1, %r11
 421        shr $8, %rax
 422        sub $1, %r13
 423        jne _less_than_8_bytes_left_\@
 424_multiple_of_16_bytes_\@:
 425.endm
 426
 427# GCM_COMPLETE finishes the tag update for any last partial block
 428# Output: Authentication Tag (AUTH_TAG)
 429# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
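#
# Tag computation, as a formula sketch (|| is concatenation, len() is a
# 64-bit bit count):
#
#       S   = GHASH_H(A || C || len(A) || len(C))
#       TAG = E(K, Y0) XOR S            # truncated to auth_tag_len bytes
#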
 430.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 431        movdqu AadHash(%arg2), %xmm8
 432        movdqu HashKey(%arg2), %xmm13
 433
 434        mov PBlockLen(%arg2), %r12
 435
 436        test %r12, %r12
 437        je _partial_done\@
 438
 439        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 440
 441_partial_done\@:
 442        mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 443        shl     $3, %r12                  # convert into number of bits
 444        movd    %r12d, %xmm15             # len(A) in %xmm15
 445        mov InLen(%arg2), %r12
 446        shl     $3, %r12                  # len(C) in bits (*8)
 447        movq    %r12, %xmm1
 448
 449        pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 450        pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 451        pxor    %xmm15, %xmm8
 452        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 453        # final GHASH computation
 454        movdqa SHUF_MASK(%rip), %xmm10
 455        pshufb %xmm10, %xmm8
 456
 457        movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 458        ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 459        pxor    %xmm8, %xmm0
 460_return_T_\@:
 461        mov     \AUTHTAG, %r10                     # %r10 = authTag
 462        mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 463        cmp     $16, %r11
 464        je      _T_16_\@
 465        cmp     $8, %r11
 466        jl      _T_4_\@
 467_T_8_\@:
 468        movq    %xmm0, %rax
 469        mov     %rax, (%r10)
 470        add     $8, %r10
 471        sub     $8, %r11
 472        psrldq  $8, %xmm0
 473        test    %r11, %r11
 474        je      _return_T_done_\@
 475_T_4_\@:
 476        movd    %xmm0, %eax
 477        mov     %eax, (%r10)
 478        add     $4, %r10
 479        sub     $4, %r11
 480        psrldq  $4, %xmm0
 481        test    %r11, %r11
 482        je      _return_T_done_\@
 483_T_123_\@:
 484        movd    %xmm0, %eax
 485        cmp     $2, %r11
 486        jl      _T_1_\@
 487        mov     %ax, (%r10)
 488        cmp     $2, %r11
 489        je      _return_T_done_\@
 490        add     $2, %r10
 491        sar     $16, %eax
 492_T_1_\@:
 493        mov     %al, (%r10)
 494        jmp     _return_T_done_\@
 495_T_16_\@:
 496        movdqu  %xmm0, (%r10)
 497_return_T_done_\@:
 498.endm
 499
 500#ifdef __x86_64__
 501/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 502*
 503*
 504* Input: A and B (128-bits each, bit-reflected)
 505* Output: C = A*B*x mod poly, (i.e. >>1 )
 506* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 507* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 508*
 509*/
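#
# The multiply below uses one level of Karatsuba.  As a C-style sketch
# (illustration only), with the operands split into 64-bit halves
# a = a1:a0 and b = b1:b0:
#
#       hi  = clmul(a1, b1);                      /* pclmulqdq $0x11 */
#       lo  = clmul(a0, b0);                      /* pclmulqdq $0x00 */
#       mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo;  /* middle term     */
#
# The 256-bit product hi:lo ^ (mid << 64) is then reduced modulo
# x^128 + x^127 + x^126 + x^121 + 1 by the two shift/xor phases below.
#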
 510.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 511        movdqa    \GH, \TMP1
 512        pshufd    $78, \GH, \TMP2
 513        pshufd    $78, \HK, \TMP3
 514        pxor      \GH, \TMP2            # TMP2 = a1+a0
 515        pxor      \HK, \TMP3            # TMP3 = b1+b0
 516        pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 517        pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 518        pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 519        pxor      \GH, \TMP2
 520        pxor      \TMP1, \TMP2          # TMP2 = (a1*b0)+(a0*b1)
 521        movdqa    \TMP2, \TMP3
 522        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 523        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 524        pxor      \TMP3, \GH
 525        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 526
 527        # first phase of the reduction
 528
 529        movdqa    \GH, \TMP2
 530        movdqa    \GH, \TMP3
 531        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
 532                                        # in order to perform three
 533                                        # independent shifts
 534        pslld     $31, \TMP2            # packed left shift <<31
 535        pslld     $30, \TMP3            # packed left shift <<30
 536        pslld     $25, \TMP4            # packed left shift <<25
 537        pxor      \TMP3, \TMP2          # xor the shifted versions
 538        pxor      \TMP4, \TMP2
 539        movdqa    \TMP2, \TMP5
 540        psrldq    $4, \TMP5             # right shift TMP5 1 DW
 541        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 542        pxor      \TMP2, \GH
 543
 544        # second phase of the reduction
 545
 546        movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
 547                                        # in order to perform three
 548                                        # independent shifts
 549        movdqa    \GH,\TMP3
 550        movdqa    \GH,\TMP4
 551        psrld     $1,\TMP2              # packed right shift >>1
 552        psrld     $2,\TMP3              # packed right shift >>2
 553        psrld     $7,\TMP4              # packed right shift >>7
 554        pxor      \TMP3,\TMP2           # xor the shifted versions
 555        pxor      \TMP4,\TMP2
 556        pxor      \TMP5, \TMP2
 557        pxor      \TMP2, \GH
 558        pxor      \TMP1, \GH            # result is in GH
 559.endm
 560
 561# Reads DLEN bytes starting at DPTR and stores in XMMDst
 562# where 0 < DLEN < 16
 563# Clobbers %rax, DLEN and XMM1
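#
# C-style sketch of the intended result (illustration only): the DLEN
# bytes end up in the low bytes of XMMDst and the rest is zero, without
# ever reading past DPTR+DLEN:
#
#       u8 block[16] = { 0 };
#       memcpy(block, dptr, dlen);      /* 0 < dlen < 16 */
#       XMMDst = load(block);
#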
 564.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 565        cmp $8, \DLEN
 566        jl _read_lt8_\@
 567        mov (\DPTR), %rax
 568        movq %rax, \XMMDst
 569        sub $8, \DLEN
 570        jz _done_read_partial_block_\@
 571        xor %eax, %eax
 572_read_next_byte_\@:
 573        shl $8, %rax
 574        mov 7(\DPTR, \DLEN, 1), %al
 575        dec \DLEN
 576        jnz _read_next_byte_\@
 577        movq %rax, \XMM1
 578        pslldq $8, \XMM1
 579        por \XMM1, \XMMDst
 580        jmp _done_read_partial_block_\@
 581_read_lt8_\@:
 582        xor %eax, %eax
 583_read_next_byte_lt8_\@:
 584        shl $8, %rax
 585        mov -1(\DPTR, \DLEN, 1), %al
 586        dec \DLEN
 587        jnz _read_next_byte_lt8_\@
 588        movq %rax, \XMMDst
 589_done_read_partial_block_\@:
 590.endm
 591
 592# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 593# clobbers r10-11, xmm14
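#
# GHASH over the AAD, as a sketch (byteswap() models the pshufb with
# SHUF_MASK; gf128_mul() stands for the GHASH_MUL macro):
#
#       Y = 0;
#       for (each 16-byte chunk A_i of the AAD, the last one zero-padded)
#               Y = gf128_mul(Y ^ byteswap(A_i), HashKey);
#       data->aad_hash = Y;
#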
 594.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 595        TMP6 TMP7
 596        MOVADQ     SHUF_MASK(%rip), %xmm14
 597        mov        \AAD, %r10           # %r10 = AAD
 598        mov        \AADLEN, %r11                # %r11 = aadLen
 599        pxor       \TMP7, \TMP7
 600        pxor       \TMP6, \TMP6
 601
 602        cmp        $16, %r11
 603        jl         _get_AAD_rest\@
 604_get_AAD_blocks\@:
 605        movdqu     (%r10), \TMP7
 606        pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 607        pxor       \TMP7, \TMP6
 608        GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 609        add        $16, %r10
 610        sub        $16, %r11
 611        cmp        $16, %r11
 612        jge        _get_AAD_blocks\@
 613
 614        movdqu     \TMP6, \TMP7
 615
 616        /* read the last <16B of AAD */
 617_get_AAD_rest\@:
 618        test       %r11, %r11
 619        je         _get_AAD_done\@
 620
 621        READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 622        pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 623        pxor       \TMP6, \TMP7
 624        GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 625        movdqu \TMP7, \TMP6
 626
 627_get_AAD_done\@:
 628        movdqu \TMP6, AadHash(%arg2)
 629.endm
 630
 631# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 632# between update calls.
 633# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 634# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
 635# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
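#
# Sketch of the logic (illustration only; EKYn is the E(K, Yn) value saved
# in PBlockEncKey by the previous call):
#
#       used = data->partial_block_length;
#       if (!used)
#               return;                          /* nothing pending      */
#       n = min(16 - used, plain_cyph_len);
#       out[0..n) = in[0..n) ^ EKYn[used..used+n);
#       if (used + n == 16) {                    /* block now complete   */
#               aad_hash = gf128_mul(aad_hash ^ byteswap(block), HashKey);
#               data->partial_block_length = 0;
#       } else {
#               data->partial_block_length = used + n;
#       }
#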
 636.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 637        AAD_HASH operation
 638        mov     PBlockLen(%arg2), %r13
 639        test    %r13, %r13
 640        je      _partial_block_done_\@  # Leave Macro if no partial blocks
 641        # Read in input data without over reading
 642        cmp     $16, \PLAIN_CYPH_LEN
 643        jl      _fewer_than_16_bytes_\@
 644        movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 645        jmp     _data_read_\@
 646
 647_fewer_than_16_bytes_\@:
 648        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 649        mov     \PLAIN_CYPH_LEN, %r12
 650        READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 651
 652        mov PBlockLen(%arg2), %r13
 653
 654_data_read_\@:                          # Finished reading in data
 655
 656        movdqu  PBlockEncKey(%arg2), %xmm9
 657        movdqu  HashKey(%arg2), %xmm13
 658
 659        lea     SHIFT_MASK(%rip), %r12
 660
 661        # adjust the shuffle mask pointer to be able to shift r13 bytes
 662        # (r13 is the number of bytes of the partial block already used)
 663        add     %r13, %r12
 664        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 665        pshufb  %xmm2, %xmm9            # shift right r13 bytes
 666
 667.ifc \operation, dec
 668        movdqa  %xmm1, %xmm3
 669        pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 670
 671        mov     \PLAIN_CYPH_LEN, %r10
 672        add     %r13, %r10
 673        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 674        sub     $16, %r10
 675        # Determine if the partial block is not being filled and
 676        # shift the mask accordingly
 677        jge     _no_extra_mask_1_\@
 678        sub     %r10, %r12
 679_no_extra_mask_1_\@:
 680
 681        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 682        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 683        pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
 684
 685        pand    %xmm1, %xmm3
 686        movdqa  SHUF_MASK(%rip), %xmm10
 687        pshufb  %xmm10, %xmm3
 688        pshufb  %xmm2, %xmm3
 689        pxor    %xmm3, \AAD_HASH
 690
 691        test    %r10, %r10
 692        jl      _partial_incomplete_1_\@
 693
 694        # GHASH computation for the last <16 Byte block
 695        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 696        xor     %eax, %eax
 697
 698        mov     %rax, PBlockLen(%arg2)
 699        jmp     _dec_done_\@
 700_partial_incomplete_1_\@:
 701        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 702_dec_done_\@:
 703        movdqu  \AAD_HASH, AadHash(%arg2)
 704.else
 705        pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 706
 707        mov     \PLAIN_CYPH_LEN, %r10
 708        add     %r13, %r10
 709        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 710        sub     $16, %r10
 711        # Determine if the partial block is not being filled and
 712        # shift the mask accordingly
 713        jge     _no_extra_mask_2_\@
 714        sub     %r10, %r12
 715_no_extra_mask_2_\@:
 716
 717        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 718        # get the appropriate mask to mask out bottom r13 bytes of xmm9
 719        pand    %xmm1, %xmm9
 720
 721        movdqa  SHUF_MASK(%rip), %xmm1
 722        pshufb  %xmm1, %xmm9
 723        pshufb  %xmm2, %xmm9
 724        pxor    %xmm9, \AAD_HASH
 725
 726        test    %r10, %r10
 727        jl      _partial_incomplete_2_\@
 728
 729        # GHASH computation for the last <16 Byte block
 730        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 731        xor     %eax, %eax
 732
 733        mov     %rax, PBlockLen(%arg2)
 734        jmp     _encode_done_\@
 735_partial_incomplete_2_\@:
 736        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 737_encode_done_\@:
 738        movdqu  \AAD_HASH, AadHash(%arg2)
 739
 740        movdqa  SHUF_MASK(%rip), %xmm10
 741        # shuffle xmm9 back to output as ciphertext
 742        pshufb  %xmm10, %xmm9
 743        pshufb  %xmm2, %xmm9
 744.endif
 745        # output encrypted Bytes
 746        test    %r10, %r10
 747        jl      _partial_fill_\@
 748        mov     %r13, %r12
 749        mov     $16, %r13
 750        # Set r13 to be the number of bytes to write out
 751        sub     %r12, %r13
 752        jmp     _count_set_\@
 753_partial_fill_\@:
 754        mov     \PLAIN_CYPH_LEN, %r13
 755_count_set_\@:
 756        movdqa  %xmm9, %xmm0
 757        movq    %xmm0, %rax
 758        cmp     $8, %r13
 759        jle     _less_than_8_bytes_left_\@
 760
 761        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 762        add     $8, \DATA_OFFSET
 763        psrldq  $8, %xmm0
 764        movq    %xmm0, %rax
 765        sub     $8, %r13
 766_less_than_8_bytes_left_\@:
 767        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 768        add     $1, \DATA_OFFSET
 769        shr     $8, %rax
 770        sub     $1, %r13
 771        jne     _less_than_8_bytes_left_\@
 772_partial_block_done_\@:
 773.endm # PARTIAL_BLOCK
 774
 775/*
 776* if a = number of total plaintext bytes
 777* b = floor(a/16)
 778* num_initial_blocks = b mod 4
 779* encrypt the initial num_initial_blocks blocks and apply ghash on
 780* the ciphertext
 781* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 782* are clobbered
 783* %arg1, %arg2, %arg3 and %arg4 are used as pointers only, not modified
 784*/
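#
# Calling convention note (derived from the call sites in GCM_ENC_DEC):
# \i is 8 - num_initial_blocks and \i_seq lists the xmm register numbers
# holding those blocks, e.g. i=5/i_seq=678 crypts three blocks in
# %xmm6-%xmm8, while i=8/i_seq=0 crypts none.  Per block the work is
# roughly:
#
#       Y = ++counter;
#       C_j = AES_K(byteswap(Y)) ^ P_j;             /* CTR encryption   */
#       hash = gf128_mul(hash ^ byteswap(C_j), H);  /* folded in below  */
#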
 785
 786
 787.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 788        XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 789        MOVADQ          SHUF_MASK(%rip), %xmm14
 790
 791        movdqu AadHash(%arg2), %xmm\i               # load the current AAD hash
 792
 793        # start AES for num_initial_blocks blocks
 794
 795        movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 796
 797.if (\i == 5) || (\i == 6) || (\i == 7)
 798
 799        MOVADQ          ONE(%RIP),\TMP1
 800        MOVADQ          0(%arg1),\TMP2
 801.irpc index, \i_seq
 802        paddd           \TMP1, \XMM0                 # INCR Y0
 803.ifc \operation, dec
 804        movdqa     \XMM0, %xmm\index
 805.else
 806        MOVADQ          \XMM0, %xmm\index
 807.endif
 808        pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 809        pxor            \TMP2, %xmm\index
 810.endr
 811        lea     0x10(%arg1),%r10
 812        mov     keysize,%eax
 813        shr     $2,%eax                         # 128->4, 192->6, 256->8
 814        add     $5,%eax                       # 128->9, 192->11, 256->13
 815
 816aes_loop_initial_\@:
 817        MOVADQ  (%r10),\TMP1
 818.irpc   index, \i_seq
 819        aesenc  \TMP1, %xmm\index
 820.endr
 821        add     $16,%r10
 822        sub     $1,%eax
 823        jnz     aes_loop_initial_\@
 824
 825        MOVADQ  (%r10), \TMP1
 826.irpc index, \i_seq
 827        aesenclast \TMP1, %xmm\index         # Last Round
 828.endr
 829.irpc index, \i_seq
 830        movdqu     (%arg4 , %r11, 1), \TMP1
 831        pxor       \TMP1, %xmm\index
 832        movdqu     %xmm\index, (%arg3 , %r11, 1)
 833        # write back plaintext/ciphertext for num_initial_blocks
 834        add        $16, %r11
 835
 836.ifc \operation, dec
 837        movdqa     \TMP1, %xmm\index
 838.endif
 839        pshufb     %xmm14, %xmm\index
 840
 841                # prepare plaintext/ciphertext for GHASH computation
 842.endr
 843.endif
 844
 845        # apply GHASH on num_initial_blocks blocks
 846
 847.if \i == 5
 848        pxor       %xmm5, %xmm6
 849        GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 850        pxor       %xmm6, %xmm7
 851        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 852        pxor       %xmm7, %xmm8
 853        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854.elseif \i == 6
 855        pxor       %xmm6, %xmm7
 856        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857        pxor       %xmm7, %xmm8
 858        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859.elseif \i == 7
 860        pxor       %xmm7, %xmm8
 861        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862.endif
 863        cmp        $64, %r13
 864        jl      _initial_blocks_done\@
 865        # no need for precomputed values
 866/*
 867*
 868* Precomputations for HashKey parallel with encryption of first 4 blocks.
 869* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
 870*/
 871        MOVADQ     ONE(%RIP),\TMP1
 872        paddd      \TMP1, \XMM0              # INCR Y0
 873        MOVADQ     \XMM0, \XMM1
 874        pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 875
 876        paddd      \TMP1, \XMM0              # INCR Y0
 877        MOVADQ     \XMM0, \XMM2
 878        pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 879
 880        paddd      \TMP1, \XMM0              # INCR Y0
 881        MOVADQ     \XMM0, \XMM3
 882        pshufb %xmm14, \XMM3        # perform a 16 byte swap
 883
 884        paddd      \TMP1, \XMM0              # INCR Y0
 885        MOVADQ     \XMM0, \XMM4
 886        pshufb %xmm14, \XMM4        # perform a 16 byte swap
 887
 888        MOVADQ     0(%arg1),\TMP1
 889        pxor       \TMP1, \XMM1
 890        pxor       \TMP1, \XMM2
 891        pxor       \TMP1, \XMM3
 892        pxor       \TMP1, \XMM4
 893.irpc index, 1234 # do 4 rounds
 894        movaps 0x10*\index(%arg1), \TMP1
 895        aesenc     \TMP1, \XMM1
 896        aesenc     \TMP1, \XMM2
 897        aesenc     \TMP1, \XMM3
 898        aesenc     \TMP1, \XMM4
 899.endr
 900.irpc index, 56789 # do next 5 rounds
 901        movaps 0x10*\index(%arg1), \TMP1
 902        aesenc     \TMP1, \XMM1
 903        aesenc     \TMP1, \XMM2
 904        aesenc     \TMP1, \XMM3
 905        aesenc     \TMP1, \XMM4
 906.endr
 907        lea        0xa0(%arg1),%r10
 908        mov        keysize,%eax
 909        shr        $2,%eax                      # 128->4, 192->6, 256->8
 910        sub        $4,%eax                      # 128->0, 192->2, 256->4
 911        jz         aes_loop_pre_done\@
 912
 913aes_loop_pre_\@:
 914        MOVADQ     (%r10),\TMP2
 915.irpc   index, 1234
 916        aesenc     \TMP2, %xmm\index
 917.endr
 918        add        $16,%r10
 919        sub        $1,%eax
 920        jnz        aes_loop_pre_\@
 921
 922aes_loop_pre_done\@:
 923        MOVADQ     (%r10), \TMP2
 924        aesenclast \TMP2, \XMM1
 925        aesenclast \TMP2, \XMM2
 926        aesenclast \TMP2, \XMM3
 927        aesenclast \TMP2, \XMM4
 928        movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 929        pxor       \TMP1, \XMM1
 930.ifc \operation, dec
 931        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 932        movdqa     \TMP1, \XMM1
 933.endif
 934        movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 935        pxor       \TMP1, \XMM2
 936.ifc \operation, dec
 937        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 938        movdqa     \TMP1, \XMM2
 939.endif
 940        movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 941        pxor       \TMP1, \XMM3
 942.ifc \operation, dec
 943        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 944        movdqa     \TMP1, \XMM3
 945.endif
 946        movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 947        pxor       \TMP1, \XMM4
 948.ifc \operation, dec
 949        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 950        movdqa     \TMP1, \XMM4
 951.else
 952        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 953        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 954        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 955        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 956.endif
 957
 958        add        $64, %r11
 959        pshufb %xmm14, \XMM1 # perform a 16 byte swap
 960        pxor       \XMMDst, \XMM1
 961# combine GHASHed value with the corresponding ciphertext
 962        pshufb %xmm14, \XMM2 # perform a 16 byte swap
 963        pshufb %xmm14, \XMM3 # perform a 16 byte swap
 964        pshufb %xmm14, \XMM4 # perform a 16 byte swap
 965
 966_initial_blocks_done\@:
 967
 968.endm
 969
 970/*
 971* encrypt 4 blocks at a time
 972* ghash the 4 previously encrypted ciphertext blocks
 973* arg1, %arg3, %arg4 are used as pointers only, not modified
 974* %r11 is the data offset value
 975*/
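#
# The AES rounds for the next 4 counter blocks are interleaved with the
# GHASH of the previous 4 ciphertext blocks to hide the pclmulqdq latency.
# As a formula sketch, with X the running hash and C1..C4 the previous
# ciphertext blocks (C1 arrives already XORed with X):
#
#       X' = C1*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H        (mod the GHASH poly)
#
# Each product uses Karatsuba via the HashKey_i/HashKey_i_k tables and the
# reduction is deferred to the end of the macro.
#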
 976.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 977TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 978
 979        movdqa    \XMM1, \XMM5
 980        movdqa    \XMM2, \XMM6
 981        movdqa    \XMM3, \XMM7
 982        movdqa    \XMM4, \XMM8
 983
 984        movdqa    SHUF_MASK(%rip), %xmm15
 985        # multiply TMP5 * HashKey using karatsuba
 986
 987        movdqa    \XMM5, \TMP4
 988        pshufd    $78, \XMM5, \TMP6
 989        pxor      \XMM5, \TMP6
 990        paddd     ONE(%rip), \XMM0              # INCR CNT
 991        movdqu    HashKey_4(%arg2), \TMP5
 992        pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 993        movdqa    \XMM0, \XMM1
 994        paddd     ONE(%rip), \XMM0              # INCR CNT
 995        movdqa    \XMM0, \XMM2
 996        paddd     ONE(%rip), \XMM0              # INCR CNT
 997        movdqa    \XMM0, \XMM3
 998        paddd     ONE(%rip), \XMM0              # INCR CNT
 999        movdqa    \XMM0, \XMM4
1000        pshufb %xmm15, \XMM1    # perform a 16 byte swap
1001        pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1003        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1004        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1005
1006        pxor      (%arg1), \XMM1
1007        pxor      (%arg1), \XMM2
1008        pxor      (%arg1), \XMM3
1009        pxor      (%arg1), \XMM4
1010        movdqu    HashKey_4_k(%arg2), \TMP5
1011        pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012        movaps 0x10(%arg1), \TMP1
1013        aesenc    \TMP1, \XMM1              # Round 1
1014        aesenc    \TMP1, \XMM2
1015        aesenc    \TMP1, \XMM3
1016        aesenc    \TMP1, \XMM4
1017        movaps 0x20(%arg1), \TMP1
1018        aesenc    \TMP1, \XMM1              # Round 2
1019        aesenc    \TMP1, \XMM2
1020        aesenc    \TMP1, \XMM3
1021        aesenc    \TMP1, \XMM4
1022        movdqa    \XMM6, \TMP1
1023        pshufd    $78, \XMM6, \TMP2
1024        pxor      \XMM6, \TMP2
1025        movdqu    HashKey_3(%arg2), \TMP5
1026        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027        movaps 0x30(%arg1), \TMP3
1028        aesenc    \TMP3, \XMM1              # Round 3
1029        aesenc    \TMP3, \XMM2
1030        aesenc    \TMP3, \XMM3
1031        aesenc    \TMP3, \XMM4
1032        pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033        movaps 0x40(%arg1), \TMP3
1034        aesenc    \TMP3, \XMM1              # Round 4
1035        aesenc    \TMP3, \XMM2
1036        aesenc    \TMP3, \XMM3
1037        aesenc    \TMP3, \XMM4
1038        movdqu    HashKey_3_k(%arg2), \TMP5
1039        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040        movaps 0x50(%arg1), \TMP3
1041        aesenc    \TMP3, \XMM1              # Round 5
1042        aesenc    \TMP3, \XMM2
1043        aesenc    \TMP3, \XMM3
1044        aesenc    \TMP3, \XMM4
1045        pxor      \TMP1, \TMP4
1046# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047        pxor      \XMM6, \XMM5
1048        pxor      \TMP2, \TMP6
1049        movdqa    \XMM7, \TMP1
1050        pshufd    $78, \XMM7, \TMP2
1051        pxor      \XMM7, \TMP2
1052        movdqu    HashKey_2(%arg2), \TMP5
1053
1054        # Multiply TMP5 * HashKey using karatsuba
1055
1056        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057        movaps 0x60(%arg1), \TMP3
1058        aesenc    \TMP3, \XMM1              # Round 6
1059        aesenc    \TMP3, \XMM2
1060        aesenc    \TMP3, \XMM3
1061        aesenc    \TMP3, \XMM4
1062        pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063        movaps 0x70(%arg1), \TMP3
1064        aesenc    \TMP3, \XMM1              # Round 7
1065        aesenc    \TMP3, \XMM2
1066        aesenc    \TMP3, \XMM3
1067        aesenc    \TMP3, \XMM4
1068        movdqu    HashKey_2_k(%arg2), \TMP5
1069        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070        movaps 0x80(%arg1), \TMP3
1071        aesenc    \TMP3, \XMM1              # Round 8
1072        aesenc    \TMP3, \XMM2
1073        aesenc    \TMP3, \XMM3
1074        aesenc    \TMP3, \XMM4
1075        pxor      \TMP1, \TMP4
1076# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077        pxor      \XMM7, \XMM5
1078        pxor      \TMP2, \TMP6
1079
1080        # Multiply XMM8 * HashKey
1081        # XMM8 and TMP5 hold the values for the two operands
1082
1083        movdqa    \XMM8, \TMP1
1084        pshufd    $78, \XMM8, \TMP2
1085        pxor      \XMM8, \TMP2
1086        movdqu    HashKey(%arg2), \TMP5
1087        pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088        movaps 0x90(%arg1), \TMP3
1089        aesenc    \TMP3, \XMM1             # Round 9
1090        aesenc    \TMP3, \XMM2
1091        aesenc    \TMP3, \XMM3
1092        aesenc    \TMP3, \XMM4
1093        pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094        lea       0xa0(%arg1),%r10
1095        mov       keysize,%eax
1096        shr       $2,%eax                       # 128->4, 192->6, 256->8
1097        sub       $4,%eax                       # 128->0, 192->2, 256->4
1098        jz        aes_loop_par_enc_done\@
1099
1100aes_loop_par_enc\@:
1101        MOVADQ    (%r10),\TMP3
1102.irpc   index, 1234
1103        aesenc    \TMP3, %xmm\index
1104.endr
1105        add       $16,%r10
1106        sub       $1,%eax
1107        jnz       aes_loop_par_enc\@
1108
1109aes_loop_par_enc_done\@:
1110        MOVADQ    (%r10), \TMP3
1111        aesenclast \TMP3, \XMM1           # last round
1112        aesenclast \TMP3, \XMM2
1113        aesenclast \TMP3, \XMM3
1114        aesenclast \TMP3, \XMM4
1115        movdqu    HashKey_k(%arg2), \TMP5
1116        pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117        movdqu    (%arg4,%r11,1), \TMP3
1118        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119        movdqu    16(%arg4,%r11,1), \TMP3
1120        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121        movdqu    32(%arg4,%r11,1), \TMP3
1122        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123        movdqu    48(%arg4,%r11,1), \TMP3
1124        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129        pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1131        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1132        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1133
1134        pxor      \TMP4, \TMP1
1135        pxor      \XMM8, \XMM5
1136        pxor      \TMP6, \TMP2
1137        pxor      \TMP1, \TMP2
1138        pxor      \XMM5, \TMP2
1139        movdqa    \TMP2, \TMP3
1140        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1141        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1142        pxor      \TMP3, \XMM5
1143        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1144
1145        # first phase of reduction
1146
1147        movdqa    \XMM5, \TMP2
1148        movdqa    \XMM5, \TMP3
1149        movdqa    \XMM5, \TMP4
1150# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151        pslld     $31, \TMP2                   # packed left shift << 31
1152        pslld     $30, \TMP3                   # packed left shift << 30
1153        pslld     $25, \TMP4                   # packed left shift << 25
1154        pxor      \TMP3, \TMP2                 # xor the shifted versions
1155        pxor      \TMP4, \TMP2
1156        movdqa    \TMP2, \TMP5
1157        psrldq    $4, \TMP5                    # right shift T5 1 DW
1158        pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159        pxor      \TMP2, \XMM5
1160
1161        # second phase of reduction
1162
1163        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164        movdqa    \XMM5,\TMP3
1165        movdqa    \XMM5,\TMP4
1166        psrld     $1, \TMP2                    # packed right shift >>1
1167        psrld     $2, \TMP3                    # packed right shift >>2
1168        psrld     $7, \TMP4                    # packed right shift >>7
1169        pxor      \TMP3,\TMP2                  # xor the shifted versions
1170        pxor      \TMP4,\TMP2
1171        pxor      \TMP5, \TMP2
1172        pxor      \TMP2, \XMM5
1173        pxor      \TMP1, \XMM5                 # result is in XMM5
1174
1175        pxor      \XMM5, \XMM1
1176.endm
1177
1178/*
1179* decrypt 4 blocks at a time
1180* ghash the 4 previously decrypted ciphertext blocks
1181* arg1, %arg3, %arg4 are used as pointers only, not modified
1182* %r11 is the data offset value
1183*/
1184.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186
1187        movdqa    \XMM1, \XMM5
1188        movdqa    \XMM2, \XMM6
1189        movdqa    \XMM3, \XMM7
1190        movdqa    \XMM4, \XMM8
1191
1192        movdqa    SHUF_MASK(%rip), %xmm15
1193        # multiply TMP5 * HashKey using karatsuba
1194
1195        movdqa    \XMM5, \TMP4
1196        pshufd    $78, \XMM5, \TMP6
1197        pxor      \XMM5, \TMP6
1198        paddd     ONE(%rip), \XMM0              # INCR CNT
1199        movdqu    HashKey_4(%arg2), \TMP5
1200        pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201        movdqa    \XMM0, \XMM1
1202        paddd     ONE(%rip), \XMM0              # INCR CNT
1203        movdqa    \XMM0, \XMM2
1204        paddd     ONE(%rip), \XMM0              # INCR CNT
1205        movdqa    \XMM0, \XMM3
1206        paddd     ONE(%rip), \XMM0              # INCR CNT
1207        movdqa    \XMM0, \XMM4
1208        pshufb %xmm15, \XMM1    # perform a 16 byte swap
1209        pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1211        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1212        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1213
1214        pxor      (%arg1), \XMM1
1215        pxor      (%arg1), \XMM2
1216        pxor      (%arg1), \XMM3
1217        pxor      (%arg1), \XMM4
1218        movdqu    HashKey_4_k(%arg2), \TMP5
1219        pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220        movaps 0x10(%arg1), \TMP1
1221        aesenc    \TMP1, \XMM1              # Round 1
1222        aesenc    \TMP1, \XMM2
1223        aesenc    \TMP1, \XMM3
1224        aesenc    \TMP1, \XMM4
1225        movaps 0x20(%arg1), \TMP1
1226        aesenc    \TMP1, \XMM1              # Round 2
1227        aesenc    \TMP1, \XMM2
1228        aesenc    \TMP1, \XMM3
1229        aesenc    \TMP1, \XMM4
1230        movdqa    \XMM6, \TMP1
1231        pshufd    $78, \XMM6, \TMP2
1232        pxor      \XMM6, \TMP2
1233        movdqu    HashKey_3(%arg2), \TMP5
1234        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235        movaps 0x30(%arg1), \TMP3
1236        aesenc    \TMP3, \XMM1              # Round 3
1237        aesenc    \TMP3, \XMM2
1238        aesenc    \TMP3, \XMM3
1239        aesenc    \TMP3, \XMM4
1240        pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241        movaps 0x40(%arg1), \TMP3
1242        aesenc    \TMP3, \XMM1              # Round 4
1243        aesenc    \TMP3, \XMM2
1244        aesenc    \TMP3, \XMM3
1245        aesenc    \TMP3, \XMM4
1246        movdqu    HashKey_3_k(%arg2), \TMP5
1247        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248        movaps 0x50(%arg1), \TMP3
1249        aesenc    \TMP3, \XMM1              # Round 5
1250        aesenc    \TMP3, \XMM2
1251        aesenc    \TMP3, \XMM3
1252        aesenc    \TMP3, \XMM4
1253        pxor      \TMP1, \TMP4
1254# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255        pxor      \XMM6, \XMM5
1256        pxor      \TMP2, \TMP6
1257        movdqa    \XMM7, \TMP1
1258        pshufd    $78, \XMM7, \TMP2
1259        pxor      \XMM7, \TMP2
1260        movdqu    HashKey_2(%arg2), \TMP5
1261
1262        # Multiply TMP5 * HashKey using karatsuba
1263
1264        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265        movaps 0x60(%arg1), \TMP3
1266        aesenc    \TMP3, \XMM1              # Round 6
1267        aesenc    \TMP3, \XMM2
1268        aesenc    \TMP3, \XMM3
1269        aesenc    \TMP3, \XMM4
1270        pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271        movaps 0x70(%arg1), \TMP3
1272        aesenc    \TMP3, \XMM1              # Round 7
1273        aesenc    \TMP3, \XMM2
1274        aesenc    \TMP3, \XMM3
1275        aesenc    \TMP3, \XMM4
1276        movdqu    HashKey_2_k(%arg2), \TMP5
1277        pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278        movaps 0x80(%arg1), \TMP3
1279        aesenc    \TMP3, \XMM1              # Round 8
1280        aesenc    \TMP3, \XMM2
1281        aesenc    \TMP3, \XMM3
1282        aesenc    \TMP3, \XMM4
1283        pxor      \TMP1, \TMP4
1284# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285        pxor      \XMM7, \XMM5
1286        pxor      \TMP2, \TMP6
1287
1288        # Multiply XMM8 * HashKey
1289        # XMM8 and TMP5 hold the values for the two operands
1290
1291        movdqa    \XMM8, \TMP1
1292        pshufd    $78, \XMM8, \TMP2
1293        pxor      \XMM8, \TMP2
1294        movdqu    HashKey(%arg2), \TMP5
1295        pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296        movaps 0x90(%arg1), \TMP3
1297        aesenc    \TMP3, \XMM1             # Round 9
1298        aesenc    \TMP3, \XMM2
1299        aesenc    \TMP3, \XMM3
1300        aesenc    \TMP3, \XMM4
1301        pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302        lea       0xa0(%arg1),%r10
1303        mov       keysize,%eax
1304        shr       $2,%eax                       # 128->4, 192->6, 256->8
1305        sub       $4,%eax                       # 128->0, 192->2, 256->4
1306        jz        aes_loop_par_dec_done\@
1307
1308aes_loop_par_dec\@:
1309        MOVADQ    (%r10),\TMP3
1310.irpc   index, 1234
1311        aesenc    \TMP3, %xmm\index
1312.endr
1313        add       $16,%r10
1314        sub       $1,%eax
1315        jnz       aes_loop_par_dec\@
1316
1317aes_loop_par_dec_done\@:
1318        MOVADQ    (%r10), \TMP3
1319        aesenclast \TMP3, \XMM1           # last round
1320        aesenclast \TMP3, \XMM2
1321        aesenclast \TMP3, \XMM3
1322        aesenclast \TMP3, \XMM4
1323        movdqu    HashKey_k(%arg2), \TMP5
1324        pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325        movdqu    (%arg4,%r11,1), \TMP3
1326        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327        movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328        movdqa    \TMP3, \XMM1
1329        movdqu    16(%arg4,%r11,1), \TMP3
1330        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332        movdqa    \TMP3, \XMM2
1333        movdqu    32(%arg4,%r11,1), \TMP3
1334        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336        movdqa    \TMP3, \XMM3
1337        movdqu    48(%arg4,%r11,1), \TMP3
1338        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340        movdqa    \TMP3, \XMM4
1341        pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342        pshufb %xmm15, \XMM2    # perform a 16 byte swap
1343        pshufb %xmm15, \XMM3    # perform a 16 byte swap
1344        pshufb %xmm15, \XMM4    # perform a 16 byte swap
1345
1346        pxor      \TMP4, \TMP1
1347        pxor      \XMM8, \XMM5
1348        pxor      \TMP6, \TMP2
1349        pxor      \TMP1, \TMP2
1350        pxor      \XMM5, \TMP2
1351        movdqa    \TMP2, \TMP3
1352        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1353        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1354        pxor      \TMP3, \XMM5
1355        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1356
1357        # first phase of reduction
1358
1359        movdqa    \XMM5, \TMP2
1360        movdqa    \XMM5, \TMP3
1361        movdqa    \XMM5, \TMP4
1362# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363        pslld     $31, \TMP2                   # packed left shift << 31
1364        pslld     $30, \TMP3                   # packed left shift << 30
1365        pslld     $25, \TMP4                   # packed left shift << 25
1366        pxor      \TMP3, \TMP2                 # xor the shifted versions
1367        pxor      \TMP4, \TMP2
1368        movdqa    \TMP2, \TMP5
1369        psrldq    $4, \TMP5                    # right shift T5 1 DW
1370        pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371        pxor      \TMP2, \XMM5
1372
1373        # second phase of reduction
1374
1375        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376        movdqa    \XMM5,\TMP3
1377        movdqa    \XMM5,\TMP4
1378        psrld     $1, \TMP2                    # packed right shift >>1
1379        psrld     $2, \TMP3                    # packed right shift >>2
1380        psrld     $7, \TMP4                    # packed right shift >>7
1381        pxor      \TMP3,\TMP2                  # xor the shifted versions
1382        pxor      \TMP4,\TMP2
1383        pxor      \TMP5, \TMP2
1384        pxor      \TMP2, \XMM5
1385        pxor      \TMP1, \XMM5                 # result is in XMM5
1386
1387        pxor      \XMM5, \XMM1
1388.endm
1389
1390/* GHASH the last 4 ciphertext blocks. */
1391.macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393
1394        # Multiply TMP6 * HashKey (using Karatsuba)
1395
1396        movdqa    \XMM1, \TMP6
1397        pshufd    $78, \XMM1, \TMP2
1398        pxor      \XMM1, \TMP2
1399        movdqu    HashKey_4(%arg2), \TMP5
1400        pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401        pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402        movdqu    HashKey_4_k(%arg2), \TMP4
1403        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404        movdqa    \XMM1, \XMMDst
1405        movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1406
1407        # Multiply TMP1 * HashKey (using Karatsuba)
1408
1409        movdqa    \XMM2, \TMP1
1410        pshufd    $78, \XMM2, \TMP2
1411        pxor      \XMM2, \TMP2
1412        movdqu    HashKey_3(%arg2), \TMP5
1413        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414        pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415        movdqu    HashKey_3_k(%arg2), \TMP4
1416        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417        pxor      \TMP1, \TMP6
1418        pxor      \XMM2, \XMMDst
1419        pxor      \TMP2, \XMM1
1420# results accumulated in TMP6, XMMDst, XMM1
1421
1422        # Multiply TMP1 * HashKey (using Karatsuba)
1423
1424        movdqa    \XMM3, \TMP1
1425        pshufd    $78, \XMM3, \TMP2
1426        pxor      \XMM3, \TMP2
1427        movdqu    HashKey_2(%arg2), \TMP5
1428        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429        pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430        movdqu    HashKey_2_k(%arg2), \TMP4
1431        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432        pxor      \TMP1, \TMP6
1433        pxor      \XMM3, \XMMDst
1434        pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436        # Multiply TMP1 * HashKey (using Karatsuba)
1437        movdqa    \XMM4, \TMP1
1438        pshufd    $78, \XMM4, \TMP2
1439        pxor      \XMM4, \TMP2
1440        movdqu    HashKey(%arg2), \TMP5
1441        pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1442        pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443        movdqu    HashKey_k(%arg2), \TMP4
1444        pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445        pxor      \TMP1, \TMP6
1446        pxor      \XMM4, \XMMDst
1447        pxor      \XMM1, \TMP2
1448        pxor      \TMP6, \TMP2
1449        pxor      \XMMDst, \TMP2
1450        # middle section of the temp results combined as in the Karatsuba algorithm
1451        movdqa    \TMP2, \TMP4
1452        pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1453        psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1454        pxor      \TMP4, \XMMDst
1455        pxor      \TMP2, \TMP6
1456# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457        # first phase of the reduction
1458        movdqa    \XMMDst, \TMP2
1459        movdqa    \XMMDst, \TMP3
1460        movdqa    \XMMDst, \TMP4
1461# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462        pslld     $31, \TMP2                # packed left shifting << 31
1463        pslld     $30, \TMP3                # packed left shifting << 30
1464        pslld     $25, \TMP4                # packed left shifting << 25
1465        pxor      \TMP3, \TMP2              # xor the shifted versions
1466        pxor      \TMP4, \TMP2
1467        movdqa    \TMP2, \TMP7
1468        psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469        pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470        pxor      \TMP2, \XMMDst
1471
1472        # second phase of the reduction
1473        movdqa    \XMMDst, \TMP2
1474        # make 3 copies of XMMDst for doing 3 shift operations
1475        movdqa    \XMMDst, \TMP3
1476        movdqa    \XMMDst, \TMP4
1477        psrld     $1, \TMP2                 # packed right shift >> 1
1478        psrld     $2, \TMP3                 # packed right shift >> 2
1479        psrld     $7, \TMP4                 # packed right shift >> 7
1480        pxor      \TMP3, \TMP2              # xor the shifted versions
1481        pxor      \TMP4, \TMP2
1482        pxor      \TMP7, \TMP2
1483        pxor      \TMP2, \XMMDst
1484        pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485.endm
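
/*
 * What GHASH_LAST_4 computes, written as a small C sketch for reference.
 * gf128_mul() is a hypothetical helper (carry-less multiply followed by
 * reduction modulo the GHASH polynomial); it is not defined in this file,
 * and the u128 typedef relies on the GCC/Clang __int128 extension.
 *
 *      typedef unsigned __int128 u128;
 *
 *      u128 gf128_mul(u128 a, u128 b);         // hypothetical helper
 *
 *      u128 ghash_last_4(u128 x1, u128 x2, u128 x3, u128 x4,
 *                        u128 h, u128 h2, u128 h3, u128 h4)
 *      {
 *              return gf128_mul(x1, h4) ^ gf128_mul(x2, h3) ^
 *                     gf128_mul(x3, h2) ^ gf128_mul(x4, h);
 *      }
 *
 * i.e. the oldest accumulated block is multiplied by the highest stored
 * power of the hash key (HashKey_4 = H^4) and the newest by H itself; the
 * macro merely defers the reduction until all four products are folded.
 */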
1486
1487
1488/* Encryption of a single block
1489* uses eax & r10
1490*/
1491
1492.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494        pxor            (%arg1), \XMM0
1495        mov             keysize,%eax
1496        shr             $2,%eax                 # 128->4, 192->6, 256->8
1497        add             $5,%eax                 # 128->9, 192->11, 256->13
1498        lea             16(%arg1), %r10   # get first expanded key address
1499
1500_esb_loop_\@:
1501        MOVADQ          (%r10),\TMP1
1502        aesenc          \TMP1,\XMM0
1503        add             $16,%r10
1504        sub             $1,%eax
1505        jnz             _esb_loop_\@
1506
1507        MOVADQ          (%r10),\TMP1
1508        aesenclast      \TMP1,\XMM0
1509.endm
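
/*
 * The shr/add pair above turns the key length stored in the schedule into
 * the number of aesenc rounds to run before aesenclast.  A C sketch of the
 * same computation (keysize is in bytes):
 *
 *      static int aesenc_rounds(unsigned int keysize)
 *      {
 *              return (keysize >> 2) + 5;      // 16 -> 9, 24 -> 11, 32 -> 13
 *      }
 *
 * which gives 10/12/14 total rounds for AES-128/192/256 once the final
 * aesenclast is counted.
 */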
1510/*****************************************************************************
1511* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512*                   struct gcm_context_data *data
1513*                                      // Context data
1514*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
1515*                   const u8 *in,      // Ciphertext input
1516*                   u64 plaintext_len, // Length of data in bytes for decryption.
1517*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521*                   const u8 *aad,     // Additional Authentication Data (AAD)
1522*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524*                                      // given authentication tag and only return the plaintext if they match.
1525*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526*                                      // (most likely), 12 or 8.
1527*
1528* Assumptions:
1529*
1530* keys:
1531*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532*       set of 11 keys in the data structure void *aes_ctx
1533*
1534* iv:
1535*       0                   1                   2                   3
1536*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538*       |                             Salt  (From the SA)               |
1539*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540*       |                     Initialization Vector                     |
1541*       |         (This is the sequence number from IPSec header)       |
1542*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543*       |                              0x1                              |
1544*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545*
1546*
1547*
1548* AAD:
1549*       AAD padded to 128 bits with 0
1550*       for example, assume AAD is a u32 vector
1551*
1552*       if AAD is 8 bytes:
1553*       AAD[2] = {A0, A1};
1554*       padded AAD in xmm register = {A1 A0 0 0}
1555*
1556*       0                   1                   2                   3
1557*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559*       |                               SPI (A1)                        |
1560*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561*       |                     32-bit Sequence Number (A0)               |
1562*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*       |                              0x0                              |
1564*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*
1566*                                       AAD Format with 32-bit Sequence Number
1567*
1568*       if AAD is 12 bytes:
1569*       AAD[3] = {A0, A1, A2};
1570*       padded AAD in xmm register = {A2 A1 A0 0}
1571*
1572*       0                   1                   2                   3
1573*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577*       |                               SPI (A2)                        |
1578*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579*       |                 64-bit Extended Sequence Number {A1,A0}       |
1580*       |                                                               |
1581*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582*       |                              0x0                              |
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584*
1585*                        AAD Format with 64-bit Extended Sequence Number
1586*
1587* poly = x^128 + x^127 + x^126 + x^121 + 1
1588*
1589*****************************************************************************/
1590SYM_FUNC_START(aesni_gcm_dec)
1591        FUNC_SAVE
1592
1593        GCM_INIT %arg6, arg7, arg8, arg9
1594        GCM_ENC_DEC dec
1595        GCM_COMPLETE arg10, arg11
1596        FUNC_RESTORE
1597        ret
1598SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601/*****************************************************************************
1602* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603*                    struct gcm_context_data *data
1604*                                        // Context data
1605*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606*                    const u8 *in,       // Plaintext input
1607*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612*                    const u8 *aad,      // Additional Authentication Data (AAD)
1613*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614*                    u8 *auth_tag,       // Authenticated Tag output.
1615*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616*                                        // 12 or 8.
1617*
1618* Assumptions:
1619*
1620* keys:
1621*       keys are pre-expanded and aligned to 16 bytes. we are using the
1622*       first set of 11 keys in the data structure void *aes_ctx
1623*
1624*
1625* iv:
1626*       0                   1                   2                   3
1627*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629*       |                             Salt  (From the SA)               |
1630*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631*       |                     Initialization Vector                     |
1632*       |         (This is the sequence number from IPSec header)       |
1633*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634*       |                              0x1                              |
1635*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636*
1637*
1638*
1639* AAD:
1640*       AAD padded to 128 bits with 0
1641*       for example, assume AAD is a u32 vector
1642*
1643*       if AAD is 8 bytes:
1644*       AAD[2] = {A0, A1};
1645*       padded AAD in xmm register = {A1 A0 0 0}
1646*
1647*       0                   1                   2                   3
1648*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650*       |                               SPI (A1)                        |
1651*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652*       |                     32-bit Sequence Number (A0)               |
1653*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*       |                              0x0                              |
1655*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*
1657*                                 AAD Format with 32-bit Sequence Number
1658*
1659*       if AAD is 12 bytes:
1660*       AAD[3] = {A0, A1, A2};
1661*       padded AAD in xmm register = {A2 A1 A0 0}
1662*
1663*       0                   1                   2                   3
1664*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666*       |                               SPI (A2)                        |
1667*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668*       |                 64-bit Extended Sequence Number {A1,A0}       |
1669*       |                                                               |
1670*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671*       |                              0x0                              |
1672*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673*
1674*                         AAD Format with 64-bit Extended Sequence Number
1675*
1676* poly = x^128 + x^127 + x^126 + x^121 + 1
1677***************************************************************************/
1678SYM_FUNC_START(aesni_gcm_enc)
1679        FUNC_SAVE
1680
1681        GCM_INIT %arg6, arg7, arg8, arg9
1682        GCM_ENC_DEC enc
1683
1684        GCM_COMPLETE arg10, arg11
1685        FUNC_RESTORE
1686        ret
1687SYM_FUNC_END(aesni_gcm_enc)
1688
1689/*****************************************************************************
1690* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691*                     struct gcm_context_data *data,
1692*                                         // context data
1693*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697*                     const u8 *aad,      // Additional Authentication Data (AAD)
1698*                     u64 aad_len)        // Length of AAD in bytes.
1699*/
1700SYM_FUNC_START(aesni_gcm_init)
1701        FUNC_SAVE
1702        GCM_INIT %arg3, %arg4, %arg5, %arg6
1703        FUNC_RESTORE
1704        ret
1705SYM_FUNC_END(aesni_gcm_init)
1706
1707/*****************************************************************************
1708* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709*                    struct gcm_context_data *data,
1710*                                        // context data
1711*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712*                    const u8 *in,       // Plaintext input
1713*                    u64 plaintext_len); // Length of data in bytes for encryption.
1714*/
1715SYM_FUNC_START(aesni_gcm_enc_update)
1716        FUNC_SAVE
1717        GCM_ENC_DEC enc
1718        FUNC_RESTORE
1719        ret
1720SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722/*****************************************************************************
1723* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724*                    struct gcm_context_data *data,
1725*                                        // context data
1726*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727*                    const u8 *in,       // Ciphertext input
1728*                    u64 plaintext_len); // Length of data in bytes for decryption.
1729*/
1730SYM_FUNC_START(aesni_gcm_dec_update)
1731        FUNC_SAVE
1732        GCM_ENC_DEC dec
1733        FUNC_RESTORE
1734        ret
1735SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737/*****************************************************************************
1738* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739*                    struct gcm_context_data *data,
1740*                                        // context data
1741*                    u8 *auth_tag,       // Authenticated Tag output.
1742*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743*                                        // 12 or 8.
1744*/
1745SYM_FUNC_START(aesni_gcm_finalize)
1746        FUNC_SAVE
1747        GCM_COMPLETE %arg3, %arg4
1748        FUNC_RESTORE
1749        ret
1750SYM_FUNC_END(aesni_gcm_finalize)
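
/*
 * Expected calling sequence for the init/update/finalize entry points above
 * (a sketch of how the C glue code might drive them; the definition of
 * struct gcm_context_data and all buffer setup live on the C side and are
 * assumed here):
 *
 *      struct gcm_context_data data;
 *
 *      aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
 *      aesni_gcm_enc_update(aes_ctx, &data, dst, src, src_len);
 *      // ...more _enc_update calls for further scatterlist segments...
 *      aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
 *
 * Decryption substitutes aesni_gcm_dec_update() for the update step;
 * aesni_gcm_enc()/aesni_gcm_dec() bundle all three stages into one call.
 */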
1751
1752#endif
1753
1754
1755SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1756SYM_FUNC_START_LOCAL(_key_expansion_256a)
1757        pshufd $0b11111111, %xmm1, %xmm1
1758        shufps $0b00010000, %xmm0, %xmm4
1759        pxor %xmm4, %xmm0
1760        shufps $0b10001100, %xmm0, %xmm4
1761        pxor %xmm4, %xmm0
1762        pxor %xmm1, %xmm0
1763        movaps %xmm0, (TKEYP)
1764        add $0x10, TKEYP
1765        ret
1766SYM_FUNC_END(_key_expansion_256a)
1767SYM_FUNC_END_ALIAS(_key_expansion_128)
1768
1769SYM_FUNC_START_LOCAL(_key_expansion_192a)
1770        pshufd $0b01010101, %xmm1, %xmm1
1771        shufps $0b00010000, %xmm0, %xmm4
1772        pxor %xmm4, %xmm0
1773        shufps $0b10001100, %xmm0, %xmm4
1774        pxor %xmm4, %xmm0
1775        pxor %xmm1, %xmm0
1776
1777        movaps %xmm2, %xmm5
1778        movaps %xmm2, %xmm6
1779        pslldq $4, %xmm5
1780        pshufd $0b11111111, %xmm0, %xmm3
1781        pxor %xmm3, %xmm2
1782        pxor %xmm5, %xmm2
1783
1784        movaps %xmm0, %xmm1
1785        shufps $0b01000100, %xmm0, %xmm6
1786        movaps %xmm6, (TKEYP)
1787        shufps $0b01001110, %xmm2, %xmm1
1788        movaps %xmm1, 0x10(TKEYP)
1789        add $0x20, TKEYP
1790        ret
1791SYM_FUNC_END(_key_expansion_192a)
1792
1793SYM_FUNC_START_LOCAL(_key_expansion_192b)
1794        pshufd $0b01010101, %xmm1, %xmm1
1795        shufps $0b00010000, %xmm0, %xmm4
1796        pxor %xmm4, %xmm0
1797        shufps $0b10001100, %xmm0, %xmm4
1798        pxor %xmm4, %xmm0
1799        pxor %xmm1, %xmm0
1800
1801        movaps %xmm2, %xmm5
1802        pslldq $4, %xmm5
1803        pshufd $0b11111111, %xmm0, %xmm3
1804        pxor %xmm3, %xmm2
1805        pxor %xmm5, %xmm2
1806
1807        movaps %xmm0, (TKEYP)
1808        add $0x10, TKEYP
1809        ret
1810SYM_FUNC_END(_key_expansion_192b)
1811
1812SYM_FUNC_START_LOCAL(_key_expansion_256b)
1813        pshufd $0b10101010, %xmm1, %xmm1
1814        shufps $0b00010000, %xmm2, %xmm4
1815        pxor %xmm4, %xmm2
1816        shufps $0b10001100, %xmm2, %xmm4
1817        pxor %xmm4, %xmm2
1818        pxor %xmm1, %xmm2
1819        movaps %xmm2, (TKEYP)
1820        add $0x10, TKEYP
1821        ret
1822SYM_FUNC_END(_key_expansion_256b)
1823
1824/*
1825 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1826 *                   unsigned int key_len)
1827 */
1828SYM_FUNC_START(aesni_set_key)
1829        FRAME_BEGIN
1830#ifndef __x86_64__
1831        pushl KEYP
1832        movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1833        movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1834        movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1835#endif
1836        movups (UKEYP), %xmm0           # user key (first 16 bytes)
1837        movaps %xmm0, (KEYP)
1838        lea 0x10(KEYP), TKEYP           # key addr
1839        movl %edx, 480(KEYP)
1840        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1841        cmp $24, %dl
1842        jb .Lenc_key128
1843        je .Lenc_key192
1844        movups 0x10(UKEYP), %xmm2       # other user key
1845        movaps %xmm2, (TKEYP)
1846        add $0x10, TKEYP
1847        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1848        call _key_expansion_256a
1849        aeskeygenassist $0x1, %xmm0, %xmm1
1850        call _key_expansion_256b
1851        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1852        call _key_expansion_256a
1853        aeskeygenassist $0x2, %xmm0, %xmm1
1854        call _key_expansion_256b
1855        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1856        call _key_expansion_256a
1857        aeskeygenassist $0x4, %xmm0, %xmm1
1858        call _key_expansion_256b
1859        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1860        call _key_expansion_256a
1861        aeskeygenassist $0x8, %xmm0, %xmm1
1862        call _key_expansion_256b
1863        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1864        call _key_expansion_256a
1865        aeskeygenassist $0x10, %xmm0, %xmm1
1866        call _key_expansion_256b
1867        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1868        call _key_expansion_256a
1869        aeskeygenassist $0x20, %xmm0, %xmm1
1870        call _key_expansion_256b
1871        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1872        call _key_expansion_256a
1873        jmp .Ldec_key
1874.Lenc_key192:
1875        movq 0x10(UKEYP), %xmm2         # other user key
1876        aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1877        call _key_expansion_192a
1878        aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1879        call _key_expansion_192b
1880        aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1881        call _key_expansion_192a
1882        aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1883        call _key_expansion_192b
1884        aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1885        call _key_expansion_192a
1886        aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1887        call _key_expansion_192b
1888        aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1889        call _key_expansion_192a
1890        aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1891        call _key_expansion_192b
1892        jmp .Ldec_key
1893.Lenc_key128:
1894        aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1895        call _key_expansion_128
1896        aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1897        call _key_expansion_128
1898        aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1899        call _key_expansion_128
1900        aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1901        call _key_expansion_128
1902        aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1903        call _key_expansion_128
1904        aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1905        call _key_expansion_128
1906        aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1907        call _key_expansion_128
1908        aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1909        call _key_expansion_128
1910        aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1911        call _key_expansion_128
1912        aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1913        call _key_expansion_128
1914.Ldec_key:
1915        sub $0x10, TKEYP
1916        movaps (KEYP), %xmm0
1917        movaps (TKEYP), %xmm1
1918        movaps %xmm0, 240(TKEYP)
1919        movaps %xmm1, 240(KEYP)
1920        add $0x10, KEYP
1921        lea 240-16(TKEYP), UKEYP
1922.align 4
1923.Ldec_key_loop:
1924        movaps (KEYP), %xmm0
1925        aesimc %xmm0, %xmm1
1926        movaps %xmm1, (UKEYP)
1927        add $0x10, KEYP
1928        sub $0x10, UKEYP
1929        cmp TKEYP, KEYP
1930        jb .Ldec_key_loop
1931        xor AREG, AREG
1932#ifndef __x86_64__
1933        popl KEYP
1934#endif
1935        FRAME_END
1936        ret
1937SYM_FUNC_END(aesni_set_key)
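
/*
 * Key schedule layout assumed by the code above (a sketch mirroring
 * struct crypto_aes_ctx; the offsets are the ones the assembly uses, the
 * field names are illustrative):
 *
 *      struct aesni_key_schedule {
 *              u8  key_enc[240];       // encryption round keys, offset 0
 *              u8  key_dec[240];       // decryption round keys, offset 240
 *              u32 key_length;         // 16, 24 or 32, offset 480
 *      };
 *
 * aesni_set_key() expands key_enc with aeskeygenassist and then builds
 * key_dec from it in reverse order: the outermost round keys are copied
 * as-is, the inner ones are passed through aesimc for the Equivalent
 * Inverse Cipher used by aesdec.
 */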
1938
1939/*
1940 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1941 */
1942SYM_FUNC_START(aesni_enc)
1943        FRAME_BEGIN
1944#ifndef __x86_64__
1945        pushl KEYP
1946        pushl KLEN
1947        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1948        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1949        movl (FRAME_OFFSET+20)(%esp), INP       # src
1950#endif
1951        movl 480(KEYP), KLEN            # key length
1952        movups (INP), STATE             # input
1953        call _aesni_enc1
1954        movups STATE, (OUTP)            # output
1955#ifndef __x86_64__
1956        popl KLEN
1957        popl KEYP
1958#endif
1959        FRAME_END
1960        ret
1961SYM_FUNC_END(aesni_enc)
1962
1963/*
1964 * _aesni_enc1:         internal ABI
1965 * input:
1966 *      KEYP:           key struct pointer
1967 *      KLEN:           key length
1968 *      STATE:          initial state (input)
1969 * output:
1970 *      STATE:          final state (output)
1971 * changed:
1972 *      KEY
1973 *      TKEYP (T1)
1974 */
1975SYM_FUNC_START_LOCAL(_aesni_enc1)
1976        movaps (KEYP), KEY              # key
1977        mov KEYP, TKEYP
1978        pxor KEY, STATE         # round 0
1979        add $0x30, TKEYP
1980        cmp $24, KLEN
1981        jb .Lenc128
1982        lea 0x20(TKEYP), TKEYP
1983        je .Lenc192
1984        add $0x20, TKEYP
1985        movaps -0x60(TKEYP), KEY
1986        aesenc KEY, STATE
1987        movaps -0x50(TKEYP), KEY
1988        aesenc KEY, STATE
1989.align 4
1990.Lenc192:
1991        movaps -0x40(TKEYP), KEY
1992        aesenc KEY, STATE
1993        movaps -0x30(TKEYP), KEY
1994        aesenc KEY, STATE
1995.align 4
1996.Lenc128:
1997        movaps -0x20(TKEYP), KEY
1998        aesenc KEY, STATE
1999        movaps -0x10(TKEYP), KEY
2000        aesenc KEY, STATE
2001        movaps (TKEYP), KEY
2002        aesenc KEY, STATE
2003        movaps 0x10(TKEYP), KEY
2004        aesenc KEY, STATE
2005        movaps 0x20(TKEYP), KEY
2006        aesenc KEY, STATE
2007        movaps 0x30(TKEYP), KEY
2008        aesenc KEY, STATE
2009        movaps 0x40(TKEYP), KEY
2010        aesenc KEY, STATE
2011        movaps 0x50(TKEYP), KEY
2012        aesenc KEY, STATE
2013        movaps 0x60(TKEYP), KEY
2014        aesenc KEY, STATE
2015        movaps 0x70(TKEYP), KEY
2016        aesenclast KEY, STATE
2017        ret
2018SYM_FUNC_END(_aesni_enc1)
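
/*
 * For reference, the round structure of _aesni_enc1 expressed with the
 * userspace AES-NI intrinsics (a sketch, not used by the kernel; rk[]
 * holds the expanded round keys and nrounds is 10/12/14):
 *
 *      #include <wmmintrin.h>
 *
 *      __m128i aes_encrypt_block(const __m128i *rk, int nrounds, __m128i in)
 *      {
 *              __m128i state = _mm_xor_si128(in, rk[0]);        // round 0
 *              int i;
 *
 *              for (i = 1; i < nrounds; i++)
 *                      state = _mm_aesenc_si128(state, rk[i]);
 *              return _mm_aesenclast_si128(state, rk[nrounds]); // last round
 *      }
 *
 * The assembly unrolls the loop fully and branches on KLEN instead of
 * iterating, which keeps the round keys feeding aesenc back to back.
 */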
2019
2020/*
2021 * _aesni_enc4: internal ABI
2022 * input:
2023 *      KEYP:           key struct pointer
2024 *      KLEN:           key length
2025 *      STATE1:         initial state (input)
2026 *      STATE2
2027 *      STATE3
2028 *      STATE4
2029 * output:
2030 *      STATE1:         final state (output)
2031 *      STATE2
2032 *      STATE3
2033 *      STATE4
2034 * changed:
2035 *      KEY
2036 *      TKEYP (T1)
2037 */
2038SYM_FUNC_START_LOCAL(_aesni_enc4)
2039        movaps (KEYP), KEY              # key
2040        mov KEYP, TKEYP
2041        pxor KEY, STATE1                # round 0
2042        pxor KEY, STATE2
2043        pxor KEY, STATE3
2044        pxor KEY, STATE4
2045        add $0x30, TKEYP
2046        cmp $24, KLEN
2047        jb .L4enc128
2048        lea 0x20(TKEYP), TKEYP
2049        je .L4enc192
2050        add $0x20, TKEYP
2051        movaps -0x60(TKEYP), KEY
2052        aesenc KEY, STATE1
2053        aesenc KEY, STATE2
2054        aesenc KEY, STATE3
2055        aesenc KEY, STATE4
2056        movaps -0x50(TKEYP), KEY
2057        aesenc KEY, STATE1
2058        aesenc KEY, STATE2
2059        aesenc KEY, STATE3
2060        aesenc KEY, STATE4
2061#.align 4
2062.L4enc192:
2063        movaps -0x40(TKEYP), KEY
2064        aesenc KEY, STATE1
2065        aesenc KEY, STATE2
2066        aesenc KEY, STATE3
2067        aesenc KEY, STATE4
2068        movaps -0x30(TKEYP), KEY
2069        aesenc KEY, STATE1
2070        aesenc KEY, STATE2
2071        aesenc KEY, STATE3
2072        aesenc KEY, STATE4
2073#.align 4
2074.L4enc128:
2075        movaps -0x20(TKEYP), KEY
2076        aesenc KEY, STATE1
2077        aesenc KEY, STATE2
2078        aesenc KEY, STATE3
2079        aesenc KEY, STATE4
2080        movaps -0x10(TKEYP), KEY
2081        aesenc KEY, STATE1
2082        aesenc KEY, STATE2
2083        aesenc KEY, STATE3
2084        aesenc KEY, STATE4
2085        movaps (TKEYP), KEY
2086        aesenc KEY, STATE1
2087        aesenc KEY, STATE2
2088        aesenc KEY, STATE3
2089        aesenc KEY, STATE4
2090        movaps 0x10(TKEYP), KEY
2091        aesenc KEY, STATE1
2092        aesenc KEY, STATE2
2093        aesenc KEY, STATE3
2094        aesenc KEY, STATE4
2095        movaps 0x20(TKEYP), KEY
2096        aesenc KEY, STATE1
2097        aesenc KEY, STATE2
2098        aesenc KEY, STATE3
2099        aesenc KEY, STATE4
2100        movaps 0x30(TKEYP), KEY
2101        aesenc KEY, STATE1
2102        aesenc KEY, STATE2
2103        aesenc KEY, STATE3
2104        aesenc KEY, STATE4
2105        movaps 0x40(TKEYP), KEY
2106        aesenc KEY, STATE1
2107        aesenc KEY, STATE2
2108        aesenc KEY, STATE3
2109        aesenc KEY, STATE4
2110        movaps 0x50(TKEYP), KEY
2111        aesenc KEY, STATE1
2112        aesenc KEY, STATE2
2113        aesenc KEY, STATE3
2114        aesenc KEY, STATE4
2115        movaps 0x60(TKEYP), KEY
2116        aesenc KEY, STATE1
2117        aesenc KEY, STATE2
2118        aesenc KEY, STATE3
2119        aesenc KEY, STATE4
2120        movaps 0x70(TKEYP), KEY
2121        aesenclast KEY, STATE1          # last round
2122        aesenclast KEY, STATE2
2123        aesenclast KEY, STATE3
2124        aesenclast KEY, STATE4
2125        ret
2126SYM_FUNC_END(_aesni_enc4)
2127
2128/*
2129 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
2130 */
2131SYM_FUNC_START(aesni_dec)
2132        FRAME_BEGIN
2133#ifndef __x86_64__
2134        pushl KEYP
2135        pushl KLEN
2136        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2137        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2138        movl (FRAME_OFFSET+20)(%esp), INP       # src
2139#endif
2140        mov 480(KEYP), KLEN             # key length
2141        add $240, KEYP
2142        movups (INP), STATE             # input
2143        call _aesni_dec1
2144        movups STATE, (OUTP)            # output
2145#ifndef __x86_64__
2146        popl KLEN
2147        popl KEYP
2148#endif
2149        FRAME_END
2150        ret
2151SYM_FUNC_END(aesni_dec)
2152
2153/*
2154 * _aesni_dec1:         internal ABI
2155 * input:
2156 *      KEYP:           key struct pointer
2157 *      KLEN:           key length
2158 *      STATE:          initial state (input)
2159 * output:
2160 *      STATE:          final state (output)
2161 * changed:
2162 *      KEY
2163 *      TKEYP (T1)
2164 */
2165SYM_FUNC_START_LOCAL(_aesni_dec1)
2166        movaps (KEYP), KEY              # key
2167        mov KEYP, TKEYP
2168        pxor KEY, STATE         # round 0
2169        add $0x30, TKEYP
2170        cmp $24, KLEN
2171        jb .Ldec128
2172        lea 0x20(TKEYP), TKEYP
2173        je .Ldec192
2174        add $0x20, TKEYP
2175        movaps -0x60(TKEYP), KEY
2176        aesdec KEY, STATE
2177        movaps -0x50(TKEYP), KEY
2178        aesdec KEY, STATE
2179.align 4
2180.Ldec192:
2181        movaps -0x40(TKEYP), KEY
2182        aesdec KEY, STATE
2183        movaps -0x30(TKEYP), KEY
2184        aesdec KEY, STATE
2185.align 4
2186.Ldec128:
2187        movaps -0x20(TKEYP), KEY
2188        aesdec KEY, STATE
2189        movaps -0x10(TKEYP), KEY
2190        aesdec KEY, STATE
2191        movaps (TKEYP), KEY
2192        aesdec KEY, STATE
2193        movaps 0x10(TKEYP), KEY
2194        aesdec KEY, STATE
2195        movaps 0x20(TKEYP), KEY
2196        aesdec KEY, STATE
2197        movaps 0x30(TKEYP), KEY
2198        aesdec KEY, STATE
2199        movaps 0x40(TKEYP), KEY
2200        aesdec KEY, STATE
2201        movaps 0x50(TKEYP), KEY
2202        aesdec KEY, STATE
2203        movaps 0x60(TKEYP), KEY
2204        aesdec KEY, STATE
2205        movaps 0x70(TKEYP), KEY
2206        aesdeclast KEY, STATE
2207        ret
2208SYM_FUNC_END(_aesni_dec1)
2209
2210/*
2211 * _aesni_dec4: internal ABI
2212 * input:
2213 *      KEYP:           key struct pointer
2214 *      KLEN:           key length
2215 *      STATE1:         initial state (input)
2216 *      STATE2
2217 *      STATE3
2218 *      STATE4
2219 * output:
2220 *      STATE1:         final state (output)
2221 *      STATE2
2222 *      STATE3
2223 *      STATE4
2224 * changed:
2225 *      KEY
2226 *      TKEYP (T1)
2227 */
2228SYM_FUNC_START_LOCAL(_aesni_dec4)
2229        movaps (KEYP), KEY              # key
2230        mov KEYP, TKEYP
2231        pxor KEY, STATE1                # round 0
2232        pxor KEY, STATE2
2233        pxor KEY, STATE3
2234        pxor KEY, STATE4
2235        add $0x30, TKEYP
2236        cmp $24, KLEN
2237        jb .L4dec128
2238        lea 0x20(TKEYP), TKEYP
2239        je .L4dec192
2240        add $0x20, TKEYP
2241        movaps -0x60(TKEYP), KEY
2242        aesdec KEY, STATE1
2243        aesdec KEY, STATE2
2244        aesdec KEY, STATE3
2245        aesdec KEY, STATE4
2246        movaps -0x50(TKEYP), KEY
2247        aesdec KEY, STATE1
2248        aesdec KEY, STATE2
2249        aesdec KEY, STATE3
2250        aesdec KEY, STATE4
2251.align 4
2252.L4dec192:
2253        movaps -0x40(TKEYP), KEY
2254        aesdec KEY, STATE1
2255        aesdec KEY, STATE2
2256        aesdec KEY, STATE3
2257        aesdec KEY, STATE4
2258        movaps -0x30(TKEYP), KEY
2259        aesdec KEY, STATE1
2260        aesdec KEY, STATE2
2261        aesdec KEY, STATE3
2262        aesdec KEY, STATE4
2263.align 4
2264.L4dec128:
2265        movaps -0x20(TKEYP), KEY
2266        aesdec KEY, STATE1
2267        aesdec KEY, STATE2
2268        aesdec KEY, STATE3
2269        aesdec KEY, STATE4
2270        movaps -0x10(TKEYP), KEY
2271        aesdec KEY, STATE1
2272        aesdec KEY, STATE2
2273        aesdec KEY, STATE3
2274        aesdec KEY, STATE4
2275        movaps (TKEYP), KEY
2276        aesdec KEY, STATE1
2277        aesdec KEY, STATE2
2278        aesdec KEY, STATE3
2279        aesdec KEY, STATE4
2280        movaps 0x10(TKEYP), KEY
2281        aesdec KEY, STATE1
2282        aesdec KEY, STATE2
2283        aesdec KEY, STATE3
2284        aesdec KEY, STATE4
2285        movaps 0x20(TKEYP), KEY
2286        aesdec KEY, STATE1
2287        aesdec KEY, STATE2
2288        aesdec KEY, STATE3
2289        aesdec KEY, STATE4
2290        movaps 0x30(TKEYP), KEY
2291        aesdec KEY, STATE1
2292        aesdec KEY, STATE2
2293        aesdec KEY, STATE3
2294        aesdec KEY, STATE4
2295        movaps 0x40(TKEYP), KEY
2296        aesdec KEY, STATE1
2297        aesdec KEY, STATE2
2298        aesdec KEY, STATE3
2299        aesdec KEY, STATE4
2300        movaps 0x50(TKEYP), KEY
2301        aesdec KEY, STATE1
2302        aesdec KEY, STATE2
2303        aesdec KEY, STATE3
2304        aesdec KEY, STATE4
2305        movaps 0x60(TKEYP), KEY
2306        aesdec KEY, STATE1
2307        aesdec KEY, STATE2
2308        aesdec KEY, STATE3
2309        aesdec KEY, STATE4
2310        movaps 0x70(TKEYP), KEY
2311        aesdeclast KEY, STATE1          # last round
2312        aesdeclast KEY, STATE2
2313        aesdeclast KEY, STATE3
2314        aesdeclast KEY, STATE4
2315        ret
2316SYM_FUNC_END(_aesni_dec4)
2317
2318/*
2319 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2320 *                    size_t len)
2321 */
2322SYM_FUNC_START(aesni_ecb_enc)
2323        FRAME_BEGIN
2324#ifndef __x86_64__
2325        pushl LEN
2326        pushl KEYP
2327        pushl KLEN
2328        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2329        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2330        movl (FRAME_OFFSET+24)(%esp), INP       # src
2331        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2332#endif
2333        test LEN, LEN           # check length
2334        jz .Lecb_enc_ret
2335        mov 480(KEYP), KLEN
2336        cmp $16, LEN
2337        jb .Lecb_enc_ret
2338        cmp $64, LEN
2339        jb .Lecb_enc_loop1
2340.align 4
2341.Lecb_enc_loop4:
2342        movups (INP), STATE1
2343        movups 0x10(INP), STATE2
2344        movups 0x20(INP), STATE3
2345        movups 0x30(INP), STATE4
2346        call _aesni_enc4
2347        movups STATE1, (OUTP)
2348        movups STATE2, 0x10(OUTP)
2349        movups STATE3, 0x20(OUTP)
2350        movups STATE4, 0x30(OUTP)
2351        sub $64, LEN
2352        add $64, INP
2353        add $64, OUTP
2354        cmp $64, LEN
2355        jge .Lecb_enc_loop4
2356        cmp $16, LEN
2357        jb .Lecb_enc_ret
2358.align 4
2359.Lecb_enc_loop1:
2360        movups (INP), STATE1
2361        call _aesni_enc1
2362        movups STATE1, (OUTP)
2363        sub $16, LEN
2364        add $16, INP
2365        add $16, OUTP
2366        cmp $16, LEN
2367        jge .Lecb_enc_loop1
2368.Lecb_enc_ret:
2369#ifndef __x86_64__
2370        popl KLEN
2371        popl KEYP
2372        popl LEN
2373#endif
2374        FRAME_END
2375        ret
2376SYM_FUNC_END(aesni_ecb_enc)
2377
2378/*
2379 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2380 *                    size_t len);
2381 */
2382SYM_FUNC_START(aesni_ecb_dec)
2383        FRAME_BEGIN
2384#ifndef __x86_64__
2385        pushl LEN
2386        pushl KEYP
2387        pushl KLEN
2388        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2389        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2390        movl (FRAME_OFFSET+24)(%esp), INP       # src
2391        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2392#endif
2393        test LEN, LEN
2394        jz .Lecb_dec_ret
2395        mov 480(KEYP), KLEN
2396        add $240, KEYP
2397        cmp $16, LEN
2398        jb .Lecb_dec_ret
2399        cmp $64, LEN
2400        jb .Lecb_dec_loop1
2401.align 4
2402.Lecb_dec_loop4:
2403        movups (INP), STATE1
2404        movups 0x10(INP), STATE2
2405        movups 0x20(INP), STATE3
2406        movups 0x30(INP), STATE4
2407        call _aesni_dec4
2408        movups STATE1, (OUTP)
2409        movups STATE2, 0x10(OUTP)
2410        movups STATE3, 0x20(OUTP)
2411        movups STATE4, 0x30(OUTP)
2412        sub $64, LEN
2413        add $64, INP
2414        add $64, OUTP
2415        cmp $64, LEN
2416        jge .Lecb_dec_loop4
2417        cmp $16, LEN
2418        jb .Lecb_dec_ret
2419.align 4
2420.Lecb_dec_loop1:
2421        movups (INP), STATE1
2422        call _aesni_dec1
2423        movups STATE1, (OUTP)
2424        sub $16, LEN
2425        add $16, INP
2426        add $16, OUTP
2427        cmp $16, LEN
2428        jge .Lecb_dec_loop1
2429.Lecb_dec_ret:
2430#ifndef __x86_64__
2431        popl KLEN
2432        popl KEYP
2433        popl LEN
2434#endif
2435        FRAME_END
2436        ret
2437SYM_FUNC_END(aesni_ecb_dec)
2438
2439/*
2440 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2441 *                    size_t len, u8 *iv)
2442 */
2443SYM_FUNC_START(aesni_cbc_enc)
2444        FRAME_BEGIN
2445#ifndef __x86_64__
2446        pushl IVP
2447        pushl LEN
2448        pushl KEYP
2449        pushl KLEN
2450        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2451        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2452        movl (FRAME_OFFSET+28)(%esp), INP       # src
2453        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2454        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2455#endif
2456        cmp $16, LEN
2457        jb .Lcbc_enc_ret
2458        mov 480(KEYP), KLEN
2459        movups (IVP), STATE     # load iv as initial state
2460.align 4
2461.Lcbc_enc_loop:
2462        movups (INP), IN        # load input
2463        pxor IN, STATE
2464        call _aesni_enc1
2465        movups STATE, (OUTP)    # store output
2466        sub $16, LEN
2467        add $16, INP
2468        add $16, OUTP
2469        cmp $16, LEN
2470        jge .Lcbc_enc_loop
2471        movups STATE, (IVP)
2472.Lcbc_enc_ret:
2473#ifndef __x86_64__
2474        popl KLEN
2475        popl KEYP
2476        popl LEN
2477        popl IVP
2478#endif
2479        FRAME_END
2480        ret
2481SYM_FUNC_END(aesni_cbc_enc)
2482
2483/*
2484 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2485 *                    size_t len, u8 *iv)
2486 */
2487SYM_FUNC_START(aesni_cbc_dec)
2488        FRAME_BEGIN
2489#ifndef __x86_64__
2490        pushl IVP
2491        pushl LEN
2492        pushl KEYP
2493        pushl KLEN
2494        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2495        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2496        movl (FRAME_OFFSET+28)(%esp), INP       # src
2497        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2498        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2499#endif
2500        cmp $16, LEN
2501        jb .Lcbc_dec_just_ret
2502        mov 480(KEYP), KLEN
2503        add $240, KEYP
2504        movups (IVP), IV
2505        cmp $64, LEN
2506        jb .Lcbc_dec_loop1
2507.align 4
2508.Lcbc_dec_loop4:
2509        movups (INP), IN1
2510        movaps IN1, STATE1
2511        movups 0x10(INP), IN2
2512        movaps IN2, STATE2
2513#ifdef __x86_64__
2514        movups 0x20(INP), IN3
2515        movaps IN3, STATE3
2516        movups 0x30(INP), IN4
2517        movaps IN4, STATE4
2518#else
2519        movups 0x20(INP), IN1
2520        movaps IN1, STATE3
2521        movups 0x30(INP), IN2
2522        movaps IN2, STATE4
2523#endif
2524        call _aesni_dec4
2525        pxor IV, STATE1
2526#ifdef __x86_64__
2527        pxor IN1, STATE2
2528        pxor IN2, STATE3
2529        pxor IN3, STATE4
2530        movaps IN4, IV
2531#else
2532        pxor IN1, STATE4
2533        movaps IN2, IV
2534        movups (INP), IN1
2535        pxor IN1, STATE2
2536        movups 0x10(INP), IN2
2537        pxor IN2, STATE3
2538#endif
2539        movups STATE1, (OUTP)
2540        movups STATE2, 0x10(OUTP)
2541        movups STATE3, 0x20(OUTP)
2542        movups STATE4, 0x30(OUTP)
2543        sub $64, LEN
2544        add $64, INP
2545        add $64, OUTP
2546        cmp $64, LEN
2547        jge .Lcbc_dec_loop4
2548        cmp $16, LEN
2549        jb .Lcbc_dec_ret
2550.align 4
2551.Lcbc_dec_loop1:
2552        movups (INP), IN
2553        movaps IN, STATE
2554        call _aesni_dec1
2555        pxor IV, STATE
2556        movups STATE, (OUTP)
2557        movaps IN, IV
2558        sub $16, LEN
2559        add $16, INP
2560        add $16, OUTP
2561        cmp $16, LEN
2562        jge .Lcbc_dec_loop1
2563.Lcbc_dec_ret:
2564        movups IV, (IVP)
2565.Lcbc_dec_just_ret:
2566#ifndef __x86_64__
2567        popl KLEN
2568        popl KEYP
2569        popl LEN
2570        popl IVP
2571#endif
2572        FRAME_END
2573        ret
2574SYM_FUNC_END(aesni_cbc_dec)
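
/*
 * The 4-way loop above is plain CBC decryption; per block the recurrence
 * is the following (a sketch: aes_decrypt_block() is a hypothetical
 * stand-in for the _aesni_dec1/_aesni_dec4 paths, and aes_block_t uses the
 * GCC/Clang __int128 extension to stand for a 16-byte block):
 *
 *      typedef unsigned __int128 aes_block_t;
 *
 *      aes_block_t aes_decrypt_block(const void *ctx, aes_block_t c);
 *
 *      void cbc_decrypt(const void *ctx, aes_block_t *dst,
 *                       const aes_block_t *src, int nblocks, aes_block_t *iv)
 *      {
 *              aes_block_t prev = *iv;
 *              int i;
 *
 *              for (i = 0; i < nblocks; i++) {
 *                      aes_block_t c = src[i];
 *
 *                      dst[i] = aes_decrypt_block(ctx, c) ^ prev;
 *                      prev = c;
 *              }
 *              *iv = prev;     // chaining value handed back to the caller
 *      }
 *
 * Keeping four blocks in flight hides the aesdec latency; the ciphertext
 * blocks needed for the final XOR are parked in IN1..IN4 (or reloaded on
 * 32-bit, where fewer XMM registers are available).
 */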
2575
2576/*
2577 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2578 *                        size_t len, u8 *iv)
2579 */
2580SYM_FUNC_START(aesni_cts_cbc_enc)
2581        FRAME_BEGIN
2582#ifndef __x86_64__
2583        pushl IVP
2584        pushl LEN
2585        pushl KEYP
2586        pushl KLEN
2587        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2588        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2589        movl (FRAME_OFFSET+28)(%esp), INP       # src
2590        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2591        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2592        lea .Lcts_permute_table, T1
2593#else
2594        lea .Lcts_permute_table(%rip), T1
2595#endif
2596        mov 480(KEYP), KLEN
2597        movups (IVP), STATE
2598        sub $16, LEN
2599        mov T1, IVP
2600        add $32, IVP
2601        add LEN, T1
2602        sub LEN, IVP
2603        movups (T1), %xmm4
2604        movups (IVP), %xmm5
2605
2606        movups (INP), IN1
2607        add LEN, INP
2608        movups (INP), IN2
2609
2610        pxor IN1, STATE
2611        call _aesni_enc1
2612
2613        pshufb %xmm5, IN2
2614        pxor STATE, IN2
2615        pshufb %xmm4, STATE
2616        add OUTP, LEN
2617        movups STATE, (LEN)
2618
2619        movaps IN2, STATE
2620        call _aesni_enc1
2621        movups STATE, (OUTP)
2622
2623#ifndef __x86_64__
2624        popl KLEN
2625        popl KEYP
2626        popl LEN
2627        popl IVP
2628#endif
2629        FRAME_END
2630        ret
2631SYM_FUNC_END(aesni_cts_cbc_enc)
2632
2633/*
2634 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2635 *                        size_t len, u8 *iv)
2636 */
2637SYM_FUNC_START(aesni_cts_cbc_dec)
2638        FRAME_BEGIN
2639#ifndef __x86_64__
2640        pushl IVP
2641        pushl LEN
2642        pushl KEYP
2643        pushl KLEN
2644        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2645        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2646        movl (FRAME_OFFSET+28)(%esp), INP       # src
2647        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2648        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2649        lea .Lcts_permute_table, T1
2650#else
2651        lea .Lcts_permute_table(%rip), T1
2652#endif
2653        mov 480(KEYP), KLEN
2654        add $240, KEYP
2655        movups (IVP), IV
2656        sub $16, LEN
2657        mov T1, IVP
2658        add $32, IVP
2659        add LEN, T1
2660        sub LEN, IVP
2661        movups (T1), %xmm4
2662
2663        movups (INP), STATE
2664        add LEN, INP
2665        movups (INP), IN1
2666
2667        call _aesni_dec1
2668        movaps STATE, IN2
2669        pshufb %xmm4, STATE
2670        pxor IN1, STATE
2671
2672        add OUTP, LEN
2673        movups STATE, (LEN)
2674
2675        movups (IVP), %xmm0
2676        pshufb %xmm0, IN1
2677        pblendvb IN2, IN1
2678        movaps IN1, STATE
2679        call _aesni_dec1
2680
2681        pxor IV, STATE
2682        movups STATE, (OUTP)
2683
2684#ifndef __x86_64__
2685        popl KLEN
2686        popl KEYP
2687        popl LEN
2688        popl IVP
2689#endif
2690        FRAME_END
2691        ret
2692SYM_FUNC_END(aesni_cts_cbc_dec)
2693
2694.pushsection .rodata
2695.align 16
2696.Lcts_permute_table:
2697        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2698        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2699        .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2700        .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2701        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2702        .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2703#ifdef __x86_64__
2704.Lbswap_mask:
2705        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2706#endif
2707.popsection
2708
2709#ifdef __x86_64__
2710/*
2711 * _aesni_inc_init:     internal ABI
2712 *      setup registers used by _aesni_inc
2713 * input:
2714 *      IV
2715 * output:
2716 *      CTR:    == IV, in little endian
2717 *      TCTR_LOW: == lower qword of CTR
2718 *      INC:    == 1, in little endian
2719 *      BSWAP_MASK == endian swapping mask
2720 */
2721SYM_FUNC_START_LOCAL(_aesni_inc_init)
2722        movaps .Lbswap_mask, BSWAP_MASK
2723        movaps IV, CTR
2724        pshufb BSWAP_MASK, CTR
2725        mov $1, TCTR_LOW
2726        movq TCTR_LOW, INC
2727        movq CTR, TCTR_LOW
2728        ret
2729SYM_FUNC_END(_aesni_inc_init)
2730
2731/*
2732 * _aesni_inc:          internal ABI
2733 *      Increase IV by 1, IV is in big endian
2734 * input:
2735 *      IV
2736 *      CTR:    == IV, in little endian
2737 *      TCTR_LOW: == lower qword of CTR
2738 *      INC:    == 1, in little endian
2739 *      BSWAP_MASK == endian swapping mask
2740 * output:
2741 *      IV:     incremented by 1
2742 * changed:
2743 *      CTR:    == output IV, in little endian
2744 *      TCTR_LOW: == lower qword of CTR
2745 */
2746SYM_FUNC_START_LOCAL(_aesni_inc)
2747        paddq INC, CTR
2748        add $1, TCTR_LOW
2749        jnc .Linc_low
2750        pslldq $8, INC
2751        paddq INC, CTR
2752        psrldq $8, INC
2753.Linc_low:
2754        movaps CTR, IV
2755        pshufb BSWAP_MASK, IV
2756        ret
2757SYM_FUNC_END(_aesni_inc)
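
/*
 * _aesni_inc keeps the counter byte-swapped to little endian so the
 * increment can be done with ordinary adds; the big-endian increment it
 * implements is equivalent to this sketch:
 *
 *      static void ctr128_inc(u64 *hi, u64 *lo)
 *      {
 *              if (++(*lo) == 0)       // carry out of the low 64 bits
 *                      ++(*hi);
 *      }
 *
 * The add/jnc pair above tracks the low qword in TCTR_LOW and only touches
 * the high qword of CTR when that addition carries.
 */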
2758
2759/*
2760 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2761 *                    size_t len, u8 *iv)
2762 */
2763SYM_FUNC_START(aesni_ctr_enc)
2764        FRAME_BEGIN
2765        cmp $16, LEN
2766        jb .Lctr_enc_just_ret
2767        mov 480(KEYP), KLEN
2768        movups (IVP), IV
2769        call _aesni_inc_init
2770        cmp $64, LEN
2771        jb .Lctr_enc_loop1
2772.align 4
2773.Lctr_enc_loop4:
2774        movaps IV, STATE1
2775        call _aesni_inc
2776        movups (INP), IN1
2777        movaps IV, STATE2
2778        call _aesni_inc
2779        movups 0x10(INP), IN2
2780        movaps IV, STATE3
2781        call _aesni_inc
2782        movups 0x20(INP), IN3
2783        movaps IV, STATE4
2784        call _aesni_inc
2785        movups 0x30(INP), IN4
2786        call _aesni_enc4
2787        pxor IN1, STATE1
2788        movups STATE1, (OUTP)
2789        pxor IN2, STATE2
2790        movups STATE2, 0x10(OUTP)
2791        pxor IN3, STATE3
2792        movups STATE3, 0x20(OUTP)
2793        pxor IN4, STATE4
2794        movups STATE4, 0x30(OUTP)
2795        sub $64, LEN
2796        add $64, INP
2797        add $64, OUTP
2798        cmp $64, LEN
2799        jge .Lctr_enc_loop4
2800        cmp $16, LEN
2801        jb .Lctr_enc_ret
2802.align 4
2803.Lctr_enc_loop1:
2804        movaps IV, STATE
2805        call _aesni_inc
2806        movups (INP), IN
2807        call _aesni_enc1
2808        pxor IN, STATE
2809        movups STATE, (OUTP)
2810        sub $16, LEN
2811        add $16, INP
2812        add $16, OUTP
2813        cmp $16, LEN
2814        jge .Lctr_enc_loop1
2815.Lctr_enc_ret:
2816        movups IV, (IVP)
2817.Lctr_enc_just_ret:
2818        FRAME_END
2819        ret
2820SYM_FUNC_END(aesni_ctr_enc)
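
/*
 * Per 16-byte block the loops above perform standard CTR mode (a sketch;
 * aes_encrypt_block() stands in for _aesni_enc1/_aesni_enc4 and
 * ctr128_inc() for _aesni_inc):
 *
 *      keystream = aes_encrypt_block(ctx, counter_block);
 *      ctr128_inc(&counter_hi, &counter_lo);
 *      out_block = in_block ^ keystream;
 *
 * Only the encryption direction of the cipher is ever used, which is why
 * there is no aesni_ctr_dec: decryption calls the same routine.
 */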
2821
2822#endif
2823
2824.section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2825.align 16
2826.Lgf128mul_x_ble_mask:
2827        .octa 0x00000000000000010000000000000087
2828.previous
2829
2830/*
2831 * _aesni_gf128mul_x_ble:               internal ABI
2832 *      Multiply in GF(2^128) for XTS IVs
2833 * input:
2834 *      IV:     current IV
2835 *      GF128MUL_MASK == mask with 0x87 and 0x01
2836 * output:
2837 *      IV:     next IV
2838 * changed:
2839 *      CTR:    == temporary value
2840 */
2841#define _aesni_gf128mul_x_ble() \
2842        pshufd $0x13, IV, KEY; \
2843        paddq IV, IV; \
2844        psrad $31, KEY; \
2845        pand GF128MUL_MASK, KEY; \
2846        pxor KEY, IV;
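
/*
 * The pshufd/psrad/pand/pxor sequence is the SSE form of doubling the XTS
 * tweak in GF(2^128) with the low-bit-first ("ble") convention.  A scalar
 * sketch of the same operation, with the IV split into two little-endian
 * 64-bit halves (xts_mul_x() is an illustrative name, not a kernel API):
 *
 *      static void xts_mul_x(u64 *lo, u64 *hi)
 *      {
 *              u64 carry = *hi >> 63;              // bit shifted out of the top
 *
 *              *hi = (*hi << 1) | (*lo >> 63);
 *              *lo = (*lo << 1) ^ (carry * 0x87);  // reduce by the XTS polynomial
 *      }
 */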
2847
2848/*
2849 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2850 *                        const u8 *src, unsigned int len, le128 *iv)
2851 */
2852SYM_FUNC_START(aesni_xts_encrypt)
2853        FRAME_BEGIN
2854#ifndef __x86_64__
2855        pushl IVP
2856        pushl LEN
2857        pushl KEYP
2858        pushl KLEN
2859        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2860        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2861        movl (FRAME_OFFSET+28)(%esp), INP       # src
2862        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2863        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2864        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2865#else
2866        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2867#endif
2868        movups (IVP), IV
2869
2870        mov 480(KEYP), KLEN
2871
2872.Lxts_enc_loop4:
2873        sub $64, LEN
2874        jl .Lxts_enc_1x
2875
2876        movdqa IV, STATE1
2877        movdqu 0x00(INP), IN
2878        pxor IN, STATE1
2879        movdqu IV, 0x00(OUTP)
2880
2881        _aesni_gf128mul_x_ble()
2882        movdqa IV, STATE2
2883        movdqu 0x10(INP), IN
2884        pxor IN, STATE2
2885        movdqu IV, 0x10(OUTP)
2886
2887        _aesni_gf128mul_x_ble()
2888        movdqa IV, STATE3
2889        movdqu 0x20(INP), IN
2890        pxor IN, STATE3
2891        movdqu IV, 0x20(OUTP)
2892
2893        _aesni_gf128mul_x_ble()
2894        movdqa IV, STATE4
2895        movdqu 0x30(INP), IN
2896        pxor IN, STATE4
2897        movdqu IV, 0x30(OUTP)
2898
2899        call _aesni_enc4
2900
2901        movdqu 0x00(OUTP), IN
2902        pxor IN, STATE1
2903        movdqu STATE1, 0x00(OUTP)
2904
2905        movdqu 0x10(OUTP), IN
2906        pxor IN, STATE2
2907        movdqu STATE2, 0x10(OUTP)
2908
2909        movdqu 0x20(OUTP), IN
2910        pxor IN, STATE3
2911        movdqu STATE3, 0x20(OUTP)
2912
2913        movdqu 0x30(OUTP), IN
2914        pxor IN, STATE4
2915        movdqu STATE4, 0x30(OUTP)
2916
2917        _aesni_gf128mul_x_ble()
2918
2919        add $64, INP
2920        add $64, OUTP
2921        test LEN, LEN
2922        jnz .Lxts_enc_loop4
2923
2924.Lxts_enc_ret_iv:
2925        movups IV, (IVP)
2926
2927.Lxts_enc_ret:
2928#ifndef __x86_64__
2929        popl KLEN
2930        popl KEYP
2931        popl LEN
2932        popl IVP
2933#endif
2934        FRAME_END
2935        ret
2936
2937.Lxts_enc_1x:
2938        add $64, LEN
2939        jz .Lxts_enc_ret_iv
2940        sub $16, LEN
2941        jl .Lxts_enc_cts4
2942
2943.Lxts_enc_loop1:
2944        movdqu (INP), STATE
2945        pxor IV, STATE
2946        call _aesni_enc1
2947        pxor IV, STATE
2948        _aesni_gf128mul_x_ble()
2949
2950        test LEN, LEN
2951        jz .Lxts_enc_out
2952
2953        add $16, INP
2954        sub $16, LEN
2955        jl .Lxts_enc_cts1
2956
2957        movdqu STATE, (OUTP)
2958        add $16, OUTP
2959        jmp .Lxts_enc_loop1
2960
2961.Lxts_enc_out:
2962        movdqu STATE, (OUTP)
2963        jmp .Lxts_enc_ret_iv
2964
2965.Lxts_enc_cts4:
2966        movdqa STATE4, STATE
2967        sub $16, OUTP
2968
2969.Lxts_enc_cts1:
2970#ifndef __x86_64__
2971        lea .Lcts_permute_table, T1
2972#else
2973        lea .Lcts_permute_table(%rip), T1
2974#endif
2975        add LEN, INP            /* rewind input pointer */
2976        add $16, LEN            /* # bytes in final block */
2977        movups (INP), IN1
2978
2979        mov T1, IVP
2980        add $32, IVP
2981        add LEN, T1
2982        sub LEN, IVP
2983        add OUTP, LEN
2984
2985        movups (T1), %xmm4
2986        movaps STATE, IN2
2987        pshufb %xmm4, STATE
2988        movups STATE, (LEN)
2989
2990        movups (IVP), %xmm0
2991        pshufb %xmm0, IN1
2992        pblendvb IN2, IN1
2993        movaps IN1, STATE
2994
2995        pxor IV, STATE
2996        call _aesni_enc1
2997        pxor IV, STATE
2998
2999        movups STATE, (OUTP)
3000        jmp .Lxts_enc_ret
3001SYM_FUNC_END(aesni_xts_encrypt)
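
/*
 * Per block the encrypt path above is the usual XTS construction (a
 * sketch; aes_encrypt_block() stands in for _aesni_enc1/_aesni_enc4 and
 * xts_mul_x() for _aesni_gf128mul_x_ble):
 *
 *      C[i] = aes_encrypt_block(ctx, P[i] ^ T) ^ T;
 *      xts_mul_x(&T_lo, &T_hi);                // next tweak
 *
 * In the 4-way loop each tweak is parked in the output buffer while the
 * blocks are in flight and XORed back in after _aesni_enc4; a partial
 * final block is handled with ciphertext stealing via .Lcts_permute_table,
 * just like the CBC-CTS code earlier in this file.
 */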
3002
3003/*
3004 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3005 *                        const u8 *src, unsigned int len, le128 *iv)
3006 */
3007SYM_FUNC_START(aesni_xts_decrypt)
3008        FRAME_BEGIN
3009#ifndef __x86_64__
3010        pushl IVP
3011        pushl LEN
3012        pushl KEYP
3013        pushl KLEN
3014        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
3015        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
3016        movl (FRAME_OFFSET+28)(%esp), INP       # src
3017        movl (FRAME_OFFSET+32)(%esp), LEN       # len
3018        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
3019        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3020#else
3021        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3022#endif
3023        movups (IVP), IV
3024
3025        mov 480(KEYP), KLEN
3026        add $240, KEYP
3027
3028        test $15, LEN
3029        jz .Lxts_dec_loop4
3030        sub $16, LEN
3031
3032.Lxts_dec_loop4:
3033        sub $64, LEN
3034        jl .Lxts_dec_1x
3035
3036        movdqa IV, STATE1
3037        movdqu 0x00(INP), IN
3038        pxor IN, STATE1
3039        movdqu IV, 0x00(OUTP)
3040
3041        _aesni_gf128mul_x_ble()
3042        movdqa IV, STATE2
3043        movdqu 0x10(INP), IN
3044        pxor IN, STATE2
3045        movdqu IV, 0x10(OUTP)
3046
3047        _aesni_gf128mul_x_ble()
3048        movdqa IV, STATE3
3049        movdqu 0x20(INP), IN
3050        pxor IN, STATE3
3051        movdqu IV, 0x20(OUTP)
3052
3053        _aesni_gf128mul_x_ble()
3054        movdqa IV, STATE4
3055        movdqu 0x30(INP), IN
3056        pxor IN, STATE4
3057        movdqu IV, 0x30(OUTP)
3058
3059        call _aesni_dec4
3060
3061        movdqu 0x00(OUTP), IN
3062        pxor IN, STATE1
3063        movdqu STATE1, 0x00(OUTP)
3064
3065        movdqu 0x10(OUTP), IN
3066        pxor IN, STATE2
3067        movdqu STATE2, 0x10(OUTP)
3068
3069        movdqu 0x20(OUTP), IN
3070        pxor IN, STATE3
3071        movdqu STATE3, 0x20(OUTP)
3072
3073        movdqu 0x30(OUTP), IN
3074        pxor IN, STATE4
3075        movdqu STATE4, 0x30(OUTP)
3076
3077        _aesni_gf128mul_x_ble()
3078
3079        add $64, INP
3080        add $64, OUTP
3081        test LEN, LEN
3082        jnz .Lxts_dec_loop4
3083
3084.Lxts_dec_ret_iv:
3085        movups IV, (IVP)
3086
3087.Lxts_dec_ret:
3088#ifndef __x86_64__
3089        popl KLEN
3090        popl KEYP
3091        popl LEN
3092        popl IVP
3093#endif
3094        FRAME_END
3095        ret
3096
3097.Lxts_dec_1x:
3098        add $64, LEN
3099        jz .Lxts_dec_ret_iv
3100
3101.Lxts_dec_loop1:
3102        movdqu (INP), STATE
3103
3104        add $16, INP
3105        sub $16, LEN
3106        jl .Lxts_dec_cts1
3107
3108        pxor IV, STATE
3109        call _aesni_dec1
3110        pxor IV, STATE
3111        _aesni_gf128mul_x_ble()
3112
3113        test LEN, LEN
3114        jz .Lxts_dec_out
3115
3116        movdqu STATE, (OUTP)
3117        add $16, OUTP
3118        jmp .Lxts_dec_loop1
3119
3120.Lxts_dec_out:
3121        movdqu STATE, (OUTP)
3122        jmp .Lxts_dec_ret_iv
3123
3124.Lxts_dec_cts1:
3125        movdqa IV, STATE4
3126        _aesni_gf128mul_x_ble()
3127
3128        pxor IV, STATE
3129        call _aesni_dec1
3130        pxor IV, STATE
3131
3132#ifndef __x86_64__
3133        lea .Lcts_permute_table, T1
3134#else
3135        lea .Lcts_permute_table(%rip), T1
3136#endif
3137        add LEN, INP            /* rewind input pointer */
3138        add $16, LEN            /* # bytes in final block */
3139        movups (INP), IN1
3140
3141        mov T1, IVP
3142        add $32, IVP
3143        add LEN, T1
3144        sub LEN, IVP
3145        add OUTP, LEN
3146
3147        movups (T1), %xmm4
3148        movaps STATE, IN2
3149        pshufb %xmm4, STATE
3150        movups STATE, (LEN)
3151
3152        movups (IVP), %xmm0
3153        pshufb %xmm0, IN1
3154        pblendvb IN2, IN1
3155        movaps IN1, STATE
3156
3157        pxor STATE4, STATE
3158        call _aesni_dec1
3159        pxor STATE4, STATE
3160
3161        movups STATE, (OUTP)
3162        jmp .Lxts_dec_ret
3163SYM_FUNC_END(aesni_xts_decrypt)
3164