linux/arch/x86/crypto/aesni-intel_asm.S
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values; for FP
 * use movaps (move aligned packed single) and for integer use movdqa (move
 * double quad aligned).  Since Nehalem (the original Core i7) it makes no
 * performance difference which instruction is used.  However, movaps is a
 * byte shorter, so that is the one we'll use for now (same for unaligned).
 */
#define MOVADQ  movaps
#define MOVUDQ  movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
        .octa 0x00000000000000010000000000000087
.section        .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:   .octa 0xC2000000000000000000000000000001
.section        .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section        .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section        .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section        .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:        .octa 0x00000000000000000000000000000001
.section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section        .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:        .octa 0x1
.section        .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:        .octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section        .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000
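
# The ordering constraint above is what makes the partial-block masking
# below work: the code loads 16 bytes from inside this contiguous region
# (e.g. at ALL_F+16-n, or via ALL_F-SHIFT_MASK(%r12)) to obtain a mask of
# n 0xff bytes followed by zero bytes.  A minimal C sketch of the same
# idea (illustrative only; these names are not from this file):
#
#       static const u8 all_f_zero[32] = {
#               0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
#               0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
#               /* 16 zero bytes follow */
#       };
#       /* mask keeping the low n bytes of a 16-byte block, 0 <= n <= 16 */
#       static inline const u8 *low_byte_mask(size_t n)
#       {
#               return all_f_zero + (16 - n);
#       }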

.text


#define STACK_OFFSET    8*3

#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define HashKey         16*6    // store HashKey <<1 mod poly here
#define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
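
// The offsets above describe the layout of the context structure that
// %arg2 points at.  A C sketch of the implied layout (field names are
// assumptions derived from the comments, not taken from a header):
//
//      struct gcm_context_data {
//              u8  aad_hash[16];               /* AadHash            */
//              u64 aad_length;                 /* AadLen, in bytes   */
//              u64 in_length;                  /* InLen, in bytes    */
//              u8  partial_block_enc_key[16];  /* PBlockEncKey       */
//              u8  orig_iv[16];                /* OrigIV (Y0)        */
//              u8  current_counter[16];        /* CurCount           */
//              u64 partial_block_length;       /* PBlockLen          */
//              u64 unused;                     /* pad to 16*6        */
//              u8  hash_keys[8][16];           /* HashKey..HashKey_4_k */
//      };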

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR     %xmm11
#define INC     %xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG    %rax
#define KEYP    %rdi
#define OUTP    %rsi
#define UKEYP   OUTP
#define INP     %rdx
#define LEN     %rcx
#define IVP     %r8
#define KLEN    %r9d
#define T1      %r10
#define TKEYP   T1
#define T2      %r11
#define TCTR_LOW T2
#else
#define AREG    %eax
#define KEYP    %edi
#define OUTP    AREG
#define UKEYP   OUTP
#define INP     %edx
#define LEN     %esi
#define IVP     %ebp
#define KLEN    %ebx
#define T1      %ecx
#define TKEYP   T1
#endif

.macro FUNC_SAVE
        push    %r12
        push    %r13
        push    %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


.macro FUNC_RESTORE
        pop     %r14
        pop     %r13
        pop     %r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
        mov     \SUBKEY, %r12
        movdqu  (%r12), \TMP3
        movdqa  SHUF_MASK(%rip), \TMP2
        PSHUFB_XMM \TMP2, \TMP3

        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa  \TMP3, \TMP2
        psllq   $1, \TMP3
        psrlq   $63, \TMP2
        movdqa  \TMP2, \TMP1
        pslldq  $8, \TMP2
        psrldq  $8, \TMP1
        por     \TMP2, \TMP3

        # reduce HashKey<<1

        pshufd  $0x24, \TMP1, \TMP2
        pcmpeqd TWOONE(%rip), \TMP2
        pand    POLY(%rip), \TMP2
        pxor    \TMP2, \TMP3
        movdqa  \TMP3, HashKey(%arg2)

        movdqa     \TMP3, \TMP5
        pshufd     $78, \TMP3, \TMP1
        pxor       \TMP3, \TMP1
        movdqa     \TMP1, HashKey_k(%arg2)

        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
        movdqa     \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_2_k(%arg2)

        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
        movdqa     \TMP5, HashKey_3(%arg2)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_3_k(%arg2)

        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
        movdqa     \TMP5, HashKey_4(%arg2)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_4_k(%arg2)
.endm
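
# The first step of PRECOMPUTE computes HashKey<<1 mod poly without a
# branch: the pshufd/pcmpeqd/pand sequence turns the bit shifted out of
# the top of H into an all-ones or all-zero mask that selects POLY.  A C
# sketch of the same computation (a hedged illustration, not kernel code):
#
#       /* H is 128 bits, stored as two u64 halves (hi:lo) */
#       static void hashkey_shl1_mod_poly(u64 *hi, u64 *lo)
#       {
#               int carry = *hi >> 63;
#               *hi = (*hi << 1) | (*lo >> 63);
#               *lo <<= 1;
#               if (carry) {            /* done branchlessly via mask above */
#                       *hi ^= 0xC200000000000000ULL;   /* POLY, high half */
#                       *lo ^= 1;                       /* POLY, low half  */
#               }
#       }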

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
        mov \AADLEN, %r11
        mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
        xor %r11, %r11
        mov %r11, InLen(%arg2) # ctx_data.in_length = 0
        mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
        mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
        mov \Iv, %rax
        movdqu (%rax), %xmm0
        movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

        movdqa  SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm0
        movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

        PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
        movdqa HashKey(%arg2), %xmm13

        CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
        %xmm4, %xmm5, %xmm6
.endm
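
# What GCM_INIT does, as a C-level sketch (field names assumed from the
# offset comments above; this is an illustration, not the C caller):
#
#       ctx->aad_length = aad_len;
#       ctx->in_length = 0;
#       ctx->partial_block_length = 0;
#       ctx->partial_block_enc_key = 0;          /* first 8 bytes in asm */
#       memcpy(ctx->orig_iv, iv, 16);
#       byte_reverse(ctx->current_counter, iv);  /* SHUF_MASK shuffle    */
#       precompute_hashkeys(ctx, subkey);        /* PRECOMPUTE           */
#       calc_aad_hash(ctx, aad, aad_len);        /* CALC_AAD_HASH        */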

# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data to be at least 1 byte long because of
# READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
        movdqu AadHash(%arg2), %xmm8
        movdqu HashKey(%arg2), %xmm13
        add %arg5, InLen(%arg2)

        xor %r11, %r11 # initialise the data pointer offset as zero
        PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

        sub %r11, %arg5         # sub partial block data used
        mov %arg5, %r13         # save the number of bytes

        and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
        mov %r13, %r12
        # Encrypt/Decrypt first few blocks

        and     $(3<<4), %r12
        jz      _initial_num_blocks_is_0_\@
        cmp     $(2<<4), %r12
        jb      _initial_num_blocks_is_1_\@
        je      _initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
        sub     $48, %r13
        jmp     _initial_blocks_\@
_initial_num_blocks_is_2_\@:
        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
        sub     $32, %r13
        jmp     _initial_blocks_\@
_initial_num_blocks_is_1_\@:
        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
        sub     $16, %r13
        jmp     _initial_blocks_\@
_initial_num_blocks_is_0_\@:
        INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

        # Main loop - Encrypt/Decrypt remaining blocks

        cmp     $0, %r13
        je      _zero_cipher_left_\@
        sub     $64, %r13
        je      _four_cipher_left_\@
_crypt_by_4_\@:
        GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
        %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
        %xmm7, %xmm8, enc
        add     $64, %r11
        sub     $64, %r13
        jne     _crypt_by_4_\@
_four_cipher_left_\@:
        GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
        movdqu %xmm8, AadHash(%arg2)
        movdqu %xmm0, CurCount(%arg2)

        mov     %arg5, %r13
        and     $15, %r13                       # %r13 = arg5 (mod 16)
        je      _multiple_of_16_bytes_\@

        mov %r13, PBlockLen(%arg2)

        # Handle the last <16 Byte block separately
        paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
        movdqu %xmm0, CurCount(%arg2)
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
        movdqu %xmm0, PBlockEncKey(%arg2)

        cmp     $16, %arg5
        jge _large_enough_update_\@

        lea (%arg4,%r11,1), %r10
        mov %r13, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
        jmp _data_read_\@

_large_enough_update_\@:
        sub     $16, %r11
        add     %r13, %r11

        # receive the last <16 Byte block
        movdqu  (%arg4, %r11, 1), %xmm1

        sub     %r13, %r11
        add     $16, %r11

        lea     SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub     %r13, %r12
        # get the appropriate shuffle mask
        movdqu  (%r12), %xmm2
        # shift right 16-r13 bytes
        PSHUFB_XMM  %xmm2, %xmm1

_data_read_\@:
        lea ALL_F+16(%rip), %r12
        sub %r13, %r12

.ifc \operation, dec
        movdqa  %xmm1, %xmm2
.endif
        pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
        movdqu  (%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
        pand    %xmm1, %xmm2
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm2

        pxor %xmm2, %xmm8
.else
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        pxor    %xmm0, %xmm8
.endif

        movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
        # GHASH computation for the last <16 byte block
        movdqa SHUF_MASK(%rip), %xmm10
        # shuffle xmm0 back to output as ciphertext
        PSHUFB_XMM %xmm10, %xmm0
.endif

        # Output %r13 bytes
        MOVQ_R64_XMM %xmm0, %rax
        cmp $8, %r13
        jle _less_than_8_bytes_left_\@
        mov %rax, (%arg3 , %r11, 1)
        add $8, %r11
        psrldq $8, %xmm0
        MOVQ_R64_XMM %xmm0, %rax
        sub $8, %r13
_less_than_8_bytes_left_\@:
        mov %al,  (%arg3, %r11, 1)
        add $1, %r11
        shr $8, %rax
        sub $1, %r13
        jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
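
# A C sketch of the block dispatch GCM_ENC_DEC performs (illustrative
# pseudocode; len here is the byte count left after PARTIAL_BLOCK has
# consumed any carried-over partial block):
#
#       size_t full_bytes = len & ~(size_t)15;    /* whole 16-byte blocks */
#       size_t initial = (full_bytes >> 4) & 3;   /* 0..3 blocks up front */
#       /* INITIAL_BLOCKS_ENC_DEC handles `initial` blocks; the main loop
#        * then processes 4 blocks per iteration, interleaving AES rounds
#        * with the GHASH of the previous 4 ciphertext blocks, and the
#        * final len % 16 bytes take the partial-block path. */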

# GCM_COMPLETE finishes the tag update for the last partial block and
# produces the final tag.
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
        movdqu AadHash(%arg2), %xmm8
        movdqu HashKey(%arg2), %xmm13

        mov PBlockLen(%arg2), %r12

        cmp $0, %r12
        je _partial_done\@

        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov AadLen(%arg2), %r12           # %r12 = aadLen (number of bytes)
        shl     $3, %r12                  # convert into number of bits
        movd    %r12d, %xmm15             # len(A) in %xmm15
        mov InLen(%arg2), %r12
        shl     $3, %r12                  # len(C) in bits (*8)
        MOVQ_R64_XMM    %r12, %xmm1

        pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
        pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
        pxor    %xmm15, %xmm8
        GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8

        movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
        pxor    %xmm8, %xmm0
_return_T_\@:
        mov     \AUTHTAG, %r10                     # %r10 = authTag
        mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
        cmp     $16, %r11
        je      _T_16_\@
        cmp     $8, %r11
        jl      _T_4_\@
_T_8_\@:
        MOVQ_R64_XMM    %xmm0, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        psrldq  $8, %xmm0
        cmp     $0, %r11
        je      _return_T_done_\@
_T_4_\@:
        movd    %xmm0, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        psrldq  $4, %xmm0
        cmp     $0, %r11
        je      _return_T_done_\@
_T_123_\@:
        movd    %xmm0, %eax
        cmp     $2, %r11
        jl      _T_1_\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done_\@
        add     $2, %r10
        sar     $16, %eax
_T_1_\@:
        mov     %al, (%r10)
        jmp     _return_T_done_\@
_T_16_\@:
        movdqu  %xmm0, (%r10)
_return_T_done_\@:
.endm
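
# The tag computed above is E(K, Y0) XOR GHASH(..., len(A)||len(C)).  A C
# sketch of the final length block (illustrative only; lanes shown in the
# byte-reflected GHASH domain used by this file):
#
#       u64 len_block[2];
#       len_block[0] = aad_len_bytes * 8;   /* len(A) in bits, high half */
#       len_block[1] = in_len_bytes  * 8;   /* len(C) in bits, low half  */
#       ghash_update(&hash, len_block);
#       tag = aes_encrypt(key, y0) ^ hash;  /* truncated to auth_tag_len */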

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa    \GH, \TMP1
        pshufd    $78, \GH, \TMP2
        pshufd    $78, \HK, \TMP3
        pxor      \GH, \TMP2            # TMP2 = a1+a0
        pxor      \HK, \TMP3            # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor      \GH, \TMP2
        pxor      \TMP1, \TMP2          # TMP2 = a1*b0 + a0*b1 (middle terms)
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \GH
        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

        movdqa    \GH, \TMP2
        movdqa    \GH, \TMP3
        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        pslld     $31, \TMP2            # packed left shift <<31
        pslld     $30, \TMP3            # packed left shift <<30
        pslld     $25, \TMP4            # packed left shift <<25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \GH

        # second phase of the reduction

        movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        movdqa    \GH,\TMP3
        movdqa    \GH,\TMP4
        psrld     $1,\TMP2              # packed right shift >>1
        psrld     $2,\TMP3              # packed right shift >>2
        psrld     $7,\TMP4              # packed right shift >>7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \GH
        pxor      \TMP1, \GH            # result is in GH
.endm
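
/* For reference, the operation GHASH_MUL implements, written out as the
 * bit-at-a-time multiplication in GF(2^128) from the GCM spec (a hedged
 * C sketch for clarity; the macro above uses Karatsuba with PCLMULQDQ and
 * a two-phase shift/xor reduction instead):
 *
 *      static void gf128_mul(u64 xh, u64 xl, u64 yh, u64 yl,
 *                            u64 *zh, u64 *zl)
 *      {
 *              u64 vh = yh, vl = yl, rh = 0, rl = 0;
 *              int i, lsb;
 *
 *              for (i = 0; i < 128; i++) {
 *                      u64 bit = (i < 64) ? (xh >> (63 - i)) & 1
 *                                         : (xl >> (127 - i)) & 1;
 *                      if (bit) {
 *                              rh ^= vh;
 *                              rl ^= vl;
 *                      }
 *                      lsb = vl & 1;
 *                      vl = (vl >> 1) | (vh << 63);
 *                      vh >>= 1;
 *                      if (lsb)
 *                              vh ^= 0xe100000000000000ULL; // reflected poly
 *              }
 *              *zh = rh;
 *              *zl = rl;
 *      }
 */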

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        MOVQ_R64_XMM %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
        xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        MOVQ_R64_XMM %rax, \XMM1
        pslldq $8, \XMM1
        por \XMM1, \XMMDst
        jmp _done_read_partial_block_\@
_read_lt8_\@:
        xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
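
# Semantically, READ_PARTIAL_BLOCK is a bounded read into a zero-padded
# 16-byte block: it never loads a byte past DPTR[DLEN-1].  A C sketch of
# the contract (illustrative only; the asm assembles the tail in %rax a
# byte at a time instead of calling memcpy):
#
#       static void read_partial_block(const u8 *dptr, size_t dlen,
#                                      u8 dst[16])
#       {
#               memset(dst, 0, 16);     /* 0 < dlen < 16 */
#               memcpy(dst, dptr, dlen);
#       }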

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
        TMP6 TMP7
        MOVADQ     SHUF_MASK(%rip), %xmm14
        mov        \AAD, %r10           # %r10 = AAD
        mov        \AADLEN, %r11                # %r11 = aadLen
        pxor       \TMP7, \TMP7
        pxor       \TMP6, \TMP6

        cmp        $16, %r11
        jl         _get_AAD_rest\@
_get_AAD_blocks\@:
        movdqu     (%r10), \TMP7
        PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
        pxor       \TMP7, \TMP6
        GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
        add        $16, %r10
        sub        $16, %r11
        cmp        $16, %r11
        jge        _get_AAD_blocks\@

        movdqu     \TMP6, \TMP7

        /* read the last <16B of AAD */
_get_AAD_rest\@:
        cmp        $0, %r11
        je         _get_AAD_done\@

        READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
        PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
        pxor       \TMP6, \TMP7
        GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
        movdqu \TMP7, \TMP6

_get_AAD_done\@:
        movdqu \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH operation
        mov     PBlockLen(%arg2), %r13
        cmp     $0, %r13
        je      _partial_block_done_\@  # Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp     $16, \PLAIN_CYPH_LEN
        jl      _fewer_than_16_bytes_\@
        movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
        jmp     _data_read_\@

_fewer_than_16_bytes_\@:
        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov     \PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

        mov PBlockLen(%arg2), %r13

_data_read_\@:                          # Finished reading in data

        movdqu  PBlockEncKey(%arg2), %xmm9
        movdqu  HashKey(%arg2), %xmm13

        lea     SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        add     %r13, %r12
        movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm9         # shift right r13 bytes

.ifc \operation, dec
        movdqa  %xmm1, %xmm3
        pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_1_\@
        sub     %r10, %r12
_no_extra_mask_1_\@:

        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9

        pand    %xmm1, %xmm3
        movdqa  SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM      %xmm10, %xmm3
        PSHUFB_XMM      %xmm2, %xmm3
        pxor    %xmm3, \AAD_HASH

        cmp     $0, %r10
        jl      _partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %rax,%rax

        mov     %rax, PBlockLen(%arg2)
        jmp     _dec_done_\@
_partial_incomplete_1_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
        movdqu  \AAD_HASH, AadHash(%arg2)
.else
        pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_2_\@
        sub     %r10, %r12
_no_extra_mask_2_\@:

        movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        pand    %xmm1, %xmm9

        movdqa  SHUF_MASK(%rip), %xmm1
        PSHUFB_XMM %xmm1, %xmm9
        PSHUFB_XMM %xmm2, %xmm9
        pxor    %xmm9, \AAD_HASH

        cmp     $0, %r10
        jl      _partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %rax,%rax

        mov     %rax, PBlockLen(%arg2)
        jmp     _encode_done_\@
_partial_incomplete_2_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
        movdqu  \AAD_HASH, AadHash(%arg2)

        movdqa  SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        PSHUFB_XMM      %xmm10, %xmm9
        PSHUFB_XMM      %xmm2, %xmm9
.endif
        # output encrypted Bytes
        cmp     $0, %r10
        jl      _partial_fill_\@
        mov     %r13, %r12
        mov     $16, %r13
        # Set r13 to be the number of bytes to write out
        sub     %r12, %r13
        jmp     _count_set_\@
_partial_fill_\@:
        mov     \PLAIN_CYPH_LEN, %r13
_count_set_\@:
        movdqa  %xmm9, %xmm0
        MOVQ_R64_XMM    %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_\@

        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $8, \DATA_OFFSET
        psrldq  $8, %xmm0
        MOVQ_R64_XMM    %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_\@:
        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $1, \DATA_OFFSET
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3 are used as pointers only, not modified
*/
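
/*
* Worked example: for a = 100 plaintext bytes, b = floor(100/16) = 6 full
* blocks, so num_initial_blocks = 6 mod 4 = 2; the remaining 4 full blocks
* go through the by-4 loop and the final 4 bytes take the partial path.
*/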


.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
        XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        MOVADQ          SHUF_MASK(%rip), %xmm14

        movdqu AadHash(%arg2), %xmm\i               # xmm\i = AAD hash

        # start AES for num_initial_blocks blocks

        movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

        MOVADQ          ONE(%RIP),\TMP1
        MOVADQ          0(%arg1),\TMP2
.irpc index, \i_seq
        paddd           \TMP1, \XMM0                 # INCR Y0
.ifc \operation, dec
        movdqa     \XMM0, %xmm\index
.else
        MOVADQ          \XMM0, %xmm\index
.endif
        PSHUFB_XMM      %xmm14, %xmm\index      # perform a 16 byte swap
        pxor            \TMP2, %xmm\index
.endr
        lea     0x10(%arg1),%r10
        mov     keysize,%eax
        shr     $2,%eax                         # 128->4, 192->6, 256->8
        add     $5,%eax                       # 128->9, 192->11, 256->13

aes_loop_initial_\@:
        MOVADQ  (%r10),\TMP1
.irpc   index, \i_seq
        AESENC  \TMP1, %xmm\index
.endr
        add     $16,%r10
        sub     $1,%eax
        jnz     aes_loop_initial_\@

        MOVADQ  (%r10), \TMP1
.irpc index, \i_seq
        AESENCLAST \TMP1, %xmm\index         # Last Round
.endr
.irpc index, \i_seq
        movdqu     (%arg4 , %r11, 1), \TMP1
        pxor       \TMP1, %xmm\index
        movdqu     %xmm\index, (%arg3 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add        $16, %r11

.ifc \operation, dec
        movdqa     \TMP1, %xmm\index
.endif
        PSHUFB_XMM         %xmm14, %xmm\index

                # prepare plaintext/ciphertext for GHASH computation
.endr
.endif

        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
        GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp        $64, %r13
        jl      _initial_blocks_done\@
        # no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
        MOVADQ     ONE(%RIP),\TMP1
        paddd      \TMP1, \XMM0              # INCR Y0
        MOVADQ     \XMM0, \XMM1
        PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap

        paddd      \TMP1, \XMM0              # INCR Y0
        MOVADQ     \XMM0, \XMM2
        PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap

        paddd      \TMP1, \XMM0              # INCR Y0
        MOVADQ     \XMM0, \XMM3
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

        paddd      \TMP1, \XMM0              # INCR Y0
        MOVADQ     \XMM0, \XMM4
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

        MOVADQ     0(%arg1),\TMP1
        pxor       \TMP1, \XMM1
        pxor       \TMP1, \XMM2
        pxor       \TMP1, \XMM3
        pxor       \TMP1, \XMM4
.irpc index, 1234 # do 4 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
.irpc index, 56789 # do next 5 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
        lea        0xa0(%arg1),%r10
        mov        keysize,%eax
        shr        $2,%eax                      # 128->4, 192->6, 256->8
        sub        $4,%eax                      # 128->0, 192->2, 256->4
        jz         aes_loop_pre_done\@

aes_loop_pre_\@:
        MOVADQ     (%r10),\TMP2
.irpc   index, 1234
        AESENC     \TMP2, %xmm\index
.endr
        add        $16,%r10
        sub        $1,%eax
        jnz        aes_loop_pre_\@

aes_loop_pre_done\@:
        MOVADQ     (%r10), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu     16*0(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM1
.ifc \operation, dec
        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqa     \TMP1, \XMM1
.endif
        movdqu     16*1(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM2
.ifc \operation, dec
        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqa     \TMP1, \XMM2
.endif
        movdqu     16*2(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM3
.ifc \operation, dec
        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqa     \TMP1, \XMM3
.endif
        movdqu     16*3(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM4
.ifc \operation, dec
        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
        movdqa     \TMP1, \XMM4
.else
        movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
        movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
        movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
        movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

        add        $64, %r11
        PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
        pxor       \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
        PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\@:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    HashKey_4(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps 0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1            # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax                       # 128->4, 192->6, 256->8
        sub       $4,%eax                       # 128->0, 192->2, 256->4
        jz        aes_loop_par_enc_done\@

aes_loop_par_enc\@:
        MOVADQ    (%r10),\TMP3
.irpc   index, 1234
        AESENC    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       aes_loop_par_enc\@

aes_loop_par_enc_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1           # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
        movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
        movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
        movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2                   # packed left shift << 31
        pslld     $30, \TMP3                   # packed left shift << 30
        pslld     $25, \TMP4                   # packed left shift << 25
        pxor      \TMP3, \TMP2                 # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5                    # right shift T5 1 DW
        pslldq    $12, \TMP2                   # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2                    # packed right shift >>1
        psrld     $2, \TMP3                    # packed right shift >>2
        psrld     $7, \TMP4                    # packed right shift >>7
        pxor      \TMP3,\TMP2                  # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5                 # result is in XMM5

        pxor      \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg3, %arg4 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa    \XMM1, \XMM5
        movdqa    \XMM2, \XMM6
        movdqa    \XMM3, \XMM7
        movdqa    \XMM4, \XMM8

        movdqa    SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa    \XMM5, \TMP4
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    HashKey_4(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM2
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM3
        paddd     ONE(%rip), \XMM0              # INCR CNT
        movdqa    \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      (%arg1), \XMM1
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
        movdqa    HashKey_4_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 1
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movaps 0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 2
        AESENC    \TMP1, \XMM2
        AESENC    \TMP1, \XMM3
        AESENC    \TMP1, \XMM4
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
        movdqa    HashKey_3(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 3
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
        movaps 0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 4
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_3_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 5
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM6, \XMM5
        pxor      \TMP2, \TMP6
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
        movdqa    HashKey_2(%arg2), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
        movaps 0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 6
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
        movaps 0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 7
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        movdqa    HashKey_2_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 8
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        pxor      \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor      \XMM7, \XMM5
        pxor      \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
        movdqa    HashKey(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1            # Round 9
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
        PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
        lea       0xa0(%arg1),%r10
        mov       keysize,%eax
        shr       $2,%eax                       # 128->4, 192->6, 256->8
        sub       $4,%eax                       # 128->0, 192->2, 256->4
        jz        aes_loop_par_dec_done\@

aes_loop_par_dec\@:
        MOVADQ    (%r10),\TMP3
.irpc   index, 1234
        AESENC    \TMP3, %xmm\index
.endr
        add       $16,%r10
        sub       $1,%eax
        jnz       aes_loop_par_dec\@

aes_loop_par_dec_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1           # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
        movdqa    \TMP3, \XMM1
        movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM2
        movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM3
        movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor      \TMP4, \TMP1
        pxor      \XMM8, \XMM5
        pxor      \TMP6, \TMP2
        pxor      \TMP1, \TMP2
        pxor      \XMM5, \TMP2
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
        psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
        pxor      \TMP3, \XMM5
        pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa    \XMM5, \TMP2
        movdqa    \XMM5, \TMP3
        movdqa    \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2                   # packed left shift << 31
        pslld     $30, \TMP3                   # packed left shift << 30
        pslld     $25, \TMP4                   # packed left shift << 25
        pxor      \TMP3, \TMP2                 # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5                    # right shift T5 1 DW
        pslldq    $12, \TMP2                   # left shift T2 3 DWs
        pxor      \TMP2, \XMM5

        # second phase of reduction

        movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa    \XMM5,\TMP3
        movdqa    \XMM5,\TMP4
        psrld     $1, \TMP2                    # packed right shift >>1
        psrld     $2, \TMP3                    # packed right shift >>2
        psrld     $7, \TMP4                    # packed right shift >>7
        pxor      \TMP3,\TMP2                  # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \XMM5
        pxor      \TMP1, \XMM5                 # result is in XMM5

        pxor      \XMM5, \XMM1
.endm
1398
1399/* GHASH the last 4 ciphertext blocks. */
1400.macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1401TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1402
1403        # Multiply TMP6 * HashKey (using Karatsuba)
1404
1405        movdqa    \XMM1, \TMP6
1406        pshufd    $78, \XMM1, \TMP2
1407        pxor      \XMM1, \TMP2
1408        movdqa    HashKey_4(%arg2), \TMP5
1409        PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1410        PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1411        movdqa    HashKey_4_k(%arg2), \TMP4
1412        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1413        movdqa    \XMM1, \XMMDst
1414        movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1415
1416        # Multiply TMP1 * HashKey (using Karatsuba)
1417
1418        movdqa    \XMM2, \TMP1
1419        pshufd    $78, \XMM2, \TMP2
1420        pxor      \XMM2, \TMP2
1421        movdqa    HashKey_3(%arg2), \TMP5
1422        PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1423        PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1424        movdqa    HashKey_3_k(%arg2), \TMP4
1425        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1426        pxor      \TMP1, \TMP6
1427        pxor      \XMM2, \XMMDst
1428        pxor      \TMP2, \XMM1
1429# results accumulated in TMP6, XMMDst, XMM1
1430
1431        # Multiply TMP1 * HashKey (using Karatsuba)
1432
1433        movdqa    \XMM3, \TMP1
1434        pshufd    $78, \XMM3, \TMP2
1435        pxor      \XMM3, \TMP2
1436        movdqa    HashKey_2(%arg2), \TMP5
1437        PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1438        PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1439        movdqa    HashKey_2_k(%arg2), \TMP4
1440        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1441        pxor      \TMP1, \TMP6
1442        pxor      \XMM3, \XMMDst
1443        pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1444
1445        # Multiply TMP1 * HashKey (using Karatsuba)
1446        movdqa    \XMM4, \TMP1
1447        pshufd    $78, \XMM4, \TMP2
1448        pxor      \XMM4, \TMP2
1449        movdqa    HashKey(%arg2), \TMP5
1450        PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
1451        PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1452        movdqa    HashKey_k(%arg2), \TMP4
1453        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1454        pxor      \TMP1, \TMP6
1455        pxor      \XMM4, \XMMDst
1456        pxor      \XMM1, \TMP2
1457        pxor      \TMP6, \TMP2
1458        pxor      \XMMDst, \TMP2
1459        # middle section of the temp results combined as in the Karatsuba algorithm
1460        movdqa    \TMP2, \TMP4
1461        pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1462        psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1463        pxor      \TMP4, \XMMDst
1464        pxor      \TMP2, \TMP6
1465# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1466        # first phase of the reduction
1467        movdqa    \XMMDst, \TMP2
1468        movdqa    \XMMDst, \TMP3
1469        movdqa    \XMMDst, \TMP4
1470# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1471        pslld     $31, \TMP2                # packed left shifting << 31
1472        pslld     $30, \TMP3                # packed left shifting << 30
1473        pslld     $25, \TMP4                # packed left shifting << 25
1474        pxor      \TMP3, \TMP2              # xor the shifted versions
1475        pxor      \TMP4, \TMP2
1476        movdqa    \TMP2, \TMP7
1477        psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1478        pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1479        pxor      \TMP2, \XMMDst
1480
1481        # second phase of the reduction
1482        movdqa    \XMMDst, \TMP2
1483        # make 3 copies of XMMDst for doing 3 shift operations
1484        movdqa    \XMMDst, \TMP3
1485        movdqa    \XMMDst, \TMP4
1486        psrld     $1, \TMP2                 # packed right shift >> 1
1487        psrld     $2, \TMP3                 # packed right shift >> 2
1488        psrld     $7, \TMP4                 # packed right shift >> 7
1489        pxor      \TMP3, \TMP2              # xor the shifted versions
1490        pxor      \TMP4, \TMP2
1491        pxor      \TMP7, \TMP2
1492        pxor      \TMP2, \XMMDst
1493        pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1494.endm
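/*
 * Illustration only: a C-intrinsics transliteration of the two shift-based
 * reduction phases above, folding the 256-bit product hi:lo modulo
 * x^128 + x^127 + x^126 + x^121 + 1 (names are ours, not the kernel's).
 *
 *	#include <immintrin.h>
 *
 *	static __m128i ghash_reduce(__m128i lo, __m128i hi)
 *	{
 *		// first phase: per-dword left shifts by 31, 30 and 25
 *		__m128i t = _mm_xor_si128(_mm_slli_epi32(lo, 31),
 *			    _mm_xor_si128(_mm_slli_epi32(lo, 30),
 *					  _mm_slli_epi32(lo, 25)));
 *		__m128i t7 = _mm_srli_si128(t, 4);	// saved dword (TMP7)
 *
 *		lo = _mm_xor_si128(lo, _mm_slli_si128(t, 12));
 *
 *		// second phase: per-dword right shifts by 1, 2 and 7
 *		t = _mm_xor_si128(_mm_srli_epi32(lo, 1),
 *		    _mm_xor_si128(_mm_srli_epi32(lo, 2),
 *				  _mm_srli_epi32(lo, 7)));
 *		return _mm_xor_si128(hi,
 *		       _mm_xor_si128(lo, _mm_xor_si128(t, t7)));
 *	}
 */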
1495
1496
1497/* Encryption of a single block
1498* uses %eax and %r10
1499*/
1500
1501.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1502
1503        pxor            (%arg1), \XMM0
1504        mov             keysize,%eax
1505        shr             $2,%eax                 # 128->4, 192->6, 256->8
1506        add             $5,%eax                 # 128->9, 192->11, 256->13
1507        lea             16(%arg1), %r10   # get first expanded key address
1508
1509_esb_loop_\@:
1510        MOVADQ          (%r10),\TMP1
1511        AESENC          \TMP1,\XMM0
1512        add             $16,%r10
1513        sub             $1,%eax
1514        jnz             _esb_loop_\@
1515
1516        MOVADQ          (%r10),\TMP1
1517        AESENCLAST      \TMP1,\XMM0
1518.endm
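/*
 * Illustration only: the same per-key-size round count in C.  keysize is the
 * key length in bytes (16/24/32), so keysize/4 + 5 yields the 9/11/13 AESENC
 * rounds counted above, followed by one AESENCLAST (sketch; names are ours).
 *
 *	#include <immintrin.h>		// compile with -maes
 *
 *	static __m128i aes_encrypt_block(const __m128i *rk, int keysize,
 *					 __m128i b)
 *	{
 *		int rounds = (keysize >> 2) + 5;
 *
 *		b = _mm_xor_si128(b, rk[0]);		// round 0 whitening
 *		for (int i = 1; i <= rounds; i++)
 *			b = _mm_aesenc_si128(b, rk[i]);
 *		return _mm_aesenclast_si128(b, rk[rounds + 1]);
 *	}
 */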
1519/*****************************************************************************
1520* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1521*                   struct gcm_context_data *data
1522*                                      // Context data
1523*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1524*                   const u8 *in,      // Ciphertext input
1525*                   u64 plaintext_len, // Length of data in bytes for decryption.
1526*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1527*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1529*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530*                   const u8 *aad,     // Additional Authentication Data (AAD)
1531*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1533*                                      // given authentication tag and only return the plaintext if they match.
1534*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535*                                      // (most likely), 12 or 8.
1536*
1537* Assumptions:
1538*
1539* keys:
1540*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1541*       set of 11 keys in the data structure void *aes_ctx
1542*
1543* iv:
1544*       0                   1                   2                   3
1545*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547*       |                             Salt  (From the SA)               |
1548*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*       |                     Initialization Vector                     |
1550*       |         (This is the sequence number from IPSec header)       |
1551*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552*       |                              0x1                              |
1553*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554*
1555*
1556*
1557* AAD:
1558*       AAD padded to 128 bits with 0
1559*       for example, assume AAD is a u32 vector
1560*
1561*       if AAD is 8 bytes:
1562*       AAD[3] = {A0, A1};
1563*       padded AAD in xmm register = {A1 A0 0 0}
1564*
1565*       0                   1                   2                   3
1566*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568*       |                               SPI (A1)                        |
1569*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570*       |                     32-bit Sequence Number (A0)               |
1571*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572*       |                              0x0                              |
1573*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574*
1575*                                       AAD Format with 32-bit Sequence Number
1576*
1577*       if AAD is 12 bytes:
1578*       AAD[3] = {A0, A1, A2};
1579*       padded AAD in xmm register = {A2 A1 A0 0}
1580*
1581*       0                   1                   2                   3
1582*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*       |                               SPI (A2)                        |
1587*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*       |                 64-bit Extended Sequence Number {A1,A0}       |
1589*       |                                                               |
1590*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591*       |                              0x0                              |
1592*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593*
1594*                        AAD Format with 64-bit Extended Sequence Number
1595*
1596* poly = x^128 + x^127 + x^126 + x^121 + 1
1597*
1598*****************************************************************************/
1599ENTRY(aesni_gcm_dec)
1600        FUNC_SAVE
1601
1602        GCM_INIT %arg6, arg7, arg8, arg9
1603        GCM_ENC_DEC dec
1604        GCM_COMPLETE arg10, arg11
1605        FUNC_RESTORE
1606        ret
1607ENDPROC(aesni_gcm_dec)
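/*
 * Illustration only: packing the pre-counter block j0 exactly as drawn in
 * the iv diagram above -- 4-byte salt, 8-byte IV, big-endian 0x00000001
 * (helper name is ours).
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void rfc4106_build_j0(uint8_t j0[16], const uint8_t salt[4],
 *				     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// Salt (from the SA)
 *		memcpy(j0 + 4, iv, 8);		// Initialization Vector
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;			// trailing 0x00000001
 *	}
 */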
1608
1609
1610/*****************************************************************************
1611* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1612*                    struct gcm_context_data *data
1613*                                        // Context data
1614*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1615*                    const u8 *in,       // Plaintext input
1616*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1617*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1618*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1620*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621*                    const u8 *aad,      // Additional Authentication Data (AAD)
1622*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623*                    u8 *auth_tag,       // Authenticated Tag output.
1624*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625*                                        // 12 or 8.
1626*
1627* Assumptions:
1628*
1629* keys:
1630*       keys are pre-expanded and aligned to 16 bytes. we are using the
1631*       first set of 11 keys in the data structure void *aes_ctx
1632*
1633*
1634* iv:
1635*       0                   1                   2                   3
1636*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638*       |                             Salt  (From the SA)               |
1639*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*       |                     Initialization Vector                     |
1641*       |         (This is the sequence number from IPSec header)       |
1642*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643*       |                              0x1                              |
1644*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645*
1646*
1647*
1648* AAD:
1649*       AAD padded to 128 bits with 0
1650*       for example, assume AAD is a u32 vector
1651*
1652*       if AAD is 8 bytes:
1653*       AAD[3] = {A0, A1};
1654*       padded AAD in xmm register = {A1 A0 0 0}
1655*
1656*       0                   1                   2                   3
1657*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659*       |                               SPI (A1)                        |
1660*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661*       |                     32-bit Sequence Number (A0)               |
1662*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663*       |                              0x0                              |
1664*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665*
1666*                                 AAD Format with 32-bit Sequence Number
1667*
1668*       if AAD is 12 bytes:
1669*       AAD[3] = {A0, A1, A2};
1670*       padded AAD in xmm register = {A2 A1 A0 0}
1671*
1672*       0                   1                   2                   3
1673*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675*       |                               SPI (A2)                        |
1676*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*       |                 64-bit Extended Sequence Number {A1,A0}       |
1678*       |                                                               |
1679*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680*       |                              0x0                              |
1681*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682*
1683*                         AAD Format with 64-bit Extended Sequence Number
1684*
1685* poly = x^128 + x^127 + x^126 + x^121 + 1
1686***************************************************************************/
1687ENTRY(aesni_gcm_enc)
1688        FUNC_SAVE
1689
1690        GCM_INIT %arg6, arg7, arg8, arg9
1691        GCM_ENC_DEC enc
1692
1693        GCM_COMPLETE arg10, arg11
1694        FUNC_RESTORE
1695        ret
1696ENDPROC(aesni_gcm_enc)
1697
1698/*****************************************************************************
1699* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1700*                     struct gcm_context_data *data,
1701*                                         // context data
1702*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1703*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1704*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1705*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1706*                     const u8 *aad,      // Additional Authentication Data (AAD)
1707*                     u64 aad_len)        // Length of AAD in bytes.
1708*/
1709ENTRY(aesni_gcm_init)
1710        FUNC_SAVE
1711        GCM_INIT %arg3, %arg4, %arg5, %arg6
1712        FUNC_RESTORE
1713        ret
1714ENDPROC(aesni_gcm_init)
1715
1716/*****************************************************************************
1717* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1718*                    struct gcm_context_data *data,
1719*                                        // context data
1720*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1721*                    const u8 *in,       // Plaintext input
1722*                    u64 plaintext_len); // Length of data in bytes for encryption.
1723*/
1724ENTRY(aesni_gcm_enc_update)
1725        FUNC_SAVE
1726        GCM_ENC_DEC enc
1727        FUNC_RESTORE
1728        ret
1729ENDPROC(aesni_gcm_enc_update)
1730
1731/*****************************************************************************
1732* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1733*                    struct gcm_context_data *data,
1734*                                        // context data
1735*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1736*                    const u8 *in,       // Ciphertext input
1737*                    u64 plaintext_len); // Length of data in bytes for decryption.
1738*/
1739ENTRY(aesni_gcm_dec_update)
1740        FUNC_SAVE
1741        GCM_ENC_DEC dec
1742        FUNC_RESTORE
1743        ret
1744ENDPROC(aesni_gcm_dec_update)
1745
1746/*****************************************************************************
1747* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1748*                    struct gcm_context_data *data,
1749*                                        // context data
1750*                    u8 *auth_tag,       // Authenticated Tag output.
1751*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1752*                                        // 12 or 8.
1753*/
1754ENTRY(aesni_gcm_finalize)
1755        FUNC_SAVE
1756        GCM_COMPLETE %arg3, %arg4
1757        FUNC_RESTORE
1758        ret
1759ENDPROC(aesni_gcm_finalize)
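/*
 * Illustration only: how the three scatter/gather entry points above chain
 * together for one encryption, per the prototypes in their headers.  The
 * wrapper name is ours and struct gcm_context_data is left opaque here.
 *
 *	typedef unsigned char u8;
 *	typedef unsigned long long u64;
 *	struct gcm_context_data;	// defined by the C glue code
 *
 *	extern void aesni_gcm_init(void *aes_ctx, struct gcm_context_data *d,
 *				   u8 *iv, u8 *hash_subkey,
 *				   const u8 *aad, u64 aad_len);
 *	extern void aesni_gcm_enc_update(void *aes_ctx,
 *					 struct gcm_context_data *d,
 *					 u8 *out, const u8 *in, u64 len);
 *	extern void aesni_gcm_finalize(void *aes_ctx,
 *				       struct gcm_context_data *d,
 *				       u8 *auth_tag, u64 auth_tag_len);
 *
 *	static void gcm_encrypt_once(void *ctx, struct gcm_context_data *d,
 *				     u8 *iv, u8 *hash_subkey,
 *				     const u8 *aad, u64 aad_len, u8 *out,
 *				     const u8 *in, u64 len, u8 tag[16])
 *	{
 *		aesni_gcm_init(ctx, d, iv, hash_subkey, aad, aad_len);
 *		aesni_gcm_enc_update(ctx, d, out, in, len); // repeatable
 *		aesni_gcm_finalize(ctx, d, tag, 16);
 *	}
 */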
1760
1761#endif
1762
1763
1764.align 4
1765_key_expansion_128:
1766_key_expansion_256a:
1767        pshufd $0b11111111, %xmm1, %xmm1
1768        shufps $0b00010000, %xmm0, %xmm4
1769        pxor %xmm4, %xmm0
1770        shufps $0b10001100, %xmm0, %xmm4
1771        pxor %xmm4, %xmm0
1772        pxor %xmm1, %xmm0
1773        movaps %xmm0, (TKEYP)
1774        add $0x10, TKEYP
1775        ret
1776ENDPROC(_key_expansion_128)
1777ENDPROC(_key_expansion_256a)
1778
1779.align 4
1780_key_expansion_192a:
1781        pshufd $0b01010101, %xmm1, %xmm1
1782        shufps $0b00010000, %xmm0, %xmm4
1783        pxor %xmm4, %xmm0
1784        shufps $0b10001100, %xmm0, %xmm4
1785        pxor %xmm4, %xmm0
1786        pxor %xmm1, %xmm0
1787
1788        movaps %xmm2, %xmm5
1789        movaps %xmm2, %xmm6
1790        pslldq $4, %xmm5
1791        pshufd $0b11111111, %xmm0, %xmm3
1792        pxor %xmm3, %xmm2
1793        pxor %xmm5, %xmm2
1794
1795        movaps %xmm0, %xmm1
1796        shufps $0b01000100, %xmm0, %xmm6
1797        movaps %xmm6, (TKEYP)
1798        shufps $0b01001110, %xmm2, %xmm1
1799        movaps %xmm1, 0x10(TKEYP)
1800        add $0x20, TKEYP
1801        ret
1802ENDPROC(_key_expansion_192a)
1803
1804.align 4
1805_key_expansion_192b:
1806        pshufd $0b01010101, %xmm1, %xmm1
1807        shufps $0b00010000, %xmm0, %xmm4
1808        pxor %xmm4, %xmm0
1809        shufps $0b10001100, %xmm0, %xmm4
1810        pxor %xmm4, %xmm0
1811        pxor %xmm1, %xmm0
1812
1813        movaps %xmm2, %xmm5
1814        pslldq $4, %xmm5
1815        pshufd $0b11111111, %xmm0, %xmm3
1816        pxor %xmm3, %xmm2
1817        pxor %xmm5, %xmm2
1818
1819        movaps %xmm0, (TKEYP)
1820        add $0x10, TKEYP
1821        ret
1822ENDPROC(_key_expansion_192b)
1823
1824.align 4
1825_key_expansion_256b:
1826        pshufd $0b10101010, %xmm1, %xmm1
1827        shufps $0b00010000, %xmm2, %xmm4
1828        pxor %xmm4, %xmm2
1829        shufps $0b10001100, %xmm2, %xmm4
1830        pxor %xmm4, %xmm2
1831        pxor %xmm1, %xmm2
1832        movaps %xmm2, (TKEYP)
1833        add $0x10, TKEYP
1834        ret
1835ENDPROC(_key_expansion_256b)
1836
1837/*
1838 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1839 *                   unsigned int key_len)
1840 */
1841ENTRY(aesni_set_key)
1842        FRAME_BEGIN
1843#ifndef __x86_64__
1844        pushl KEYP
1845        movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1846        movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1847        movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1848#endif
1849        movups (UKEYP), %xmm0           # user key (first 16 bytes)
1850        movaps %xmm0, (KEYP)
1851        lea 0x10(KEYP), TKEYP           # key addr
1852        movl %edx, 480(KEYP)
1853        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1854        cmp $24, %dl
1855        jb .Lenc_key128
1856        je .Lenc_key192
1857        movups 0x10(UKEYP), %xmm2       # other user key
1858        movaps %xmm2, (TKEYP)
1859        add $0x10, TKEYP
1860        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1861        call _key_expansion_256a
1862        AESKEYGENASSIST 0x1 %xmm0 %xmm1
1863        call _key_expansion_256b
1864        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1865        call _key_expansion_256a
1866        AESKEYGENASSIST 0x2 %xmm0 %xmm1
1867        call _key_expansion_256b
1868        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1869        call _key_expansion_256a
1870        AESKEYGENASSIST 0x4 %xmm0 %xmm1
1871        call _key_expansion_256b
1872        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1873        call _key_expansion_256a
1874        AESKEYGENASSIST 0x8 %xmm0 %xmm1
1875        call _key_expansion_256b
1876        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1877        call _key_expansion_256a
1878        AESKEYGENASSIST 0x10 %xmm0 %xmm1
1879        call _key_expansion_256b
1880        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1881        call _key_expansion_256a
1882        AESKEYGENASSIST 0x20 %xmm0 %xmm1
1883        call _key_expansion_256b
1884        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1885        call _key_expansion_256a
1886        jmp .Ldec_key
1887.Lenc_key192:
1888        movq 0x10(UKEYP), %xmm2         # other user key
1889        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1890        call _key_expansion_192a
1891        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1892        call _key_expansion_192b
1893        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1894        call _key_expansion_192a
1895        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1896        call _key_expansion_192b
1897        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1898        call _key_expansion_192a
1899        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1900        call _key_expansion_192b
1901        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1902        call _key_expansion_192a
1903        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
1904        call _key_expansion_192b
1905        jmp .Ldec_key
1906.Lenc_key128:
1907        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
1908        call _key_expansion_128
1909        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
1910        call _key_expansion_128
1911        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
1912        call _key_expansion_128
1913        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
1914        call _key_expansion_128
1915        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
1916        call _key_expansion_128
1917        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
1918        call _key_expansion_128
1919        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
1920        call _key_expansion_128
1921        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
1922        call _key_expansion_128
1923        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
1924        call _key_expansion_128
1925        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
1926        call _key_expansion_128
1927.Ldec_key:
1928        sub $0x10, TKEYP
1929        movaps (KEYP), %xmm0
1930        movaps (TKEYP), %xmm1
1931        movaps %xmm0, 240(TKEYP)
1932        movaps %xmm1, 240(KEYP)
1933        add $0x10, KEYP
1934        lea 240-16(TKEYP), UKEYP
1935.align 4
1936.Ldec_key_loop:
1937        movaps (KEYP), %xmm0
1938        AESIMC %xmm0 %xmm1
1939        movaps %xmm1, (UKEYP)
1940        add $0x10, KEYP
1941        sub $0x10, UKEYP
1942        cmp TKEYP, KEYP
1943        jb .Ldec_key_loop
1944        xor AREG, AREG
1945#ifndef __x86_64__
1946        popl KEYP
1947#endif
1948        FRAME_END
1949        ret
1950ENDPROC(aesni_set_key)
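/*
 * Illustration only: one AES-128 schedule step in C intrinsics.  The three
 * byte-shift XOR folds below compute the same prefix XOR that the
 * zeroed-xmm4 shufps trick in _key_expansion_128 produces; kga is the
 * AESKEYGENASSIST result for the previous round key (names are ours).
 *
 *	#include <immintrin.h>		// compile with -maes
 *
 *	static __m128i key_expand_128_step(__m128i prev, __m128i kga)
 *	{
 *		kga  = _mm_shuffle_epi32(kga, 0xff); // SubWord(RotWord)^rcon
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		return _mm_xor_si128(prev, kga);
 *	}
 *
 * round 1 of .Lenc_key128 then corresponds to:
 *
 *	rk[1] = key_expand_128_step(rk[0],
 *			_mm_aeskeygenassist_si128(rk[0], 0x1));
 */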
1951
1952/*
1953 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1954 */
1955ENTRY(aesni_enc)
1956        FRAME_BEGIN
1957#ifndef __x86_64__
1958        pushl KEYP
1959        pushl KLEN
1960        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1961        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1962        movl (FRAME_OFFSET+20)(%esp), INP       # src
1963#endif
1964        movl 480(KEYP), KLEN            # key length
1965        movups (INP), STATE             # input
1966        call _aesni_enc1
1967        movups STATE, (OUTP)            # output
1968#ifndef __x86_64__
1969        popl KLEN
1970        popl KEYP
1971#endif
1972        FRAME_END
1973        ret
1974ENDPROC(aesni_enc)
1975
1976/*
1977 * _aesni_enc1:         internal ABI
1978 * input:
1979 *      KEYP:           key struct pointer
1980 *      KLEN:           key length
1981 *      STATE:          initial state (input)
1982 * output:
1983 *      STATE:          final state (output)
1984 * changed:
1985 *      KEY
1986 *      TKEYP (T1)
1987 */
1988.align 4
1989_aesni_enc1:
1990        movaps (KEYP), KEY              # key
1991        mov KEYP, TKEYP
1992        pxor KEY, STATE         # round 0
1993        add $0x30, TKEYP
1994        cmp $24, KLEN
1995        jb .Lenc128
1996        lea 0x20(TKEYP), TKEYP
1997        je .Lenc192
1998        add $0x20, TKEYP
1999        movaps -0x60(TKEYP), KEY
2000        AESENC KEY STATE
2001        movaps -0x50(TKEYP), KEY
2002        AESENC KEY STATE
2003.align 4
2004.Lenc192:
2005        movaps -0x40(TKEYP), KEY
2006        AESENC KEY STATE
2007        movaps -0x30(TKEYP), KEY
2008        AESENC KEY STATE
2009.align 4
2010.Lenc128:
2011        movaps -0x20(TKEYP), KEY
2012        AESENC KEY STATE
2013        movaps -0x10(TKEYP), KEY
2014        AESENC KEY STATE
2015        movaps (TKEYP), KEY
2016        AESENC KEY STATE
2017        movaps 0x10(TKEYP), KEY
2018        AESENC KEY STATE
2019        movaps 0x20(TKEYP), KEY
2020        AESENC KEY STATE
2021        movaps 0x30(TKEYP), KEY
2022        AESENC KEY STATE
2023        movaps 0x40(TKEYP), KEY
2024        AESENC KEY STATE
2025        movaps 0x50(TKEYP), KEY
2026        AESENC KEY STATE
2027        movaps 0x60(TKEYP), KEY
2028        AESENC KEY STATE
2029        movaps 0x70(TKEYP), KEY
2030        AESENCLAST KEY STATE
2031        ret
2032ENDPROC(_aesni_enc1)
2033
2034/*
2035 * _aesni_enc4: internal ABI
2036 * input:
2037 *      KEYP:           key struct pointer
2038 *      KLEN:           key length
2039 *      STATE1:         initial state (input)
2040 *      STATE2
2041 *      STATE3
2042 *      STATE4
2043 * output:
2044 *      STATE1:         final state (output)
2045 *      STATE2
2046 *      STATE3
2047 *      STATE4
2048 * changed:
2049 *      KEY
2050 *      TKEYP (T1)
2051 */
2052.align 4
2053_aesni_enc4:
2054        movaps (KEYP), KEY              # key
2055        mov KEYP, TKEYP
2056        pxor KEY, STATE1                # round 0
2057        pxor KEY, STATE2
2058        pxor KEY, STATE3
2059        pxor KEY, STATE4
2060        add $0x30, TKEYP
2061        cmp $24, KLEN
2062        jb .L4enc128
2063        lea 0x20(TKEYP), TKEYP
2064        je .L4enc192
2065        add $0x20, TKEYP
2066        movaps -0x60(TKEYP), KEY
2067        AESENC KEY STATE1
2068        AESENC KEY STATE2
2069        AESENC KEY STATE3
2070        AESENC KEY STATE4
2071        movaps -0x50(TKEYP), KEY
2072        AESENC KEY STATE1
2073        AESENC KEY STATE2
2074        AESENC KEY STATE3
2075        AESENC KEY STATE4
2076#.align 4
2077.L4enc192:
2078        movaps -0x40(TKEYP), KEY
2079        AESENC KEY STATE1
2080        AESENC KEY STATE2
2081        AESENC KEY STATE3
2082        AESENC KEY STATE4
2083        movaps -0x30(TKEYP), KEY
2084        AESENC KEY STATE1
2085        AESENC KEY STATE2
2086        AESENC KEY STATE3
2087        AESENC KEY STATE4
2088#.align 4
2089.L4enc128:
2090        movaps -0x20(TKEYP), KEY
2091        AESENC KEY STATE1
2092        AESENC KEY STATE2
2093        AESENC KEY STATE3
2094        AESENC KEY STATE4
2095        movaps -0x10(TKEYP), KEY
2096        AESENC KEY STATE1
2097        AESENC KEY STATE2
2098        AESENC KEY STATE3
2099        AESENC KEY STATE4
2100        movaps (TKEYP), KEY
2101        AESENC KEY STATE1
2102        AESENC KEY STATE2
2103        AESENC KEY STATE3
2104        AESENC KEY STATE4
2105        movaps 0x10(TKEYP), KEY
2106        AESENC KEY STATE1
2107        AESENC KEY STATE2
2108        AESENC KEY STATE3
2109        AESENC KEY STATE4
2110        movaps 0x20(TKEYP), KEY
2111        AESENC KEY STATE1
2112        AESENC KEY STATE2
2113        AESENC KEY STATE3
2114        AESENC KEY STATE4
2115        movaps 0x30(TKEYP), KEY
2116        AESENC KEY STATE1
2117        AESENC KEY STATE2
2118        AESENC KEY STATE3
2119        AESENC KEY STATE4
2120        movaps 0x40(TKEYP), KEY
2121        AESENC KEY STATE1
2122        AESENC KEY STATE2
2123        AESENC KEY STATE3
2124        AESENC KEY STATE4
2125        movaps 0x50(TKEYP), KEY
2126        AESENC KEY STATE1
2127        AESENC KEY STATE2
2128        AESENC KEY STATE3
2129        AESENC KEY STATE4
2130        movaps 0x60(TKEYP), KEY
2131        AESENC KEY STATE1
2132        AESENC KEY STATE2
2133        AESENC KEY STATE3
2134        AESENC KEY STATE4
2135        movaps 0x70(TKEYP), KEY
2136        AESENCLAST KEY STATE1           # last round
2137        AESENCLAST KEY STATE2
2138        AESENCLAST KEY STATE3
2139        AESENCLAST KEY STATE4
2140        ret
2141ENDPROC(_aesni_enc4)
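/*
 * Illustration only: the 4-way interleave of _aesni_enc4 in C intrinsics for
 * the AES-128 case.  Four independent blocks share every round key, which
 * hides the multi-cycle AESENC latency behind independent work (names ours).
 *
 *	#include <immintrin.h>		// compile with -maes
 *
 *	static void aes128_encrypt4(const __m128i rk[11], __m128i s[4])
 *	{
 *		for (int j = 0; j < 4; j++)
 *			s[j] = _mm_xor_si128(s[j], rk[0]);	// round 0
 *		for (int r = 1; r < 10; r++)
 *			for (int j = 0; j < 4; j++)
 *				s[j] = _mm_aesenc_si128(s[j], rk[r]);
 *		for (int j = 0; j < 4; j++)
 *			s[j] = _mm_aesenclast_si128(s[j], rk[10]);
 *	}
 */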
2142
2143/*
2144 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2145 */
2146ENTRY(aesni_dec)
2147        FRAME_BEGIN
2148#ifndef __x86_64__
2149        pushl KEYP
2150        pushl KLEN
2151        movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2152        movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2153        movl (FRAME_OFFSET+20)(%esp), INP       # src
2154#endif
2155        mov 480(KEYP), KLEN             # key length
2156        add $240, KEYP
2157        movups (INP), STATE             # input
2158        call _aesni_dec1
2159        movups STATE, (OUTP)            # output
2160#ifndef __x86_64__
2161        popl KLEN
2162        popl KEYP
2163#endif
2164        FRAME_END
2165        ret
2166ENDPROC(aesni_dec)
2167
2168/*
2169 * _aesni_dec1:         internal ABI
2170 * input:
2171 *      KEYP:           key struct pointer
2172 *      KLEN:           key length
2173 *      STATE:          initial state (input)
2174 * output:
2175 *      STATE:          final state (output)
2176 * changed:
2177 *      KEY
2178 *      TKEYP (T1)
2179 */
2180.align 4
2181_aesni_dec1:
2182        movaps (KEYP), KEY              # key
2183        mov KEYP, TKEYP
2184        pxor KEY, STATE         # round 0
2185        add $0x30, TKEYP
2186        cmp $24, KLEN
2187        jb .Ldec128
2188        lea 0x20(TKEYP), TKEYP
2189        je .Ldec192
2190        add $0x20, TKEYP
2191        movaps -0x60(TKEYP), KEY
2192        AESDEC KEY STATE
2193        movaps -0x50(TKEYP), KEY
2194        AESDEC KEY STATE
2195.align 4
2196.Ldec192:
2197        movaps -0x40(TKEYP), KEY
2198        AESDEC KEY STATE
2199        movaps -0x30(TKEYP), KEY
2200        AESDEC KEY STATE
2201.align 4
2202.Ldec128:
2203        movaps -0x20(TKEYP), KEY
2204        AESDEC KEY STATE
2205        movaps -0x10(TKEYP), KEY
2206        AESDEC KEY STATE
2207        movaps (TKEYP), KEY
2208        AESDEC KEY STATE
2209        movaps 0x10(TKEYP), KEY
2210        AESDEC KEY STATE
2211        movaps 0x20(TKEYP), KEY
2212        AESDEC KEY STATE
2213        movaps 0x30(TKEYP), KEY
2214        AESDEC KEY STATE
2215        movaps 0x40(TKEYP), KEY
2216        AESDEC KEY STATE
2217        movaps 0x50(TKEYP), KEY
2218        AESDEC KEY STATE
2219        movaps 0x60(TKEYP), KEY
2220        AESDEC KEY STATE
2221        movaps 0x70(TKEYP), KEY
2222        AESDECLAST KEY STATE
2223        ret
2224ENDPROC(_aesni_dec1)
2225
2226/*
2227 * _aesni_dec4: internal ABI
2228 * input:
2229 *      KEYP:           key struct pointer
2230 *      KLEN:           key length
2231 *      STATE1:         initial state (input)
2232 *      STATE2
2233 *      STATE3
2234 *      STATE4
2235 * output:
2236 *      STATE1:         final state (output)
2237 *      STATE2
2238 *      STATE3
2239 *      STATE4
2240 * changed:
2241 *      KEY
2242 *      TKEYP (T1)
2243 */
2244.align 4
2245_aesni_dec4:
2246        movaps (KEYP), KEY              # key
2247        mov KEYP, TKEYP
2248        pxor KEY, STATE1                # round 0
2249        pxor KEY, STATE2
2250        pxor KEY, STATE3
2251        pxor KEY, STATE4
2252        add $0x30, TKEYP
2253        cmp $24, KLEN
2254        jb .L4dec128
2255        lea 0x20(TKEYP), TKEYP
2256        je .L4dec192
2257        add $0x20, TKEYP
2258        movaps -0x60(TKEYP), KEY
2259        AESDEC KEY STATE1
2260        AESDEC KEY STATE2
2261        AESDEC KEY STATE3
2262        AESDEC KEY STATE4
2263        movaps -0x50(TKEYP), KEY
2264        AESDEC KEY STATE1
2265        AESDEC KEY STATE2
2266        AESDEC KEY STATE3
2267        AESDEC KEY STATE4
2268.align 4
2269.L4dec192:
2270        movaps -0x40(TKEYP), KEY
2271        AESDEC KEY STATE1
2272        AESDEC KEY STATE2
2273        AESDEC KEY STATE3
2274        AESDEC KEY STATE4
2275        movaps -0x30(TKEYP), KEY
2276        AESDEC KEY STATE1
2277        AESDEC KEY STATE2
2278        AESDEC KEY STATE3
2279        AESDEC KEY STATE4
2280.align 4
2281.L4dec128:
2282        movaps -0x20(TKEYP), KEY
2283        AESDEC KEY STATE1
2284        AESDEC KEY STATE2
2285        AESDEC KEY STATE3
2286        AESDEC KEY STATE4
2287        movaps -0x10(TKEYP), KEY
2288        AESDEC KEY STATE1
2289        AESDEC KEY STATE2
2290        AESDEC KEY STATE3
2291        AESDEC KEY STATE4
2292        movaps (TKEYP), KEY
2293        AESDEC KEY STATE1
2294        AESDEC KEY STATE2
2295        AESDEC KEY STATE3
2296        AESDEC KEY STATE4
2297        movaps 0x10(TKEYP), KEY
2298        AESDEC KEY STATE1
2299        AESDEC KEY STATE2
2300        AESDEC KEY STATE3
2301        AESDEC KEY STATE4
2302        movaps 0x20(TKEYP), KEY
2303        AESDEC KEY STATE1
2304        AESDEC KEY STATE2
2305        AESDEC KEY STATE3
2306        AESDEC KEY STATE4
2307        movaps 0x30(TKEYP), KEY
2308        AESDEC KEY STATE1
2309        AESDEC KEY STATE2
2310        AESDEC KEY STATE3
2311        AESDEC KEY STATE4
2312        movaps 0x40(TKEYP), KEY
2313        AESDEC KEY STATE1
2314        AESDEC KEY STATE2
2315        AESDEC KEY STATE3
2316        AESDEC KEY STATE4
2317        movaps 0x50(TKEYP), KEY
2318        AESDEC KEY STATE1
2319        AESDEC KEY STATE2
2320        AESDEC KEY STATE3
2321        AESDEC KEY STATE4
2322        movaps 0x60(TKEYP), KEY
2323        AESDEC KEY STATE1
2324        AESDEC KEY STATE2
2325        AESDEC KEY STATE3
2326        AESDEC KEY STATE4
2327        movaps 0x70(TKEYP), KEY
2328        AESDECLAST KEY STATE1           # last round
2329        AESDECLAST KEY STATE2
2330        AESDECLAST KEY STATE3
2331        AESDECLAST KEY STATE4
2332        ret
2333ENDPROC(_aesni_dec4)
2334
2335/*
2336 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2337 *                    size_t len)
2338 */
2339ENTRY(aesni_ecb_enc)
2340        FRAME_BEGIN
2341#ifndef __x86_64__
2342        pushl LEN
2343        pushl KEYP
2344        pushl KLEN
2345        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2346        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2347        movl (FRAME_OFFSET+24)(%esp), INP       # src
2348        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2349#endif
2350        test LEN, LEN           # check length
2351        jz .Lecb_enc_ret
2352        mov 480(KEYP), KLEN
2353        cmp $16, LEN
2354        jb .Lecb_enc_ret
2355        cmp $64, LEN
2356        jb .Lecb_enc_loop1
2357.align 4
2358.Lecb_enc_loop4:
2359        movups (INP), STATE1
2360        movups 0x10(INP), STATE2
2361        movups 0x20(INP), STATE3
2362        movups 0x30(INP), STATE4
2363        call _aesni_enc4
2364        movups STATE1, (OUTP)
2365        movups STATE2, 0x10(OUTP)
2366        movups STATE3, 0x20(OUTP)
2367        movups STATE4, 0x30(OUTP)
2368        sub $64, LEN
2369        add $64, INP
2370        add $64, OUTP
2371        cmp $64, LEN
2372        jge .Lecb_enc_loop4
2373        cmp $16, LEN
2374        jb .Lecb_enc_ret
2375.align 4
2376.Lecb_enc_loop1:
2377        movups (INP), STATE1
2378        call _aesni_enc1
2379        movups STATE1, (OUTP)
2380        sub $16, LEN
2381        add $16, INP
2382        add $16, OUTP
2383        cmp $16, LEN
2384        jge .Lecb_enc_loop1
2385.Lecb_enc_ret:
2386#ifndef __x86_64__
2387        popl KLEN
2388        popl KEYP
2389        popl LEN
2390#endif
2391        FRAME_END
2392        ret
2393ENDPROC(aesni_ecb_enc)
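/*
 * Illustration only: the 64-byte/16-byte chunking that the loop above (and
 * its siblings below) implements; blk4 and blk1 stand in for _aesni_enc4
 * and _aesni_enc1.  Tails shorter than one block are left untouched,
 * exactly as in the assembly (sketch; names are ours).
 *
 *	#include <stddef.h>
 *
 *	static void ecb_walk(unsigned char *out, const unsigned char *in,
 *			     size_t len,
 *			     void (*blk4)(unsigned char *, const unsigned char *),
 *			     void (*blk1)(unsigned char *, const unsigned char *))
 *	{
 *		while (len >= 64) {
 *			blk4(out, in);
 *			in += 64; out += 64; len -= 64;
 *		}
 *		while (len >= 16) {
 *			blk1(out, in);
 *			in += 16; out += 16; len -= 16;
 *		}
 *	}
 */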
2394
2395/*
2396 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2397 *                    size_t len);
2398 */
2399ENTRY(aesni_ecb_dec)
2400        FRAME_BEGIN
2401#ifndef __x86_64__
2402        pushl LEN
2403        pushl KEYP
2404        pushl KLEN
2405        movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2406        movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2407        movl (FRAME_OFFSET+24)(%esp), INP       # src
2408        movl (FRAME_OFFSET+28)(%esp), LEN       # len
2409#endif
2410        test LEN, LEN
2411        jz .Lecb_dec_ret
2412        mov 480(KEYP), KLEN
2413        add $240, KEYP
2414        cmp $16, LEN
2415        jb .Lecb_dec_ret
2416        cmp $64, LEN
2417        jb .Lecb_dec_loop1
2418.align 4
2419.Lecb_dec_loop4:
2420        movups (INP), STATE1
2421        movups 0x10(INP), STATE2
2422        movups 0x20(INP), STATE3
2423        movups 0x30(INP), STATE4
2424        call _aesni_dec4
2425        movups STATE1, (OUTP)
2426        movups STATE2, 0x10(OUTP)
2427        movups STATE3, 0x20(OUTP)
2428        movups STATE4, 0x30(OUTP)
2429        sub $64, LEN
2430        add $64, INP
2431        add $64, OUTP
2432        cmp $64, LEN
2433        jge .Lecb_dec_loop4
2434        cmp $16, LEN
2435        jb .Lecb_dec_ret
2436.align 4
2437.Lecb_dec_loop1:
2438        movups (INP), STATE1
2439        call _aesni_dec1
2440        movups STATE1, (OUTP)
2441        sub $16, LEN
2442        add $16, INP
2443        add $16, OUTP
2444        cmp $16, LEN
2445        jge .Lecb_dec_loop1
2446.Lecb_dec_ret:
2447#ifndef __x86_64__
2448        popl KLEN
2449        popl KEYP
2450        popl LEN
2451#endif
2452        FRAME_END
2453        ret
2454ENDPROC(aesni_ecb_dec)
2455
2456/*
2457 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2458 *                    size_t len, u8 *iv)
2459 */
2460ENTRY(aesni_cbc_enc)
2461        FRAME_BEGIN
2462#ifndef __x86_64__
2463        pushl IVP
2464        pushl LEN
2465        pushl KEYP
2466        pushl KLEN
2467        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2468        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2469        movl (FRAME_OFFSET+28)(%esp), INP       # src
2470        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2471        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2472#endif
2473        cmp $16, LEN
2474        jb .Lcbc_enc_ret
2475        mov 480(KEYP), KLEN
2476        movups (IVP), STATE     # load iv as initial state
2477.align 4
2478.Lcbc_enc_loop:
2479        movups (INP), IN        # load input
2480        pxor IN, STATE
2481        call _aesni_enc1
2482        movups STATE, (OUTP)    # store output
2483        sub $16, LEN
2484        add $16, INP
2485        add $16, OUTP
2486        cmp $16, LEN
2487        jge .Lcbc_enc_loop
2488        movups STATE, (IVP)
2489.Lcbc_enc_ret:
2490#ifndef __x86_64__
2491        popl KLEN
2492        popl KEYP
2493        popl LEN
2494        popl IVP
2495#endif
2496        FRAME_END
2497        ret
2498ENDPROC(aesni_cbc_enc)
2499
2500/*
2501 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2502 *                    size_t len, u8 *iv)
2503 */
2504ENTRY(aesni_cbc_dec)
2505        FRAME_BEGIN
2506#ifndef __x86_64__
2507        pushl IVP
2508        pushl LEN
2509        pushl KEYP
2510        pushl KLEN
2511        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2512        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2513        movl (FRAME_OFFSET+28)(%esp), INP       # src
2514        movl (FRAME_OFFSET+32)(%esp), LEN       # len
2515        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2516#endif
2517        cmp $16, LEN
2518        jb .Lcbc_dec_just_ret
2519        mov 480(KEYP), KLEN
2520        add $240, KEYP
2521        movups (IVP), IV
2522        cmp $64, LEN
2523        jb .Lcbc_dec_loop1
2524.align 4
2525.Lcbc_dec_loop4:
2526        movups (INP), IN1
2527        movaps IN1, STATE1
2528        movups 0x10(INP), IN2
2529        movaps IN2, STATE2
2530#ifdef __x86_64__
2531        movups 0x20(INP), IN3
2532        movaps IN3, STATE3
2533        movups 0x30(INP), IN4
2534        movaps IN4, STATE4
2535#else
2536        movups 0x20(INP), IN1
2537        movaps IN1, STATE3
2538        movups 0x30(INP), IN2
2539        movaps IN2, STATE4
2540#endif
2541        call _aesni_dec4
2542        pxor IV, STATE1
2543#ifdef __x86_64__
2544        pxor IN1, STATE2
2545        pxor IN2, STATE3
2546        pxor IN3, STATE4
2547        movaps IN4, IV
2548#else
2549        pxor IN1, STATE4
2550        movaps IN2, IV
2551        movups (INP), IN1
2552        pxor IN1, STATE2
2553        movups 0x10(INP), IN2
2554        pxor IN2, STATE3
2555#endif
2556        movups STATE1, (OUTP)
2557        movups STATE2, 0x10(OUTP)
2558        movups STATE3, 0x20(OUTP)
2559        movups STATE4, 0x30(OUTP)
2560        sub $64, LEN
2561        add $64, INP
2562        add $64, OUTP
2563        cmp $64, LEN
2564        jge .Lcbc_dec_loop4
2565        cmp $16, LEN
2566        jb .Lcbc_dec_ret
2567.align 4
2568.Lcbc_dec_loop1:
2569        movups (INP), IN
2570        movaps IN, STATE
2571        call _aesni_dec1
2572        pxor IV, STATE
2573        movups STATE, (OUTP)
2574        movaps IN, IV
2575        sub $16, LEN
2576        add $16, INP
2577        add $16, OUTP
2578        cmp $16, LEN
2579        jge .Lcbc_dec_loop1
2580.Lcbc_dec_ret:
2581        movups IV, (IVP)
2582.Lcbc_dec_just_ret:
2583#ifndef __x86_64__
2584        popl KLEN
2585        popl KEYP
2586        popl LEN
2587        popl IVP
2588#endif
2589        FRAME_END
2590        ret
2591ENDPROC(aesni_cbc_dec)
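/*
 * Illustration only: the CBC-decrypt chaining above for the AES-128 case,
 * one block at a time.  rk[] is the inverse-cipher schedule that
 * aesni_set_key laid out at ctx+240 (names are ours); the 4-block path in
 * aesni_cbc_dec is this flow with the loads hoisted so all four AESDEC
 * streams run in parallel.
 *
 *	#include <immintrin.h>		// compile with -maes
 *	#include <stddef.h>
 *
 *	static void cbc_dec_sketch(const __m128i rk[11], unsigned char *out,
 *				   const unsigned char *in, size_t len,
 *				   unsigned char iv[16])
 *	{
 *		__m128i prev = _mm_loadu_si128((const __m128i *)iv);
 *
 *		for (size_t i = 0; i + 16 <= len; i += 16) {
 *			__m128i c = _mm_loadu_si128((const __m128i *)(in + i));
 *			__m128i s = _mm_xor_si128(c, rk[0]);
 *
 *			for (int r = 1; r < 10; r++)
 *				s = _mm_aesdec_si128(s, rk[r]);
 *			s = _mm_aesdeclast_si128(s, rk[10]);
 *			s = _mm_xor_si128(s, prev);	// P = D(C) ^ C_prev
 *			_mm_storeu_si128((__m128i *)(out + i), s);
 *			prev = c;
 *		}
 *		_mm_storeu_si128((__m128i *)iv, prev);	// write back new IV
 *	}
 */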
2592
2593#ifdef __x86_64__
2594.pushsection .rodata
2595.align 16
2596.Lbswap_mask:
2597        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2598.popsection
2599
2600/*
2601 * _aesni_inc_init:     internal ABI
2602 *      setup registers used by _aesni_inc
2603 * input:
2604 *      IV
2605 * output:
2606 *      CTR:    == IV, in little endian
2607 *      TCTR_LOW: == lower qword of CTR
2608 *      INC:    == 1, in little endian
2609 *      BSWAP_MASK == endian swapping mask
2610 */
2611.align 4
2612_aesni_inc_init:
2613        movaps .Lbswap_mask, BSWAP_MASK
2614        movaps IV, CTR
2615        PSHUFB_XMM BSWAP_MASK CTR
2616        mov $1, TCTR_LOW
2617        MOVQ_R64_XMM TCTR_LOW INC
2618        MOVQ_R64_XMM CTR TCTR_LOW
2619        ret
2620ENDPROC(_aesni_inc_init)
2621
2622/*
2623 * _aesni_inc:          internal ABI
2624 *      Increase IV by 1; IV is in big endian
2625 * input:
2626 *      IV
2627 *      CTR:    == IV, in little endian
2628 *      TCTR_LOW: == lower qword of CTR
2629 *      INC:    == 1, in little endian
2630 *      BSWAP_MASK == endian swapping mask
2631 * output:
2632 *      IV:     increased by 1
2633 * changed:
2634 *      CTR:    == output IV, in little endian
2635 *      TCTR_LOW: == lower qword of CTR
2636 */
2637.align 4
2638_aesni_inc:
2639        paddq INC, CTR
2640        add $1, TCTR_LOW
2641        jnc .Linc_low
2642        pslldq $8, INC
2643        paddq INC, CTR
2644        psrldq $8, INC
2645.Linc_low:
2646        movaps CTR, IV
2647        PSHUFB_XMM BSWAP_MASK IV
2648        ret
2649ENDPROC(_aesni_inc)
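/*
 * Illustration only: a scalar model of _aesni_inc.  CTR is kept byte-swapped
 * (little endian) so the low qword can be bumped directly; the carry
 * propagates into the high qword only when the 64-bit add wraps, matching
 * the "add $1, TCTR_LOW; jnc" fast path (sketch; name is ours).
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc(uint64_t ctr[2])	// ctr[0] = low qword
 *	{
 *		if (++ctr[0] == 0)
 *			ctr[1]++;
 *	}
 */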
2650
2651/*
2652 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2653 *                    size_t len, u8 *iv)
2654 */
2655ENTRY(aesni_ctr_enc)
2656        FRAME_BEGIN
2657        cmp $16, LEN
2658        jb .Lctr_enc_just_ret
2659        mov 480(KEYP), KLEN
2660        movups (IVP), IV
2661        call _aesni_inc_init
2662        cmp $64, LEN
2663        jb .Lctr_enc_loop1
2664.align 4
2665.Lctr_enc_loop4:
2666        movaps IV, STATE1
2667        call _aesni_inc
2668        movups (INP), IN1
2669        movaps IV, STATE2
2670        call _aesni_inc
2671        movups 0x10(INP), IN2
2672        movaps IV, STATE3
2673        call _aesni_inc
2674        movups 0x20(INP), IN3
2675        movaps IV, STATE4
2676        call _aesni_inc
2677        movups 0x30(INP), IN4
2678        call _aesni_enc4
2679        pxor IN1, STATE1
2680        movups STATE1, (OUTP)
2681        pxor IN2, STATE2
2682        movups STATE2, 0x10(OUTP)
2683        pxor IN3, STATE3
2684        movups STATE3, 0x20(OUTP)
2685        pxor IN4, STATE4
2686        movups STATE4, 0x30(OUTP)
2687        sub $64, LEN
2688        add $64, INP
2689        add $64, OUTP
2690        cmp $64, LEN
2691        jge .Lctr_enc_loop4
2692        cmp $16, LEN
2693        jb .Lctr_enc_ret
2694.align 4
2695.Lctr_enc_loop1:
2696        movaps IV, STATE
2697        call _aesni_inc
2698        movups (INP), IN
2699        call _aesni_enc1
2700        pxor IN, STATE
2701        movups STATE, (OUTP)
2702        sub $16, LEN
2703        add $16, INP
2704        add $16, OUTP
2705        cmp $16, LEN
2706        jge .Lctr_enc_loop1
2707.Lctr_enc_ret:
2708        movups IV, (IVP)
2709.Lctr_enc_just_ret:
2710        FRAME_END
2711        ret
2712ENDPROC(aesni_ctr_enc)
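/*
 * Illustration only: the CTR data flow above in C.  The keystream is the
 * encrypted counter block XORed into the input, so encryption and
 * decryption are the same operation; "encrypt" stands in for _aesni_enc1
 * and the byte loop for _aesni_inc (sketch; names are ours).
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void ctr_crypt(void (*encrypt)(uint8_t out[16],
 *					      const uint8_t in[16]),
 *			      uint8_t *out, const uint8_t *in, size_t len,
 *			      uint8_t ctr[16])
 *	{
 *		uint8_t ks[16];
 *
 *		while (len >= 16) {
 *			encrypt(ks, ctr);
 *			for (int i = 0; i < 16; i++)
 *				out[i] = in[i] ^ ks[i];
 *			// big-endian increment of the counter block
 *			for (int i = 15; i >= 0 && ++ctr[i] == 0; i--)
 *				;
 *			in += 16; out += 16; len -= 16;
 *		}
 *	}
 */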
2713
2714/*
2715 * _aesni_gf128mul_x_ble:               internal ABI
2716 *      Multiply in GF(2^128) for XTS IVs
2717 * input:
2718 *      IV:     current IV
2719 *      GF128MUL_MASK == mask with 0x87 and 0x01
2720 * output:
2721 *      IV:     next IV
2722 * changed:
2723 *      CTR:    == temporary value
2724 */
2725#define _aesni_gf128mul_x_ble() \
2726        pshufd $0x13, IV, CTR; \
2727        paddq IV, IV; \
2728        psrad $31, CTR; \
2729        pand GF128MUL_MASK, CTR; \
2730        pxor CTR, IV;
2731
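/*
 * Illustration only: a scalar model of _aesni_gf128mul_x_ble() above on a
 * little-endian machine, with t[0] holding the low qword of the tweak.
 * Multiplying by x shifts the 128-bit value left by one bit and folds any
 * carry back in as 0x87, which is what the 0x87/0x01 mask implements
 * (sketch; name is ours).
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);
 *	}
 */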
2732/*
2733 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2734 *                       bool enc, u8 *iv)
2735 */
2736ENTRY(aesni_xts_crypt8)
2737        FRAME_BEGIN
2738        cmpb $0, %cl
2739        movl $0, %ecx
2740        movl $240, %r10d
2741        leaq _aesni_enc4, %r11
2742        leaq _aesni_dec4, %rax
2743        cmovel %r10d, %ecx
2744        cmoveq %rax, %r11
2745
2746        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2747        movups (IVP), IV
2748
2749        mov 480(KEYP), KLEN
2750        addq %rcx, KEYP
2751
2752        movdqa IV, STATE1
2753        movdqu 0x00(INP), INC
2754        pxor INC, STATE1
2755        movdqu IV, 0x00(OUTP)
2756
2757        _aesni_gf128mul_x_ble()
2758        movdqa IV, STATE2
2759        movdqu 0x10(INP), INC
2760        pxor INC, STATE2
2761        movdqu IV, 0x10(OUTP)
2762
2763        _aesni_gf128mul_x_ble()
2764        movdqa IV, STATE3
2765        movdqu 0x20(INP), INC
2766        pxor INC, STATE3
2767        movdqu IV, 0x20(OUTP)
2768
2769        _aesni_gf128mul_x_ble()
2770        movdqa IV, STATE4
2771        movdqu 0x30(INP), INC
2772        pxor INC, STATE4
2773        movdqu IV, 0x30(OUTP)
2774
2775        CALL_NOSPEC %r11
2776
2777        movdqu 0x00(OUTP), INC
2778        pxor INC, STATE1
2779        movdqu STATE1, 0x00(OUTP)
2780
2781        _aesni_gf128mul_x_ble()
2782        movdqa IV, STATE1
2783        movdqu 0x40(INP), INC
2784        pxor INC, STATE1
2785        movdqu IV, 0x40(OUTP)
2786
2787        movdqu 0x10(OUTP), INC
2788        pxor INC, STATE2
2789        movdqu STATE2, 0x10(OUTP)
2790
2791        _aesni_gf128mul_x_ble()
2792        movdqa IV, STATE2
2793        movdqu 0x50(INP), INC
2794        pxor INC, STATE2
2795        movdqu IV, 0x50(OUTP)
2796
2797        movdqu 0x20(OUTP), INC
2798        pxor INC, STATE3
2799        movdqu STATE3, 0x20(OUTP)
2800
2801        _aesni_gf128mul_x_ble()
2802        movdqa IV, STATE3
2803        movdqu 0x60(INP), INC
2804        pxor INC, STATE3
2805        movdqu IV, 0x60(OUTP)
2806
2807        movdqu 0x30(OUTP), INC
2808        pxor INC, STATE4
2809        movdqu STATE4, 0x30(OUTP)
2810
2811        _aesni_gf128mul_x_ble()
2812        movdqa IV, STATE4
2813        movdqu 0x70(INP), INC
2814        pxor INC, STATE4
2815        movdqu IV, 0x70(OUTP)
2816
2817        _aesni_gf128mul_x_ble()
2818        movups IV, (IVP)
2819
2820        CALL_NOSPEC %r11
2821
2822        movdqu 0x40(OUTP), INC
2823        pxor INC, STATE1
2824        movdqu STATE1, 0x40(OUTP)
2825
2826        movdqu 0x50(OUTP), INC
2827        pxor INC, STATE2
2828        movdqu STATE2, 0x50(OUTP)
2829
2830        movdqu 0x60(OUTP), INC
2831        pxor INC, STATE3
2832        movdqu STATE3, 0x60(OUTP)
2833
2834        movdqu 0x70(OUTP), INC
2835        pxor INC, STATE4
2836        movdqu STATE4, 0x70(OUTP)
2837
2838        FRAME_END
2839        ret
2840ENDPROC(aesni_xts_crypt8)
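/*
 * Illustration only: the per-block XEX flow of aesni_xts_crypt8 -- XOR the
 * tweak in, run the block cipher, XOR the same tweak out, then multiply the
 * tweak by x for the next block.  "blkcipher" stands in for whichever of
 * _aesni_enc4/_aesni_dec4 was selected through %r11; little-endian host
 * assumed (sketch; names are ours).
 *
 *	#include <stdint.h>
 *
 *	static void xts_step(void (*blkcipher)(uint8_t b[16]),
 *			     uint8_t blk[16], uint64_t tweak[2])
 *	{
 *		const uint8_t *t = (const uint8_t *)tweak;
 *		uint64_t carry;
 *
 *		for (int i = 0; i < 16; i++)
 *			blk[i] ^= t[i];		// pxor INC, STATEx
 *		blkcipher(blk);
 *		for (int i = 0; i < 16; i++)
 *			blk[i] ^= t[i];		// fold the saved tweak back in
 *
 *		carry = tweak[1] >> 63;		// advance: multiply by x
 *		tweak[1] = (tweak[1] << 1) | (tweak[0] >> 63);
 *		tweak[0] = (tweak[0] << 1) ^ (carry * 0x87);
 *	}
 */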
2841
2842#endif
2843