linux/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
<<
>>
Prefs
   1/*
   2 *      Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
   3 *
   4 * This is AES128/192/256 CTR mode optimization implementation. It requires
   5 * the support of Intel(R) AESNI and AVX instructions.
   6 *
   7 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
   9 * Additional information on it can be found at:
  10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
  11 *
  12 * This file is provided under a dual BSD/GPLv2 license.  When using or
  13 * redistributing this file, you may do so under either license.
  14 *
  15 * GPL LICENSE SUMMARY
  16 *
  17 * Copyright(c) 2014 Intel Corporation.
  18 *
  19 * This program is free software; you can redistribute it and/or modify
  20 * it under the terms of version 2 of the GNU General Public License as
  21 * published by the Free Software Foundation.
  22 *
  23 * This program is distributed in the hope that it will be useful, but
  24 * WITHOUT ANY WARRANTY; without even the implied warranty of
  25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 * General Public License for more details.
  27 *
  28 * Contact Information:
  29 * James Guilford <james.guilford@intel.com>
  30 * Sean Gulley <sean.m.gulley@intel.com>
  31 * Chandramouli Narayanan <mouli@linux.intel.com>
  32 *
  33 * BSD LICENSE
  34 *
  35 * Copyright(c) 2014 Intel Corporation.
  36 *
  37 * Redistribution and use in source and binary forms, with or without
  38 * modification, are permitted provided that the following conditions
  39 * are met:
  40 *
  41 * Redistributions of source code must retain the above copyright
  42 * notice, this list of conditions and the following disclaimer.
  43 * Redistributions in binary form must reproduce the above copyright
  44 * notice, this list of conditions and the following disclaimer in
  45 * the documentation and/or other materials provided with the
  46 * distribution.
  47 * Neither the name of Intel Corporation nor the names of its
  48 * contributors may be used to endorse or promote products derived
  49 * from this software without specific prior written permission.
  50 *
  51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  62 *
  63 */
  64
  65#include <linux/linkage.h>
  66
  67#define VMOVDQ          vmovdqu
  68
  69#define xdata0          %xmm0
  70#define xdata1          %xmm1
  71#define xdata2          %xmm2
  72#define xdata3          %xmm3
  73#define xdata4          %xmm4
  74#define xdata5          %xmm5
  75#define xdata6          %xmm6
  76#define xdata7          %xmm7
  77#define xcounter        %xmm8
  78#define xbyteswap       %xmm9
  79#define xkey0           %xmm10
  80#define xkey4           %xmm11
  81#define xkey8           %xmm12
  82#define xkey12          %xmm13
  83#define xkeyA           %xmm14
  84#define xkeyB           %xmm15
  85
  86#define p_in            %rdi
  87#define p_iv            %rsi
  88#define p_keys          %rdx
  89#define p_out           %rcx
  90#define num_bytes       %r8
  91
  92#define tmp             %r10
  93#define DDQ_DATA        0
  94#define XDATA           1
  95#define KEY_128         1
  96#define KEY_192         2
  97#define KEY_256         3
  98
  99.section .rodata
 100.align 16
 101
 102byteswap_const:
 103        .octa 0x000102030405060708090A0B0C0D0E0F
 104ddq_low_msk:
 105        .octa 0x0000000000000000FFFFFFFFFFFFFFFF
 106ddq_high_add_1:
 107        .octa 0x00000000000000010000000000000000
 108ddq_add_1:
 109        .octa 0x00000000000000000000000000000001
 110ddq_add_2:
 111        .octa 0x00000000000000000000000000000002
 112ddq_add_3:
 113        .octa 0x00000000000000000000000000000003
 114ddq_add_4:
 115        .octa 0x00000000000000000000000000000004
 116ddq_add_5:
 117        .octa 0x00000000000000000000000000000005
 118ddq_add_6:
 119        .octa 0x00000000000000000000000000000006
 120ddq_add_7:
 121        .octa 0x00000000000000000000000000000007
 122ddq_add_8:
 123        .octa 0x00000000000000000000000000000008
 124
 125.text
 126
/* ddq_add_x constants are addressed directly as ddq_add_1 + 16 * (x - 1) */
 128
 129/* generate a unique variable for xmm register */
 130.macro setxdata n
 131        var_xdata = %xmm\n
 132.endm
 133
 134/* club the numeric 'id' to the symbol 'name' */
 135
 136.macro club name, id
 137.altmacro
 138        .if \name == XDATA
 139                setxdata %\id
 140        .endif
 141.noaltmacro
 142.endm
 143
 144/*
 145 * do_aes num_in_par load_keys key_len
 146 * This increments p_in, but not p_out
 147 */
 148.macro do_aes b, k, key_len
 149        .set by, \b
 150        .set load_keys, \k
 151        .set klen, \key_len
 152
 153        .if (load_keys)
 154                vmovdqa 0*16(p_keys), xkey0
 155        .endif
 156
 157        vpshufb xbyteswap, xcounter, xdata0
 158
 159        .set i, 1
 160        .rept (by - 1)
 161                club XDATA, i
 162                vpaddq  (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
 163                vptest  ddq_low_msk(%rip), var_xdata
 164                jnz 1f
 165                vpaddq  ddq_high_add_1(%rip), var_xdata, var_xdata
 166                vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
 167                1:
 168                vpshufb xbyteswap, var_xdata, var_xdata
 169                .set i, (i +1)
 170        .endr
 171
 172        vmovdqa 1*16(p_keys), xkeyA
 173
 174        vpxor   xkey0, xdata0, xdata0
 175        vpaddq  (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
 176        vptest  ddq_low_msk(%rip), xcounter
 177        jnz     1f
 178        vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
 179        1:
 180
 181        .set i, 1
 182        .rept (by - 1)
 183                club XDATA, i
 184                vpxor   xkey0, var_xdata, var_xdata
 185                .set i, (i +1)
 186        .endr
 187
 188        vmovdqa 2*16(p_keys), xkeyB
 189
 190        .set i, 0
 191        .rept by
 192                club XDATA, i
 193                vaesenc xkeyA, var_xdata, var_xdata             /* key 1 */
 194                .set i, (i +1)
 195        .endr
 196
 197        .if (klen == KEY_128)
 198                .if (load_keys)
 199                        vmovdqa 3*16(p_keys), xkey4
 200                .endif
 201        .else
 202                vmovdqa 3*16(p_keys), xkeyA
 203        .endif
 204
 205        .set i, 0
 206        .rept by
 207                club XDATA, i
 208                vaesenc xkeyB, var_xdata, var_xdata             /* key 2 */
 209                .set i, (i +1)
 210        .endr
 211
 212        add     $(16*by), p_in
 213
 214        .if (klen == KEY_128)
 215                vmovdqa 4*16(p_keys), xkeyB
 216        .else
 217                .if (load_keys)
 218                        vmovdqa 4*16(p_keys), xkey4
 219                .endif
 220        .endif
 221
 222        .set i, 0
 223        .rept by
 224                club XDATA, i
 225                /* key 3 */
 226                .if (klen == KEY_128)
 227                        vaesenc xkey4, var_xdata, var_xdata
 228                .else
 229                        vaesenc xkeyA, var_xdata, var_xdata
 230                .endif
 231                .set i, (i +1)
 232        .endr
 233
 234        vmovdqa 5*16(p_keys), xkeyA
 235
 236        .set i, 0
 237        .rept by
 238                club XDATA, i
 239                /* key 4 */
 240                .if (klen == KEY_128)
 241                        vaesenc xkeyB, var_xdata, var_xdata
 242                .else
 243                        vaesenc xkey4, var_xdata, var_xdata
 244                .endif
 245                .set i, (i +1)
 246        .endr
 247
 248        .if (klen == KEY_128)
 249                .if (load_keys)
 250                        vmovdqa 6*16(p_keys), xkey8
 251                .endif
 252        .else
 253                vmovdqa 6*16(p_keys), xkeyB
 254        .endif
 255
 256        .set i, 0
 257        .rept by
 258                club XDATA, i
 259                vaesenc xkeyA, var_xdata, var_xdata             /* key 5 */
 260                .set i, (i +1)
 261        .endr
 262
 263        vmovdqa 7*16(p_keys), xkeyA
 264
 265        .set i, 0
 266        .rept by
 267                club XDATA, i
 268                /* key 6 */
 269                .if (klen == KEY_128)
 270                        vaesenc xkey8, var_xdata, var_xdata
 271                .else
 272                        vaesenc xkeyB, var_xdata, var_xdata
 273                .endif
 274                .set i, (i +1)
 275        .endr
 276
 277        .if (klen == KEY_128)
 278                vmovdqa 8*16(p_keys), xkeyB
 279        .else
 280                .if (load_keys)
 281                        vmovdqa 8*16(p_keys), xkey8
 282                .endif
 283        .endif
 284
 285        .set i, 0
 286        .rept by
 287                club XDATA, i
 288                vaesenc xkeyA, var_xdata, var_xdata             /* key 7 */
 289                .set i, (i +1)
 290        .endr
 291
 292        .if (klen == KEY_128)
 293                .if (load_keys)
 294                        vmovdqa 9*16(p_keys), xkey12
 295                .endif
 296        .else
 297                vmovdqa 9*16(p_keys), xkeyA
 298        .endif
 299
 300        .set i, 0
 301        .rept by
 302                club XDATA, i
 303                /* key 8 */
 304                .if (klen == KEY_128)
 305                        vaesenc xkeyB, var_xdata, var_xdata
 306                .else
 307                        vaesenc xkey8, var_xdata, var_xdata
 308                .endif
 309                .set i, (i +1)
 310        .endr
 311
 312        vmovdqa 10*16(p_keys), xkeyB
 313
 314        .set i, 0
 315        .rept by
 316                club XDATA, i
 317                /* key 9 */
 318                .if (klen == KEY_128)
 319                        vaesenc xkey12, var_xdata, var_xdata
 320                .else
 321                        vaesenc xkeyA, var_xdata, var_xdata
 322                .endif
 323                .set i, (i +1)
 324        .endr
 325
 326        .if (klen != KEY_128)
 327                vmovdqa 11*16(p_keys), xkeyA
 328        .endif
 329
 330        .set i, 0
 331        .rept by
 332                club XDATA, i
 333                /* key 10 */
 334                .if (klen == KEY_128)
 335                        vaesenclast     xkeyB, var_xdata, var_xdata
 336                .else
 337                        vaesenc xkeyB, var_xdata, var_xdata
 338                .endif
 339                .set i, (i +1)
 340        .endr
 341
 342        .if (klen != KEY_128)
 343                .if (load_keys)
 344                        vmovdqa 12*16(p_keys), xkey12
 345                .endif
 346
 347                .set i, 0
 348                .rept by
 349                        club XDATA, i
 350                        vaesenc xkeyA, var_xdata, var_xdata     /* key 11 */
 351                        .set i, (i +1)
 352                .endr
 353
 354                .if (klen == KEY_256)
 355                        vmovdqa 13*16(p_keys), xkeyA
 356                .endif
 357
 358                .set i, 0
 359                .rept by
 360                        club XDATA, i
 361                        .if (klen == KEY_256)
 362                                /* key 12 */
 363                                vaesenc xkey12, var_xdata, var_xdata
 364                        .else
 365                                vaesenclast xkey12, var_xdata, var_xdata
 366                        .endif
 367                        .set i, (i +1)
 368                .endr
 369
 370                .if (klen == KEY_256)
 371                        vmovdqa 14*16(p_keys), xkeyB
 372
 373                        .set i, 0
 374                        .rept by
 375                                club XDATA, i
 376                                /* key 13 */
 377                                vaesenc xkeyA, var_xdata, var_xdata
 378                                .set i, (i +1)
 379                        .endr
 380
 381                        .set i, 0
 382                        .rept by
 383                                club XDATA, i
 384                                /* key 14 */
 385                                vaesenclast     xkeyB, var_xdata, var_xdata
 386                                .set i, (i +1)
 387                        .endr
 388                .endif
 389        .endif
 390
 391        .set i, 0
 392        .rept (by / 2)
 393                .set j, (i+1)
 394                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
 395                VMOVDQ  (j*16 - 16*by)(p_in), xkeyB
 396                club XDATA, i
 397                vpxor   xkeyA, var_xdata, var_xdata
 398                club XDATA, j
 399                vpxor   xkeyB, var_xdata, var_xdata
 400                .set i, (i+2)
 401        .endr
 402
 403        .if (i < by)
 404                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
 405                club XDATA, i
 406                vpxor   xkeyA, var_xdata, var_xdata
 407        .endif
 408
 409        .set i, 0
 410        .rept by
 411                club XDATA, i
 412                VMOVDQ  var_xdata, i*16(p_out)
 413                .set i, (i+1)
 414        .endr
 415.endm
 416
 417.macro do_aes_load val, key_len
 418        do_aes \val, 1, \key_len
 419.endm
 420
 421.macro do_aes_noload val, key_len
 422        do_aes \val, 0, \key_len
 423.endm
 424
 425/* main body of aes ctr load */
 426
 427.macro do_aes_ctrmain key_len
 428        cmp     $16, num_bytes
 429        jb      .Ldo_return2\key_len
 430
 431        vmovdqa byteswap_const(%rip), xbyteswap
 432        vmovdqu (p_iv), xcounter
 433        vpshufb xbyteswap, xcounter, xcounter
 434
 435        mov     num_bytes, tmp
 436        and     $(7*16), tmp
 437        jz      .Lmult_of_8_blks\key_len
 438
 439        /* 1 <= tmp <= 7 */
 440        cmp     $(4*16), tmp
 441        jg      .Lgt4\key_len
 442        je      .Leq4\key_len
 443
 444.Llt4\key_len:
 445        cmp     $(2*16), tmp
 446        jg      .Leq3\key_len
 447        je      .Leq2\key_len
 448
 449.Leq1\key_len:
 450        do_aes_load     1, \key_len
 451        add     $(1*16), p_out
 452        and     $(~7*16), num_bytes
 453        jz      .Ldo_return2\key_len
 454        jmp     .Lmain_loop2\key_len
 455
 456.Leq2\key_len:
 457        do_aes_load     2, \key_len
 458        add     $(2*16), p_out
 459        and     $(~7*16), num_bytes
 460        jz      .Ldo_return2\key_len
 461        jmp     .Lmain_loop2\key_len
 462
 463
 464.Leq3\key_len:
 465        do_aes_load     3, \key_len
 466        add     $(3*16), p_out
 467        and     $(~7*16), num_bytes
 468        jz      .Ldo_return2\key_len
 469        jmp     .Lmain_loop2\key_len
 470
 471.Leq4\key_len:
 472        do_aes_load     4, \key_len
 473        add     $(4*16), p_out
 474        and     $(~7*16), num_bytes
 475        jz      .Ldo_return2\key_len
 476        jmp     .Lmain_loop2\key_len
 477
 478.Lgt4\key_len:
 479        cmp     $(6*16), tmp
 480        jg      .Leq7\key_len
 481        je      .Leq6\key_len
 482
 483.Leq5\key_len:
 484        do_aes_load     5, \key_len
 485        add     $(5*16), p_out
 486        and     $(~7*16), num_bytes
 487        jz      .Ldo_return2\key_len
 488        jmp     .Lmain_loop2\key_len
 489
 490.Leq6\key_len:
 491        do_aes_load     6, \key_len
 492        add     $(6*16), p_out
 493        and     $(~7*16), num_bytes
 494        jz      .Ldo_return2\key_len
 495        jmp     .Lmain_loop2\key_len
 496
 497.Leq7\key_len:
 498        do_aes_load     7, \key_len
 499        add     $(7*16), p_out
 500        and     $(~7*16), num_bytes
 501        jz      .Ldo_return2\key_len
 502        jmp     .Lmain_loop2\key_len
 503
 504.Lmult_of_8_blks\key_len:
 505        .if (\key_len != KEY_128)
 506                vmovdqa 0*16(p_keys), xkey0
 507                vmovdqa 4*16(p_keys), xkey4
 508                vmovdqa 8*16(p_keys), xkey8
 509                vmovdqa 12*16(p_keys), xkey12
 510        .else
 511                vmovdqa 0*16(p_keys), xkey0
 512                vmovdqa 3*16(p_keys), xkey4
 513                vmovdqa 6*16(p_keys), xkey8
 514                vmovdqa 9*16(p_keys), xkey12
 515        .endif
 516.align 16
 517.Lmain_loop2\key_len:
 518        /* num_bytes is a multiple of 8 and >0 */
 519        do_aes_noload   8, \key_len
 520        add     $(8*16), p_out
 521        sub     $(8*16), num_bytes
 522        jne     .Lmain_loop2\key_len
 523
 524.Ldo_return2\key_len:
 525        /* return updated IV */
 526        vpshufb xbyteswap, xcounter, xcounter
 527        vmovdqu xcounter, (p_iv)
 528        ret
 529.endm
 530
 531/*
 532 * routine to do AES128 CTR enc/decrypt "by8"
 533 * XMM registers are clobbered.
 534 * Saving/restoring must be done at a higher level
 535 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 536 *                      unsigned int num_bytes)
 537 */
 538SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
 539        /* call the aes main loop */
 540        do_aes_ctrmain KEY_128
 541
 542SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
 543
 544/*
 545 * routine to do AES192 CTR enc/decrypt "by8"
 546 * XMM registers are clobbered.
 547 * Saving/restoring must be done at a higher level
 548 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 549 *                      unsigned int num_bytes)
 550 */
 551SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
 552        /* call the aes main loop */
 553        do_aes_ctrmain KEY_192
 554
 555SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
 556
 557/*
 558 * routine to do AES256 CTR enc/decrypt "by8"
 559 * XMM registers are clobbered.
 560 * Saving/restoring must be done at a higher level
 561 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 562 *                      unsigned int num_bytes)
 563 */
 564SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
 565        /* call the aes main loop */
 566        do_aes_ctrmain KEY_256
 567
 568SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
 569