linux/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
<<
>>
Prefs
   1/*
   2 *      Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
   3 *
   4 * This is AES128/192/256 CTR mode optimization implementation. It requires
   5 * the support of Intel(R) AESNI and AVX instructions.
   6 *
   7 * This work was inspired by the AES CTR mode optimization published
   8 * in Intel Optimized IPSEC Cryptographic library.
   9 * Additional information on it can be found at:
  10 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
  11 *
  12 * This file is provided under a dual BSD/GPLv2 license.  When using or
  13 * redistributing this file, you may do so under either license.
  14 *
  15 * GPL LICENSE SUMMARY
  16 *
  17 * Copyright(c) 2014 Intel Corporation.
  18 *
  19 * This program is free software; you can redistribute it and/or modify
  20 * it under the terms of version 2 of the GNU General Public License as
  21 * published by the Free Software Foundation.
  22 *
  23 * This program is distributed in the hope that it will be useful, but
  24 * WITHOUT ANY WARRANTY; without even the implied warranty of
  25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 * General Public License for more details.
  27 *
  28 * Contact Information:
  29 * James Guilford <james.guilford@intel.com>
  30 * Sean Gulley <sean.m.gulley@intel.com>
  31 * Chandramouli Narayanan <mouli@linux.intel.com>
  32 *
  33 * BSD LICENSE
  34 *
  35 * Copyright(c) 2014 Intel Corporation.
  36 *
  37 * Redistribution and use in source and binary forms, with or without
  38 * modification, are permitted provided that the following conditions
  39 * are met:
  40 *
  41 * Redistributions of source code must retain the above copyright
  42 * notice, this list of conditions and the following disclaimer.
  43 * Redistributions in binary form must reproduce the above copyright
  44 * notice, this list of conditions and the following disclaimer in
  45 * the documentation and/or other materials provided with the
  46 * distribution.
  47 * Neither the name of Intel Corporation nor the names of its
  48 * contributors may be used to endorse or promote products derived
  49 * from this software without specific prior written permission.
  50 *
  51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  62 *
  63 */
  64
  65#include <linux/linkage.h>
  66#include <asm/inst.h>
  67
  68#define CONCAT(a,b)     a##b
  69#define VMOVDQ          vmovdqu
  70
  71#define xdata0          %xmm0
  72#define xdata1          %xmm1
  73#define xdata2          %xmm2
  74#define xdata3          %xmm3
  75#define xdata4          %xmm4
  76#define xdata5          %xmm5
  77#define xdata6          %xmm6
  78#define xdata7          %xmm7
  79#define xcounter        %xmm8
  80#define xbyteswap       %xmm9
  81#define xkey0           %xmm10
  82#define xkey4           %xmm11
  83#define xkey8           %xmm12
  84#define xkey12          %xmm13
  85#define xkeyA           %xmm14
  86#define xkeyB           %xmm15
  87
  88#define p_in            %rdi
  89#define p_iv            %rsi
  90#define p_keys          %rdx
  91#define p_out           %rcx
  92#define num_bytes       %r8
  93
  94#define tmp             %r10
  95#define DDQ(i)          CONCAT(ddq_add_,i)
  96#define XMM(i)          CONCAT(%xmm, i)
  97#define DDQ_DATA        0
  98#define XDATA           1
  99#define KEY_128         1
 100#define KEY_192         2
 101#define KEY_256         3
 102
 103.section .rodata
 104.align 16
 105
 106byteswap_const:
 107        .octa 0x000102030405060708090A0B0C0D0E0F
 108ddq_low_msk:
 109        .octa 0x0000000000000000FFFFFFFFFFFFFFFF
 110ddq_high_add_1:
 111        .octa 0x00000000000000010000000000000000
 112ddq_add_1:
 113        .octa 0x00000000000000000000000000000001
 114ddq_add_2:
 115        .octa 0x00000000000000000000000000000002
 116ddq_add_3:
 117        .octa 0x00000000000000000000000000000003
 118ddq_add_4:
 119        .octa 0x00000000000000000000000000000004
 120ddq_add_5:
 121        .octa 0x00000000000000000000000000000005
 122ddq_add_6:
 123        .octa 0x00000000000000000000000000000006
 124ddq_add_7:
 125        .octa 0x00000000000000000000000000000007
 126ddq_add_8:
 127        .octa 0x00000000000000000000000000000008
 128
 129.text
 130
 131/* generate a unique variable for ddq_add_x */
 132
 133.macro setddq n
 134        var_ddq_add = DDQ(\n)
 135.endm
 136
 137/* generate a unique variable for xmm register */
 138.macro setxdata n
 139        var_xdata = XMM(\n)
 140.endm
 141
 142/* club the numeric 'id' to the symbol 'name' */
 143
 144.macro club name, id
 145.altmacro
 146        .if \name == DDQ_DATA
 147                setddq %\id
 148        .elseif \name == XDATA
 149                setxdata %\id
 150        .endif
 151.noaltmacro
 152.endm
 153
 154/*
 155 * do_aes num_in_par load_keys key_len
 156 * This increments p_in, but not p_out
 157 */
 158.macro do_aes b, k, key_len
 159        .set by, \b
 160        .set load_keys, \k
 161        .set klen, \key_len
 162
 163        .if (load_keys)
 164                vmovdqa 0*16(p_keys), xkey0
 165        .endif
 166
 167        vpshufb xbyteswap, xcounter, xdata0
 168
 169        .set i, 1
 170        .rept (by - 1)
 171                club DDQ_DATA, i
 172                club XDATA, i
 173                vpaddq  var_ddq_add(%rip), xcounter, var_xdata
 174                vptest  ddq_low_msk(%rip), var_xdata
 175                jnz 1f
 176                vpaddq  ddq_high_add_1(%rip), var_xdata, var_xdata
 177                vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
 178                1:
 179                vpshufb xbyteswap, var_xdata, var_xdata
 180                .set i, (i +1)
 181        .endr
 182
 183        vmovdqa 1*16(p_keys), xkeyA
 184
 185        vpxor   xkey0, xdata0, xdata0
 186        club DDQ_DATA, by
 187        vpaddq  var_ddq_add(%rip), xcounter, xcounter
 188        vptest  ddq_low_msk(%rip), xcounter
 189        jnz     1f
 190        vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
 191        1:
 192
 193        .set i, 1
 194        .rept (by - 1)
 195                club XDATA, i
 196                vpxor   xkey0, var_xdata, var_xdata
 197                .set i, (i +1)
 198        .endr
 199
 200        vmovdqa 2*16(p_keys), xkeyB
 201
 202        .set i, 0
 203        .rept by
 204                club XDATA, i
 205                vaesenc xkeyA, var_xdata, var_xdata             /* key 1 */
 206                .set i, (i +1)
 207        .endr
 208
 209        .if (klen == KEY_128)
 210                .if (load_keys)
 211                        vmovdqa 3*16(p_keys), xkey4
 212                .endif
 213        .else
 214                vmovdqa 3*16(p_keys), xkeyA
 215        .endif
 216
 217        .set i, 0
 218        .rept by
 219                club XDATA, i
 220                vaesenc xkeyB, var_xdata, var_xdata             /* key 2 */
 221                .set i, (i +1)
 222        .endr
 223
 224        add     $(16*by), p_in
 225
 226        .if (klen == KEY_128)
 227                vmovdqa 4*16(p_keys), xkeyB
 228        .else
 229                .if (load_keys)
 230                        vmovdqa 4*16(p_keys), xkey4
 231                .endif
 232        .endif
 233
 234        .set i, 0
 235        .rept by
 236                club XDATA, i
 237                /* key 3 */
 238                .if (klen == KEY_128)
 239                        vaesenc xkey4, var_xdata, var_xdata
 240                .else
 241                        vaesenc xkeyA, var_xdata, var_xdata
 242                .endif
 243                .set i, (i +1)
 244        .endr
 245
 246        vmovdqa 5*16(p_keys), xkeyA
 247
 248        .set i, 0
 249        .rept by
 250                club XDATA, i
 251                /* key 4 */
 252                .if (klen == KEY_128)
 253                        vaesenc xkeyB, var_xdata, var_xdata
 254                .else
 255                        vaesenc xkey4, var_xdata, var_xdata
 256                .endif
 257                .set i, (i +1)
 258        .endr
 259
 260        .if (klen == KEY_128)
 261                .if (load_keys)
 262                        vmovdqa 6*16(p_keys), xkey8
 263                .endif
 264        .else
 265                vmovdqa 6*16(p_keys), xkeyB
 266        .endif
 267
 268        .set i, 0
 269        .rept by
 270                club XDATA, i
 271                vaesenc xkeyA, var_xdata, var_xdata             /* key 5 */
 272                .set i, (i +1)
 273        .endr
 274
 275        vmovdqa 7*16(p_keys), xkeyA
 276
 277        .set i, 0
 278        .rept by
 279                club XDATA, i
 280                /* key 6 */
 281                .if (klen == KEY_128)
 282                        vaesenc xkey8, var_xdata, var_xdata
 283                .else
 284                        vaesenc xkeyB, var_xdata, var_xdata
 285                .endif
 286                .set i, (i +1)
 287        .endr
 288
 289        .if (klen == KEY_128)
 290                vmovdqa 8*16(p_keys), xkeyB
 291        .else
 292                .if (load_keys)
 293                        vmovdqa 8*16(p_keys), xkey8
 294                .endif
 295        .endif
 296
 297        .set i, 0
 298        .rept by
 299                club XDATA, i
 300                vaesenc xkeyA, var_xdata, var_xdata             /* key 7 */
 301                .set i, (i +1)
 302        .endr
 303
 304        .if (klen == KEY_128)
 305                .if (load_keys)
 306                        vmovdqa 9*16(p_keys), xkey12
 307                .endif
 308        .else
 309                vmovdqa 9*16(p_keys), xkeyA
 310        .endif
 311
 312        .set i, 0
 313        .rept by
 314                club XDATA, i
 315                /* key 8 */
 316                .if (klen == KEY_128)
 317                        vaesenc xkeyB, var_xdata, var_xdata
 318                .else
 319                        vaesenc xkey8, var_xdata, var_xdata
 320                .endif
 321                .set i, (i +1)
 322        .endr
 323
 324        vmovdqa 10*16(p_keys), xkeyB
 325
 326        .set i, 0
 327        .rept by
 328                club XDATA, i
 329                /* key 9 */
 330                .if (klen == KEY_128)
 331                        vaesenc xkey12, var_xdata, var_xdata
 332                .else
 333                        vaesenc xkeyA, var_xdata, var_xdata
 334                .endif
 335                .set i, (i +1)
 336        .endr
 337
 338        .if (klen != KEY_128)
 339                vmovdqa 11*16(p_keys), xkeyA
 340        .endif
 341
 342        .set i, 0
 343        .rept by
 344                club XDATA, i
 345                /* key 10 */
 346                .if (klen == KEY_128)
 347                        vaesenclast     xkeyB, var_xdata, var_xdata
 348                .else
 349                        vaesenc xkeyB, var_xdata, var_xdata
 350                .endif
 351                .set i, (i +1)
 352        .endr
 353
 354        .if (klen != KEY_128)
 355                .if (load_keys)
 356                        vmovdqa 12*16(p_keys), xkey12
 357                .endif
 358
 359                .set i, 0
 360                .rept by
 361                        club XDATA, i
 362                        vaesenc xkeyA, var_xdata, var_xdata     /* key 11 */
 363                        .set i, (i +1)
 364                .endr
 365
 366                .if (klen == KEY_256)
 367                        vmovdqa 13*16(p_keys), xkeyA
 368                .endif
 369
 370                .set i, 0
 371                .rept by
 372                        club XDATA, i
 373                        .if (klen == KEY_256)
 374                                /* key 12 */
 375                                vaesenc xkey12, var_xdata, var_xdata
 376                        .else
 377                                vaesenclast xkey12, var_xdata, var_xdata
 378                        .endif
 379                        .set i, (i +1)
 380                .endr
 381
 382                .if (klen == KEY_256)
 383                        vmovdqa 14*16(p_keys), xkeyB
 384
 385                        .set i, 0
 386                        .rept by
 387                                club XDATA, i
 388                                /* key 13 */
 389                                vaesenc xkeyA, var_xdata, var_xdata
 390                                .set i, (i +1)
 391                        .endr
 392
 393                        .set i, 0
 394                        .rept by
 395                                club XDATA, i
 396                                /* key 14 */
 397                                vaesenclast     xkeyB, var_xdata, var_xdata
 398                                .set i, (i +1)
 399                        .endr
 400                .endif
 401        .endif
 402
 403        .set i, 0
 404        .rept (by / 2)
 405                .set j, (i+1)
 406                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
 407                VMOVDQ  (j*16 - 16*by)(p_in), xkeyB
 408                club XDATA, i
 409                vpxor   xkeyA, var_xdata, var_xdata
 410                club XDATA, j
 411                vpxor   xkeyB, var_xdata, var_xdata
 412                .set i, (i+2)
 413        .endr
 414
 415        .if (i < by)
 416                VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
 417                club XDATA, i
 418                vpxor   xkeyA, var_xdata, var_xdata
 419        .endif
 420
 421        .set i, 0
 422        .rept by
 423                club XDATA, i
 424                VMOVDQ  var_xdata, i*16(p_out)
 425                .set i, (i+1)
 426        .endr
 427.endm
 428
 429.macro do_aes_load val, key_len
 430        do_aes \val, 1, \key_len
 431.endm
 432
 433.macro do_aes_noload val, key_len
 434        do_aes \val, 0, \key_len
 435.endm
 436
 437/* main body of aes ctr load */
 438
 439.macro do_aes_ctrmain key_len
 440        cmp     $16, num_bytes
 441        jb      .Ldo_return2\key_len
 442
 443        vmovdqa byteswap_const(%rip), xbyteswap
 444        vmovdqu (p_iv), xcounter
 445        vpshufb xbyteswap, xcounter, xcounter
 446
 447        mov     num_bytes, tmp
 448        and     $(7*16), tmp
 449        jz      .Lmult_of_8_blks\key_len
 450
 451        /* 1 <= tmp <= 7 */
 452        cmp     $(4*16), tmp
 453        jg      .Lgt4\key_len
 454        je      .Leq4\key_len
 455
 456.Llt4\key_len:
 457        cmp     $(2*16), tmp
 458        jg      .Leq3\key_len
 459        je      .Leq2\key_len
 460
 461.Leq1\key_len:
 462        do_aes_load     1, \key_len
 463        add     $(1*16), p_out
 464        and     $(~7*16), num_bytes
 465        jz      .Ldo_return2\key_len
 466        jmp     .Lmain_loop2\key_len
 467
 468.Leq2\key_len:
 469        do_aes_load     2, \key_len
 470        add     $(2*16), p_out
 471        and     $(~7*16), num_bytes
 472        jz      .Ldo_return2\key_len
 473        jmp     .Lmain_loop2\key_len
 474
 475
 476.Leq3\key_len:
 477        do_aes_load     3, \key_len
 478        add     $(3*16), p_out
 479        and     $(~7*16), num_bytes
 480        jz      .Ldo_return2\key_len
 481        jmp     .Lmain_loop2\key_len
 482
 483.Leq4\key_len:
 484        do_aes_load     4, \key_len
 485        add     $(4*16), p_out
 486        and     $(~7*16), num_bytes
 487        jz      .Ldo_return2\key_len
 488        jmp     .Lmain_loop2\key_len
 489
 490.Lgt4\key_len:
 491        cmp     $(6*16), tmp
 492        jg      .Leq7\key_len
 493        je      .Leq6\key_len
 494
 495.Leq5\key_len:
 496        do_aes_load     5, \key_len
 497        add     $(5*16), p_out
 498        and     $(~7*16), num_bytes
 499        jz      .Ldo_return2\key_len
 500        jmp     .Lmain_loop2\key_len
 501
 502.Leq6\key_len:
 503        do_aes_load     6, \key_len
 504        add     $(6*16), p_out
 505        and     $(~7*16), num_bytes
 506        jz      .Ldo_return2\key_len
 507        jmp     .Lmain_loop2\key_len
 508
 509.Leq7\key_len:
 510        do_aes_load     7, \key_len
 511        add     $(7*16), p_out
 512        and     $(~7*16), num_bytes
 513        jz      .Ldo_return2\key_len
 514        jmp     .Lmain_loop2\key_len
 515
 516.Lmult_of_8_blks\key_len:
 517        .if (\key_len != KEY_128)
 518                vmovdqa 0*16(p_keys), xkey0
 519                vmovdqa 4*16(p_keys), xkey4
 520                vmovdqa 8*16(p_keys), xkey8
 521                vmovdqa 12*16(p_keys), xkey12
 522        .else
 523                vmovdqa 0*16(p_keys), xkey0
 524                vmovdqa 3*16(p_keys), xkey4
 525                vmovdqa 6*16(p_keys), xkey8
 526                vmovdqa 9*16(p_keys), xkey12
 527        .endif
 528.align 16
 529.Lmain_loop2\key_len:
 530        /* num_bytes is a multiple of 8 and >0 */
 531        do_aes_noload   8, \key_len
 532        add     $(8*16), p_out
 533        sub     $(8*16), num_bytes
 534        jne     .Lmain_loop2\key_len
 535
 536.Ldo_return2\key_len:
 537        /* return updated IV */
 538        vpshufb xbyteswap, xcounter, xcounter
 539        vmovdqu xcounter, (p_iv)
 540        ret
 541.endm
 542
 543/*
 544 * routine to do AES128 CTR enc/decrypt "by8"
 545 * XMM registers are clobbered.
 546 * Saving/restoring must be done at a higher level
 547 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 548 *                      unsigned int num_bytes)
 549 */
 550ENTRY(aes_ctr_enc_128_avx_by8)
 551        /* call the aes main loop */
 552        do_aes_ctrmain KEY_128
 553
 554ENDPROC(aes_ctr_enc_128_avx_by8)
 555
 556/*
 557 * routine to do AES192 CTR enc/decrypt "by8"
 558 * XMM registers are clobbered.
 559 * Saving/restoring must be done at a higher level
 560 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 561 *                      unsigned int num_bytes)
 562 */
 563ENTRY(aes_ctr_enc_192_avx_by8)
 564        /* call the aes main loop */
 565        do_aes_ctrmain KEY_192
 566
 567ENDPROC(aes_ctr_enc_192_avx_by8)
 568
 569/*
 570 * routine to do AES256 CTR enc/decrypt "by8"
 571 * XMM registers are clobbered.
 572 * Saving/restoring must be done at a higher level
 573 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 574 *                      unsigned int num_bytes)
 575 */
 576ENTRY(aes_ctr_enc_256_avx_by8)
 577        /* call the aes main loop */
 578        do_aes_ctrmain KEY_256
 579
 580ENDPROC(aes_ctr_enc_256_avx_by8)
 581