linux/arch/x86/crypto/aegis128l-aesni-asm.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * AES-NI + SSE2 implementation of AEGIS-128L
   4 *
   5 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
   6 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
   7 */
   8
   9#include <linux/linkage.h>
  10#include <asm/frame.h>
  11
  12#define STATE0  %xmm0
  13#define STATE1  %xmm1
  14#define STATE2  %xmm2
  15#define STATE3  %xmm3
  16#define STATE4  %xmm4
  17#define STATE5  %xmm5
  18#define STATE6  %xmm6
  19#define STATE7  %xmm7
  20#define MSG0    %xmm8
  21#define MSG1    %xmm9
  22#define T0      %xmm10
  23#define T1      %xmm11
  24#define T2      %xmm12
  25#define T3      %xmm13
  26
  27#define STATEP  %rdi
  28#define LEN     %rsi
  29#define SRC     %rdx
  30#define DST     %rcx
  31
  32.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
  33.align 16
  34.Laegis128l_const_0:
  35        .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
  36        .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
  37.Laegis128l_const_1:
  38        .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
  39        .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
  40
  41.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
  42.align 16
  43.Laegis128l_counter0:
  44        .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  45        .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  46.Laegis128l_counter1:
  47        .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
  48        .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
  49
  50.text
  51
  52/*
  53 * __load_partial: internal ABI
  54 * input:
  55 *   LEN - bytes
  56 *   SRC - src
  57 * output:
  58 *   MSG0 - first message block
  59 *   MSG1 - second message block
  60 * changed:
  61 *   T0
  62 *   %r8
  63 *   %r9
  64 */
  65__load_partial:
  66        xor %r9d, %r9d
  67        pxor MSG0, MSG0
  68        pxor MSG1, MSG1
  69
  70        mov LEN, %r8
  71        and $0x1, %r8
  72        jz .Lld_partial_1
  73
  74        mov LEN, %r8
  75        and $0x1E, %r8
  76        add SRC, %r8
  77        mov (%r8), %r9b
  78
  79.Lld_partial_1:
  80        mov LEN, %r8
  81        and $0x2, %r8
  82        jz .Lld_partial_2
  83
  84        mov LEN, %r8
  85        and $0x1C, %r8
  86        add SRC, %r8
  87        shl $0x10, %r9
  88        mov (%r8), %r9w
  89
  90.Lld_partial_2:
  91        mov LEN, %r8
  92        and $0x4, %r8
  93        jz .Lld_partial_4
  94
  95        mov LEN, %r8
  96        and $0x18, %r8
  97        add SRC, %r8
  98        shl $32, %r9
  99        mov (%r8), %r8d
 100        xor %r8, %r9
 101
 102.Lld_partial_4:
 103        movq %r9, MSG0
 104
 105        mov LEN, %r8
 106        and $0x8, %r8
 107        jz .Lld_partial_8
 108
 109        mov LEN, %r8
 110        and $0x10, %r8
 111        add SRC, %r8
 112        pslldq $8, MSG0
 113        movq (%r8), T0
 114        pxor T0, MSG0
 115
 116.Lld_partial_8:
 117        mov LEN, %r8
 118        and $0x10, %r8
 119        jz .Lld_partial_16
 120
 121        movdqa MSG0, MSG1
 122        movdqu (SRC), MSG0
 123
 124.Lld_partial_16:
 125        ret
 126ENDPROC(__load_partial)
 127
 128/*
 129 * __store_partial: internal ABI
 130 * input:
 131 *   LEN - bytes
 132 *   DST - dst
 133 * output:
 134 *   T0   - first message block
 135 *   T1   - second message block
 136 * changed:
 137 *   %r8
 138 *   %r9
 139 *   %r10
 140 */
 141__store_partial:
 142        mov LEN, %r8
 143        mov DST, %r9
 144
 145        cmp $16, %r8
 146        jl .Lst_partial_16
 147
 148        movdqu T0, (%r9)
 149        movdqa T1, T0
 150
 151        sub $16, %r8
 152        add $16, %r9
 153
 154.Lst_partial_16:
 155        movq T0, %r10
 156
 157        cmp $8, %r8
 158        jl .Lst_partial_8
 159
 160        mov %r10, (%r9)
 161        psrldq $8, T0
 162        movq T0, %r10
 163
 164        sub $8, %r8
 165        add $8, %r9
 166
 167.Lst_partial_8:
 168        cmp $4, %r8
 169        jl .Lst_partial_4
 170
 171        mov %r10d, (%r9)
 172        shr $32, %r10
 173
 174        sub $4, %r8
 175        add $4, %r9
 176
 177.Lst_partial_4:
 178        cmp $2, %r8
 179        jl .Lst_partial_2
 180
 181        mov %r10w, (%r9)
 182        shr $0x10, %r10
 183
 184        sub $2, %r8
 185        add $2, %r9
 186
 187.Lst_partial_2:
 188        cmp $1, %r8
 189        jl .Lst_partial_1
 190
 191        mov %r10b, (%r9)
 192
 193.Lst_partial_1:
 194        ret
 195ENDPROC(__store_partial)
 196
 197.macro update
 198        movdqa STATE7, T0
 199        aesenc STATE0, STATE7
 200        aesenc STATE1, STATE0
 201        aesenc STATE2, STATE1
 202        aesenc STATE3, STATE2
 203        aesenc STATE4, STATE3
 204        aesenc STATE5, STATE4
 205        aesenc STATE6, STATE5
 206        aesenc T0,     STATE6
 207.endm
 208
 209.macro update0
 210        update
 211        pxor MSG0, STATE7
 212        pxor MSG1, STATE3
 213.endm
 214
 215.macro update1
 216        update
 217        pxor MSG0, STATE6
 218        pxor MSG1, STATE2
 219.endm
 220
 221.macro update2
 222        update
 223        pxor MSG0, STATE5
 224        pxor MSG1, STATE1
 225.endm
 226
 227.macro update3
 228        update
 229        pxor MSG0, STATE4
 230        pxor MSG1, STATE0
 231.endm
 232
 233.macro update4
 234        update
 235        pxor MSG0, STATE3
 236        pxor MSG1, STATE7
 237.endm
 238
 239.macro update5
 240        update
 241        pxor MSG0, STATE2
 242        pxor MSG1, STATE6
 243.endm
 244
 245.macro update6
 246        update
 247        pxor MSG0, STATE1
 248        pxor MSG1, STATE5
 249.endm
 250
 251.macro update7
 252        update
 253        pxor MSG0, STATE0
 254        pxor MSG1, STATE4
 255.endm
 256
 257.macro state_load
 258        movdqu 0x00(STATEP), STATE0
 259        movdqu 0x10(STATEP), STATE1
 260        movdqu 0x20(STATEP), STATE2
 261        movdqu 0x30(STATEP), STATE3
 262        movdqu 0x40(STATEP), STATE4
 263        movdqu 0x50(STATEP), STATE5
 264        movdqu 0x60(STATEP), STATE6
 265        movdqu 0x70(STATEP), STATE7
 266.endm
 267
 268.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
 269        movdqu \s7, 0x00(STATEP)
 270        movdqu \s0, 0x10(STATEP)
 271        movdqu \s1, 0x20(STATEP)
 272        movdqu \s2, 0x30(STATEP)
 273        movdqu \s3, 0x40(STATEP)
 274        movdqu \s4, 0x50(STATEP)
 275        movdqu \s5, 0x60(STATEP)
 276        movdqu \s6, 0x70(STATEP)
 277.endm
 278
 279.macro state_store0
 280        state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
 281.endm
 282
 283.macro state_store1
 284        state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
 285.endm
 286
 287.macro state_store2
 288        state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
 289.endm
 290
 291.macro state_store3
 292        state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
 293.endm
 294
 295.macro state_store4
 296        state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
 297.endm
 298
 299.macro state_store5
 300        state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
 301.endm
 302
 303.macro state_store6
 304        state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
 305.endm
 306
 307.macro state_store7
 308        state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
 309.endm
 310
 311/*
 312 * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
 313 */
 314ENTRY(crypto_aegis128l_aesni_init)
 315        FRAME_BEGIN
 316
 317        /* load key: */
 318        movdqa (%rsi), MSG1
 319        movdqa MSG1, STATE0
 320        movdqa MSG1, STATE4
 321        movdqa MSG1, STATE5
 322        movdqa MSG1, STATE6
 323        movdqa MSG1, STATE7
 324
 325        /* load IV: */
 326        movdqu (%rdx), MSG0
 327        pxor MSG0, STATE0
 328        pxor MSG0, STATE4
 329
 330        /* load the constants: */
 331        movdqa .Laegis128l_const_0, STATE2
 332        movdqa .Laegis128l_const_1, STATE1
 333        movdqa STATE1, STATE3
 334        pxor STATE2, STATE5
 335        pxor STATE1, STATE6
 336        pxor STATE2, STATE7
 337
 338        /* update 10 times with IV and KEY: */
 339        update0
 340        update1
 341        update2
 342        update3
 343        update4
 344        update5
 345        update6
 346        update7
 347        update0
 348        update1
 349
 350        state_store1
 351
 352        FRAME_END
 353        ret
 354ENDPROC(crypto_aegis128l_aesni_init)
 355
 356.macro ad_block a i
 357        movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
 358        movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
 359        update\i
 360        sub $0x20, LEN
 361        cmp $0x20, LEN
 362        jl .Lad_out_\i
 363.endm
 364
 365/*
 366 * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
 367 *                                const void *data);
 368 */
 369ENTRY(crypto_aegis128l_aesni_ad)
 370        FRAME_BEGIN
 371
 372        cmp $0x20, LEN
 373        jb .Lad_out
 374
 375        state_load
 376
 377        mov  SRC, %r8
 378        and $0xf, %r8
 379        jnz .Lad_u_loop
 380
 381.align 8
 382.Lad_a_loop:
 383        ad_block a 0
 384        ad_block a 1
 385        ad_block a 2
 386        ad_block a 3
 387        ad_block a 4
 388        ad_block a 5
 389        ad_block a 6
 390        ad_block a 7
 391
 392        add $0x100, SRC
 393        jmp .Lad_a_loop
 394
 395.align 8
 396.Lad_u_loop:
 397        ad_block u 0
 398        ad_block u 1
 399        ad_block u 2
 400        ad_block u 3
 401        ad_block u 4
 402        ad_block u 5
 403        ad_block u 6
 404        ad_block u 7
 405
 406        add $0x100, SRC
 407        jmp .Lad_u_loop
 408
 409.Lad_out_0:
 410        state_store0
 411        FRAME_END
 412        ret
 413
 414.Lad_out_1:
 415        state_store1
 416        FRAME_END
 417        ret
 418
 419.Lad_out_2:
 420        state_store2
 421        FRAME_END
 422        ret
 423
 424.Lad_out_3:
 425        state_store3
 426        FRAME_END
 427        ret
 428
 429.Lad_out_4:
 430        state_store4
 431        FRAME_END
 432        ret
 433
 434.Lad_out_5:
 435        state_store5
 436        FRAME_END
 437        ret
 438
 439.Lad_out_6:
 440        state_store6
 441        FRAME_END
 442        ret
 443
 444.Lad_out_7:
 445        state_store7
 446        FRAME_END
 447        ret
 448
 449.Lad_out:
 450        FRAME_END
 451        ret
 452ENDPROC(crypto_aegis128l_aesni_ad)
 453
 454.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
 455        pxor \s1, \m0
 456        pxor \s6, \m0
 457        movdqa \s2, T3
 458        pand \s3, T3
 459        pxor T3, \m0
 460
 461        pxor \s2, \m1
 462        pxor \s5, \m1
 463        movdqa \s6, T3
 464        pand \s7, T3
 465        pxor T3, \m1
 466.endm
 467
 468.macro crypt0 m0 m1
 469        crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
 470.endm
 471
 472.macro crypt1 m0 m1
 473        crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
 474.endm
 475
 476.macro crypt2 m0 m1
 477        crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
 478.endm
 479
 480.macro crypt3 m0 m1
 481        crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
 482.endm
 483
 484.macro crypt4 m0 m1
 485        crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
 486.endm
 487
 488.macro crypt5 m0 m1
 489        crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
 490.endm
 491
 492.macro crypt6 m0 m1
 493        crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
 494.endm
 495
 496.macro crypt7 m0 m1
 497        crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
 498.endm
 499
 500.macro encrypt_block a i
 501        movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
 502        movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
 503        movdqa MSG0, T0
 504        movdqa MSG1, T1
 505        crypt\i T0, T1
 506        movdq\a T0, (\i * 0x20 + 0x00)(DST)
 507        movdq\a T1, (\i * 0x20 + 0x10)(DST)
 508
 509        update\i
 510
 511        sub $0x20, LEN
 512        cmp $0x20, LEN
 513        jl .Lenc_out_\i
 514.endm
 515
 516.macro decrypt_block a i
 517        movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
 518        movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
 519        crypt\i MSG0, MSG1
 520        movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
 521        movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
 522
 523        update\i
 524
 525        sub $0x20, LEN
 526        cmp $0x20, LEN
 527        jl .Ldec_out_\i
 528.endm
 529
 530/*
 531 * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
 532 *                                 const void *src, void *dst);
 533 */
 534ENTRY(crypto_aegis128l_aesni_enc)
 535        FRAME_BEGIN
 536
 537        cmp $0x20, LEN
 538        jb .Lenc_out
 539
 540        state_load
 541
 542        mov  SRC, %r8
 543        or   DST, %r8
 544        and $0xf, %r8
 545        jnz .Lenc_u_loop
 546
 547.align 8
 548.Lenc_a_loop:
 549        encrypt_block a 0
 550        encrypt_block a 1
 551        encrypt_block a 2
 552        encrypt_block a 3
 553        encrypt_block a 4
 554        encrypt_block a 5
 555        encrypt_block a 6
 556        encrypt_block a 7
 557
 558        add $0x100, SRC
 559        add $0x100, DST
 560        jmp .Lenc_a_loop
 561
 562.align 8
 563.Lenc_u_loop:
 564        encrypt_block u 0
 565        encrypt_block u 1
 566        encrypt_block u 2
 567        encrypt_block u 3
 568        encrypt_block u 4
 569        encrypt_block u 5
 570        encrypt_block u 6
 571        encrypt_block u 7
 572
 573        add $0x100, SRC
 574        add $0x100, DST
 575        jmp .Lenc_u_loop
 576
 577.Lenc_out_0:
 578        state_store0
 579        FRAME_END
 580        ret
 581
 582.Lenc_out_1:
 583        state_store1
 584        FRAME_END
 585        ret
 586
 587.Lenc_out_2:
 588        state_store2
 589        FRAME_END
 590        ret
 591
 592.Lenc_out_3:
 593        state_store3
 594        FRAME_END
 595        ret
 596
 597.Lenc_out_4:
 598        state_store4
 599        FRAME_END
 600        ret
 601
 602.Lenc_out_5:
 603        state_store5
 604        FRAME_END
 605        ret
 606
 607.Lenc_out_6:
 608        state_store6
 609        FRAME_END
 610        ret
 611
 612.Lenc_out_7:
 613        state_store7
 614        FRAME_END
 615        ret
 616
 617.Lenc_out:
 618        FRAME_END
 619        ret
 620ENDPROC(crypto_aegis128l_aesni_enc)
 621
 622/*
 623 * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
 624 *                                      const void *src, void *dst);
 625 */
 626ENTRY(crypto_aegis128l_aesni_enc_tail)
 627        FRAME_BEGIN
 628
 629        state_load
 630
 631        /* encrypt message: */
 632        call __load_partial
 633
 634        movdqa MSG0, T0
 635        movdqa MSG1, T1
 636        crypt0 T0, T1
 637
 638        call __store_partial
 639
 640        update0
 641
 642        state_store0
 643
 644        FRAME_END
 645        ret
 646ENDPROC(crypto_aegis128l_aesni_enc_tail)
 647
 648/*
 649 * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
 650 *                                 const void *src, void *dst);
 651 */
 652ENTRY(crypto_aegis128l_aesni_dec)
 653        FRAME_BEGIN
 654
 655        cmp $0x20, LEN
 656        jb .Ldec_out
 657
 658        state_load
 659
 660        mov  SRC, %r8
 661        or   DST, %r8
 662        and $0xF, %r8
 663        jnz .Ldec_u_loop
 664
 665.align 8
 666.Ldec_a_loop:
 667        decrypt_block a 0
 668        decrypt_block a 1
 669        decrypt_block a 2
 670        decrypt_block a 3
 671        decrypt_block a 4
 672        decrypt_block a 5
 673        decrypt_block a 6
 674        decrypt_block a 7
 675
 676        add $0x100, SRC
 677        add $0x100, DST
 678        jmp .Ldec_a_loop
 679
 680.align 8
 681.Ldec_u_loop:
 682        decrypt_block u 0
 683        decrypt_block u 1
 684        decrypt_block u 2
 685        decrypt_block u 3
 686        decrypt_block u 4
 687        decrypt_block u 5
 688        decrypt_block u 6
 689        decrypt_block u 7
 690
 691        add $0x100, SRC
 692        add $0x100, DST
 693        jmp .Ldec_u_loop
 694
 695.Ldec_out_0:
 696        state_store0
 697        FRAME_END
 698        ret
 699
 700.Ldec_out_1:
 701        state_store1
 702        FRAME_END
 703        ret
 704
 705.Ldec_out_2:
 706        state_store2
 707        FRAME_END
 708        ret
 709
 710.Ldec_out_3:
 711        state_store3
 712        FRAME_END
 713        ret
 714
 715.Ldec_out_4:
 716        state_store4
 717        FRAME_END
 718        ret
 719
 720.Ldec_out_5:
 721        state_store5
 722        FRAME_END
 723        ret
 724
 725.Ldec_out_6:
 726        state_store6
 727        FRAME_END
 728        ret
 729
 730.Ldec_out_7:
 731        state_store7
 732        FRAME_END
 733        ret
 734
 735.Ldec_out:
 736        FRAME_END
 737        ret
 738ENDPROC(crypto_aegis128l_aesni_dec)
 739
 740/*
 741 * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
 742 *                                      const void *src, void *dst);
 743 */
 744ENTRY(crypto_aegis128l_aesni_dec_tail)
 745        FRAME_BEGIN
 746
 747        state_load
 748
 749        /* decrypt message: */
 750        call __load_partial
 751
 752        crypt0 MSG0, MSG1
 753
 754        movdqa MSG0, T0
 755        movdqa MSG1, T1
 756        call __store_partial
 757
 758        /* mask with byte count: */
 759        movq LEN, T0
 760        punpcklbw T0, T0
 761        punpcklbw T0, T0
 762        punpcklbw T0, T0
 763        punpcklbw T0, T0
 764        movdqa T0, T1
 765        movdqa .Laegis128l_counter0, T2
 766        movdqa .Laegis128l_counter1, T3
 767        pcmpgtb T2, T0
 768        pcmpgtb T3, T1
 769        pand T0, MSG0
 770        pand T1, MSG1
 771
 772        update0
 773
 774        state_store0
 775
 776        FRAME_END
 777        ret
 778ENDPROC(crypto_aegis128l_aesni_dec_tail)
 779
 780/*
 781 * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
 782 *                                   u64 assoclen, u64 cryptlen);
 783 */
 784ENTRY(crypto_aegis128l_aesni_final)
 785        FRAME_BEGIN
 786
 787        state_load
 788
 789        /* prepare length block: */
 790        movq %rdx, MSG0
 791        movq %rcx, T0
 792        pslldq $8, T0
 793        pxor T0, MSG0
 794        psllq $3, MSG0 /* multiply by 8 (to get bit count) */
 795
 796        pxor STATE2, MSG0
 797        movdqa MSG0, MSG1
 798
 799        /* update state: */
 800        update0
 801        update1
 802        update2
 803        update3
 804        update4
 805        update5
 806        update6
 807
 808        /* xor tag: */
 809        movdqu (%rsi), T0
 810
 811        pxor STATE1, T0
 812        pxor STATE2, T0
 813        pxor STATE3, T0
 814        pxor STATE4, T0
 815        pxor STATE5, T0
 816        pxor STATE6, T0
 817        pxor STATE7, T0
 818
 819        movdqu T0, (%rsi)
 820
 821        FRAME_END
 822        ret
 823ENDPROC(crypto_aegis128l_aesni_final)
 824