linux/arch/x86/crypto/aegis128-aesni-asm.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * AES-NI + SSE2 implementation of AEGIS-128
   4 *
   5 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
   6 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
   7 */
   8
   9#include <linux/linkage.h>
  10#include <asm/frame.h>
  11
  12#define STATE0  %xmm0
  13#define STATE1  %xmm1
  14#define STATE2  %xmm2
  15#define STATE3  %xmm3
  16#define STATE4  %xmm4
  17#define KEY     %xmm5
  18#define MSG     %xmm5
  19#define T0      %xmm6
  20#define T1      %xmm7
  21
  22#define STATEP  %rdi
  23#define LEN     %rsi
  24#define SRC     %rdx
  25#define DST     %rcx
  26
  27.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
  28.align 16
  29.Laegis128_const_0:
  30        .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
  31        .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
  32.Laegis128_const_1:
  33        .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
  34        .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
  35
  36.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
  37.align 16
  38.Laegis128_counter:
  39        .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  40        .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  41
  42.text
  43
  44/*
  45 * aegis128_update
  46 * input:
  47 *   STATE[0-4] - input state
  48 * output:
  49 *   STATE[0-4] - output state (shifted positions)
  50 * changed:
  51 *   T0
  52 */
  53.macro aegis128_update
  54        movdqa STATE4, T0
  55        aesenc STATE0, STATE4
  56        aesenc STATE1, STATE0
  57        aesenc STATE2, STATE1
  58        aesenc STATE3, STATE2
  59        aesenc T0,     STATE3
  60.endm
  61
  62/*
  63 * __load_partial: internal ABI
  64 * input:
  65 *   LEN - bytes
  66 *   SRC - src
  67 * output:
  68 *   MSG  - message block
  69 * changed:
  70 *   T0
  71 *   %r8
  72 *   %r9
  73 */
  74__load_partial:
  75        xor %r9d, %r9d
  76        pxor MSG, MSG
  77
  78        mov LEN, %r8
  79        and $0x1, %r8
  80        jz .Lld_partial_1
  81
  82        mov LEN, %r8
  83        and $0x1E, %r8
  84        add SRC, %r8
  85        mov (%r8), %r9b
  86
  87.Lld_partial_1:
  88        mov LEN, %r8
  89        and $0x2, %r8
  90        jz .Lld_partial_2
  91
  92        mov LEN, %r8
  93        and $0x1C, %r8
  94        add SRC, %r8
  95        shl $0x10, %r9
  96        mov (%r8), %r9w
  97
  98.Lld_partial_2:
  99        mov LEN, %r8
 100        and $0x4, %r8
 101        jz .Lld_partial_4
 102
 103        mov LEN, %r8
 104        and $0x18, %r8
 105        add SRC, %r8
 106        shl $32, %r9
 107        mov (%r8), %r8d
 108        xor %r8, %r9
 109
 110.Lld_partial_4:
 111        movq %r9, MSG
 112
 113        mov LEN, %r8
 114        and $0x8, %r8
 115        jz .Lld_partial_8
 116
 117        mov LEN, %r8
 118        and $0x10, %r8
 119        add SRC, %r8
 120        pslldq $8, MSG
 121        movq (%r8), T0
 122        pxor T0, MSG
 123
 124.Lld_partial_8:
 125        ret
 126ENDPROC(__load_partial)
 127
 128/*
 129 * __store_partial: internal ABI
 130 * input:
 131 *   LEN - bytes
 132 *   DST - dst
 133 * output:
 134 *   T0   - message block
 135 * changed:
 136 *   %r8
 137 *   %r9
 138 *   %r10
 139 */
 140__store_partial:
 141        mov LEN, %r8
 142        mov DST, %r9
 143
 144        movq T0, %r10
 145
 146        cmp $8, %r8
 147        jl .Lst_partial_8
 148
 149        mov %r10, (%r9)
 150        psrldq $8, T0
 151        movq T0, %r10
 152
 153        sub $8, %r8
 154        add $8, %r9
 155
 156.Lst_partial_8:
 157        cmp $4, %r8
 158        jl .Lst_partial_4
 159
 160        mov %r10d, (%r9)
 161        shr $32, %r10
 162
 163        sub $4, %r8
 164        add $4, %r9
 165
 166.Lst_partial_4:
 167        cmp $2, %r8
 168        jl .Lst_partial_2
 169
 170        mov %r10w, (%r9)
 171        shr $0x10, %r10
 172
 173        sub $2, %r8
 174        add $2, %r9
 175
 176.Lst_partial_2:
 177        cmp $1, %r8
 178        jl .Lst_partial_1
 179
 180        mov %r10b, (%r9)
 181
 182.Lst_partial_1:
 183        ret
 184ENDPROC(__store_partial)
 185
 186/*
 187 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 188 */
 189ENTRY(crypto_aegis128_aesni_init)
 190        FRAME_BEGIN
 191
 192        /* load IV: */
 193        movdqu (%rdx), T1
 194
 195        /* load key: */
 196        movdqa (%rsi), KEY
 197        pxor KEY, T1
 198        movdqa T1, STATE0
 199        movdqa KEY, STATE3
 200        movdqa KEY, STATE4
 201
 202        /* load the constants: */
 203        movdqa .Laegis128_const_0, STATE2
 204        movdqa .Laegis128_const_1, STATE1
 205        pxor STATE2, STATE3
 206        pxor STATE1, STATE4
 207
 208        /* update 10 times with KEY / KEY xor IV: */
 209        aegis128_update; pxor KEY, STATE4
 210        aegis128_update; pxor T1,  STATE3
 211        aegis128_update; pxor KEY, STATE2
 212        aegis128_update; pxor T1,  STATE1
 213        aegis128_update; pxor KEY, STATE0
 214        aegis128_update; pxor T1,  STATE4
 215        aegis128_update; pxor KEY, STATE3
 216        aegis128_update; pxor T1,  STATE2
 217        aegis128_update; pxor KEY, STATE1
 218        aegis128_update; pxor T1,  STATE0
 219
 220        /* store the state: */
 221        movdqu STATE0, 0x00(STATEP)
 222        movdqu STATE1, 0x10(STATEP)
 223        movdqu STATE2, 0x20(STATEP)
 224        movdqu STATE3, 0x30(STATEP)
 225        movdqu STATE4, 0x40(STATEP)
 226
 227        FRAME_END
 228        ret
 229ENDPROC(crypto_aegis128_aesni_init)
 230
 231/*
 232 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 233 *                               const void *data);
 234 */
 235ENTRY(crypto_aegis128_aesni_ad)
 236        FRAME_BEGIN
 237
 238        cmp $0x10, LEN
 239        jb .Lad_out
 240
 241        /* load the state: */
 242        movdqu 0x00(STATEP), STATE0
 243        movdqu 0x10(STATEP), STATE1
 244        movdqu 0x20(STATEP), STATE2
 245        movdqu 0x30(STATEP), STATE3
 246        movdqu 0x40(STATEP), STATE4
 247
 248        mov SRC, %r8
 249        and $0xF, %r8
 250        jnz .Lad_u_loop
 251
 252.align 8
 253.Lad_a_loop:
 254        movdqa 0x00(SRC), MSG
 255        aegis128_update
 256        pxor MSG, STATE4
 257        sub $0x10, LEN
 258        cmp $0x10, LEN
 259        jl .Lad_out_1
 260
 261        movdqa 0x10(SRC), MSG
 262        aegis128_update
 263        pxor MSG, STATE3
 264        sub $0x10, LEN
 265        cmp $0x10, LEN
 266        jl .Lad_out_2
 267
 268        movdqa 0x20(SRC), MSG
 269        aegis128_update
 270        pxor MSG, STATE2
 271        sub $0x10, LEN
 272        cmp $0x10, LEN
 273        jl .Lad_out_3
 274
 275        movdqa 0x30(SRC), MSG
 276        aegis128_update
 277        pxor MSG, STATE1
 278        sub $0x10, LEN
 279        cmp $0x10, LEN
 280        jl .Lad_out_4
 281
 282        movdqa 0x40(SRC), MSG
 283        aegis128_update
 284        pxor MSG, STATE0
 285        sub $0x10, LEN
 286        cmp $0x10, LEN
 287        jl .Lad_out_0
 288
 289        add $0x50, SRC
 290        jmp .Lad_a_loop
 291
 292.align 8
 293.Lad_u_loop:
 294        movdqu 0x00(SRC), MSG
 295        aegis128_update
 296        pxor MSG, STATE4
 297        sub $0x10, LEN
 298        cmp $0x10, LEN
 299        jl .Lad_out_1
 300
 301        movdqu 0x10(SRC), MSG
 302        aegis128_update
 303        pxor MSG, STATE3
 304        sub $0x10, LEN
 305        cmp $0x10, LEN
 306        jl .Lad_out_2
 307
 308        movdqu 0x20(SRC), MSG
 309        aegis128_update
 310        pxor MSG, STATE2
 311        sub $0x10, LEN
 312        cmp $0x10, LEN
 313        jl .Lad_out_3
 314
 315        movdqu 0x30(SRC), MSG
 316        aegis128_update
 317        pxor MSG, STATE1
 318        sub $0x10, LEN
 319        cmp $0x10, LEN
 320        jl .Lad_out_4
 321
 322        movdqu 0x40(SRC), MSG
 323        aegis128_update
 324        pxor MSG, STATE0
 325        sub $0x10, LEN
 326        cmp $0x10, LEN
 327        jl .Lad_out_0
 328
 329        add $0x50, SRC
 330        jmp .Lad_u_loop
 331
 332        /* store the state: */
 333.Lad_out_0:
 334        movdqu STATE0, 0x00(STATEP)
 335        movdqu STATE1, 0x10(STATEP)
 336        movdqu STATE2, 0x20(STATEP)
 337        movdqu STATE3, 0x30(STATEP)
 338        movdqu STATE4, 0x40(STATEP)
 339        FRAME_END
 340        ret
 341
 342.Lad_out_1:
 343        movdqu STATE4, 0x00(STATEP)
 344        movdqu STATE0, 0x10(STATEP)
 345        movdqu STATE1, 0x20(STATEP)
 346        movdqu STATE2, 0x30(STATEP)
 347        movdqu STATE3, 0x40(STATEP)
 348        FRAME_END
 349        ret
 350
 351.Lad_out_2:
 352        movdqu STATE3, 0x00(STATEP)
 353        movdqu STATE4, 0x10(STATEP)
 354        movdqu STATE0, 0x20(STATEP)
 355        movdqu STATE1, 0x30(STATEP)
 356        movdqu STATE2, 0x40(STATEP)
 357        FRAME_END
 358        ret
 359
 360.Lad_out_3:
 361        movdqu STATE2, 0x00(STATEP)
 362        movdqu STATE3, 0x10(STATEP)
 363        movdqu STATE4, 0x20(STATEP)
 364        movdqu STATE0, 0x30(STATEP)
 365        movdqu STATE1, 0x40(STATEP)
 366        FRAME_END
 367        ret
 368
 369.Lad_out_4:
 370        movdqu STATE1, 0x00(STATEP)
 371        movdqu STATE2, 0x10(STATEP)
 372        movdqu STATE3, 0x20(STATEP)
 373        movdqu STATE4, 0x30(STATEP)
 374        movdqu STATE0, 0x40(STATEP)
 375        FRAME_END
 376        ret
 377
 378.Lad_out:
 379        FRAME_END
 380        ret
 381ENDPROC(crypto_aegis128_aesni_ad)
 382
 383.macro encrypt_block a s0 s1 s2 s3 s4 i
 384        movdq\a (\i * 0x10)(SRC), MSG
 385        movdqa MSG, T0
 386        pxor \s1, T0
 387        pxor \s4, T0
 388        movdqa \s2, T1
 389        pand \s3, T1
 390        pxor T1, T0
 391        movdq\a T0, (\i * 0x10)(DST)
 392
 393        aegis128_update
 394        pxor MSG, \s4
 395
 396        sub $0x10, LEN
 397        cmp $0x10, LEN
 398        jl .Lenc_out_\i
 399.endm
 400
 401/*
 402 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 403 *                                const void *src, void *dst);
 404 */
 405ENTRY(crypto_aegis128_aesni_enc)
 406        FRAME_BEGIN
 407
 408        cmp $0x10, LEN
 409        jb .Lenc_out
 410
 411        /* load the state: */
 412        movdqu 0x00(STATEP), STATE0
 413        movdqu 0x10(STATEP), STATE1
 414        movdqu 0x20(STATEP), STATE2
 415        movdqu 0x30(STATEP), STATE3
 416        movdqu 0x40(STATEP), STATE4
 417
 418        mov  SRC,  %r8
 419        or   DST,  %r8
 420        and $0xF, %r8
 421        jnz .Lenc_u_loop
 422
 423.align 8
 424.Lenc_a_loop:
 425        encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
 426        encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
 427        encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
 428        encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
 429        encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
 430
 431        add $0x50, SRC
 432        add $0x50, DST
 433        jmp .Lenc_a_loop
 434
 435.align 8
 436.Lenc_u_loop:
 437        encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
 438        encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
 439        encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
 440        encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
 441        encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
 442
 443        add $0x50, SRC
 444        add $0x50, DST
 445        jmp .Lenc_u_loop
 446
 447        /* store the state: */
 448.Lenc_out_0:
 449        movdqu STATE4, 0x00(STATEP)
 450        movdqu STATE0, 0x10(STATEP)
 451        movdqu STATE1, 0x20(STATEP)
 452        movdqu STATE2, 0x30(STATEP)
 453        movdqu STATE3, 0x40(STATEP)
 454        FRAME_END
 455        ret
 456
 457.Lenc_out_1:
 458        movdqu STATE3, 0x00(STATEP)
 459        movdqu STATE4, 0x10(STATEP)
 460        movdqu STATE0, 0x20(STATEP)
 461        movdqu STATE1, 0x30(STATEP)
 462        movdqu STATE2, 0x40(STATEP)
 463        FRAME_END
 464        ret
 465
 466.Lenc_out_2:
 467        movdqu STATE2, 0x00(STATEP)
 468        movdqu STATE3, 0x10(STATEP)
 469        movdqu STATE4, 0x20(STATEP)
 470        movdqu STATE0, 0x30(STATEP)
 471        movdqu STATE1, 0x40(STATEP)
 472        FRAME_END
 473        ret
 474
 475.Lenc_out_3:
 476        movdqu STATE1, 0x00(STATEP)
 477        movdqu STATE2, 0x10(STATEP)
 478        movdqu STATE3, 0x20(STATEP)
 479        movdqu STATE4, 0x30(STATEP)
 480        movdqu STATE0, 0x40(STATEP)
 481        FRAME_END
 482        ret
 483
 484.Lenc_out_4:
 485        movdqu STATE0, 0x00(STATEP)
 486        movdqu STATE1, 0x10(STATEP)
 487        movdqu STATE2, 0x20(STATEP)
 488        movdqu STATE3, 0x30(STATEP)
 489        movdqu STATE4, 0x40(STATEP)
 490        FRAME_END
 491        ret
 492
 493.Lenc_out:
 494        FRAME_END
 495        ret
 496ENDPROC(crypto_aegis128_aesni_enc)
 497
 498/*
 499 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 500 *                                     const void *src, void *dst);
 501 */
 502ENTRY(crypto_aegis128_aesni_enc_tail)
 503        FRAME_BEGIN
 504
 505        /* load the state: */
 506        movdqu 0x00(STATEP), STATE0
 507        movdqu 0x10(STATEP), STATE1
 508        movdqu 0x20(STATEP), STATE2
 509        movdqu 0x30(STATEP), STATE3
 510        movdqu 0x40(STATEP), STATE4
 511
 512        /* encrypt message: */
 513        call __load_partial
 514
 515        movdqa MSG, T0
 516        pxor STATE1, T0
 517        pxor STATE4, T0
 518        movdqa STATE2, T1
 519        pand STATE3, T1
 520        pxor T1, T0
 521
 522        call __store_partial
 523
 524        aegis128_update
 525        pxor MSG, STATE4
 526
 527        /* store the state: */
 528        movdqu STATE4, 0x00(STATEP)
 529        movdqu STATE0, 0x10(STATEP)
 530        movdqu STATE1, 0x20(STATEP)
 531        movdqu STATE2, 0x30(STATEP)
 532        movdqu STATE3, 0x40(STATEP)
 533
 534        FRAME_END
 535        ret
 536ENDPROC(crypto_aegis128_aesni_enc_tail)
 537
 538.macro decrypt_block a s0 s1 s2 s3 s4 i
 539        movdq\a (\i * 0x10)(SRC), MSG
 540        pxor \s1, MSG
 541        pxor \s4, MSG
 542        movdqa \s2, T1
 543        pand \s3, T1
 544        pxor T1, MSG
 545        movdq\a MSG, (\i * 0x10)(DST)
 546
 547        aegis128_update
 548        pxor MSG, \s4
 549
 550        sub $0x10, LEN
 551        cmp $0x10, LEN
 552        jl .Ldec_out_\i
 553.endm
 554
 555/*
 556 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 557 *                                const void *src, void *dst);
 558 */
 559ENTRY(crypto_aegis128_aesni_dec)
 560        FRAME_BEGIN
 561
 562        cmp $0x10, LEN
 563        jb .Ldec_out
 564
 565        /* load the state: */
 566        movdqu 0x00(STATEP), STATE0
 567        movdqu 0x10(STATEP), STATE1
 568        movdqu 0x20(STATEP), STATE2
 569        movdqu 0x30(STATEP), STATE3
 570        movdqu 0x40(STATEP), STATE4
 571
 572        mov  SRC, %r8
 573        or   DST, %r8
 574        and $0xF, %r8
 575        jnz .Ldec_u_loop
 576
 577.align 8
 578.Ldec_a_loop:
 579        decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
 580        decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
 581        decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
 582        decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
 583        decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
 584
 585        add $0x50, SRC
 586        add $0x50, DST
 587        jmp .Ldec_a_loop
 588
 589.align 8
 590.Ldec_u_loop:
 591        decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
 592        decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
 593        decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
 594        decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
 595        decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
 596
 597        add $0x50, SRC
 598        add $0x50, DST
 599        jmp .Ldec_u_loop
 600
 601        /* store the state: */
 602.Ldec_out_0:
 603        movdqu STATE4, 0x00(STATEP)
 604        movdqu STATE0, 0x10(STATEP)
 605        movdqu STATE1, 0x20(STATEP)
 606        movdqu STATE2, 0x30(STATEP)
 607        movdqu STATE3, 0x40(STATEP)
 608        FRAME_END
 609        ret
 610
 611.Ldec_out_1:
 612        movdqu STATE3, 0x00(STATEP)
 613        movdqu STATE4, 0x10(STATEP)
 614        movdqu STATE0, 0x20(STATEP)
 615        movdqu STATE1, 0x30(STATEP)
 616        movdqu STATE2, 0x40(STATEP)
 617        FRAME_END
 618        ret
 619
 620.Ldec_out_2:
 621        movdqu STATE2, 0x00(STATEP)
 622        movdqu STATE3, 0x10(STATEP)
 623        movdqu STATE4, 0x20(STATEP)
 624        movdqu STATE0, 0x30(STATEP)
 625        movdqu STATE1, 0x40(STATEP)
 626        FRAME_END
 627        ret
 628
 629.Ldec_out_3:
 630        movdqu STATE1, 0x00(STATEP)
 631        movdqu STATE2, 0x10(STATEP)
 632        movdqu STATE3, 0x20(STATEP)
 633        movdqu STATE4, 0x30(STATEP)
 634        movdqu STATE0, 0x40(STATEP)
 635        FRAME_END
 636        ret
 637
 638.Ldec_out_4:
 639        movdqu STATE0, 0x00(STATEP)
 640        movdqu STATE1, 0x10(STATEP)
 641        movdqu STATE2, 0x20(STATEP)
 642        movdqu STATE3, 0x30(STATEP)
 643        movdqu STATE4, 0x40(STATEP)
 644        FRAME_END
 645        ret
 646
 647.Ldec_out:
 648        FRAME_END
 649        ret
 650ENDPROC(crypto_aegis128_aesni_dec)
 651
 652/*
 653 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 654 *                                     const void *src, void *dst);
 655 */
 656ENTRY(crypto_aegis128_aesni_dec_tail)
 657        FRAME_BEGIN
 658
 659        /* load the state: */
 660        movdqu 0x00(STATEP), STATE0
 661        movdqu 0x10(STATEP), STATE1
 662        movdqu 0x20(STATEP), STATE2
 663        movdqu 0x30(STATEP), STATE3
 664        movdqu 0x40(STATEP), STATE4
 665
 666        /* decrypt message: */
 667        call __load_partial
 668
 669        pxor STATE1, MSG
 670        pxor STATE4, MSG
 671        movdqa STATE2, T1
 672        pand STATE3, T1
 673        pxor T1, MSG
 674
 675        movdqa MSG, T0
 676        call __store_partial
 677
 678        /* mask with byte count: */
 679        movq LEN, T0
 680        punpcklbw T0, T0
 681        punpcklbw T0, T0
 682        punpcklbw T0, T0
 683        punpcklbw T0, T0
 684        movdqa .Laegis128_counter, T1
 685        pcmpgtb T1, T0
 686        pand T0, MSG
 687
 688        aegis128_update
 689        pxor MSG, STATE4
 690
 691        /* store the state: */
 692        movdqu STATE4, 0x00(STATEP)
 693        movdqu STATE0, 0x10(STATEP)
 694        movdqu STATE1, 0x20(STATEP)
 695        movdqu STATE2, 0x30(STATEP)
 696        movdqu STATE3, 0x40(STATEP)
 697
 698        FRAME_END
 699        ret
 700ENDPROC(crypto_aegis128_aesni_dec_tail)
 701
 702/*
 703 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 704 *                                  u64 assoclen, u64 cryptlen);
 705 */
 706ENTRY(crypto_aegis128_aesni_final)
 707        FRAME_BEGIN
 708
 709        /* load the state: */
 710        movdqu 0x00(STATEP), STATE0
 711        movdqu 0x10(STATEP), STATE1
 712        movdqu 0x20(STATEP), STATE2
 713        movdqu 0x30(STATEP), STATE3
 714        movdqu 0x40(STATEP), STATE4
 715
 716        /* prepare length block: */
 717        movq %rdx, MSG
 718        movq %rcx, T0
 719        pslldq $8, T0
 720        pxor T0, MSG
 721        psllq $3, MSG /* multiply by 8 (to get bit count) */
 722
 723        pxor STATE3, MSG
 724
 725        /* update state: */
 726        aegis128_update; pxor MSG, STATE4
 727        aegis128_update; pxor MSG, STATE3
 728        aegis128_update; pxor MSG, STATE2
 729        aegis128_update; pxor MSG, STATE1
 730        aegis128_update; pxor MSG, STATE0
 731        aegis128_update; pxor MSG, STATE4
 732        aegis128_update; pxor MSG, STATE3
 733
 734        /* xor tag: */
 735        movdqu (%rsi), MSG
 736
 737        pxor STATE0, MSG
 738        pxor STATE1, MSG
 739        pxor STATE2, MSG
 740        pxor STATE3, MSG
 741        pxor STATE4, MSG
 742
 743        movdqu MSG, (%rsi)
 744
 745        FRAME_END
 746        ret
 747ENDPROC(crypto_aegis128_aesni_final)
 748