/* linux/arch/x86/crypto/morus640-sse2-asm.S */
/*
 * SSE2 implementation of MORUS-640
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
  11
  12#include <linux/linkage.h>
  13#include <asm/frame.h>
  14
  15#define SHUFFLE_MASK(i0, i1, i2, i3) \
  16        (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
  17
  18#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
  19#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
  20#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
  21
  22#define STATE0  %xmm0
  23#define STATE1  %xmm1
  24#define STATE2  %xmm2
  25#define STATE3  %xmm3
  26#define STATE4  %xmm4
  27#define KEY     %xmm5
  28#define MSG     %xmm5
  29#define T0      %xmm6
  30#define T1      %xmm7
  31
  32.section .rodata.cst16.morus640_const, "aM", @progbits, 32
  33.align 16
  34.Lmorus640_const_0:
  35        .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
  36        .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
  37.Lmorus640_const_1:
  38        .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
  39        .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
  40
  41.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
  42.align 16
  43.Lmorus640_counter:
  44        .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  45        .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  46
  47.text
  48
  49.macro morus640_round s0, s1, s2, s3, s4, b, w
  50        movdqa \s1, T0
  51        pand \s2, T0
  52        pxor T0, \s0
  53        pxor \s3, \s0
  54        movdqa \s0, T0
  55        pslld $\b, T0
  56        psrld $(32 - \b), \s0
  57        pxor T0, \s0
  58        pshufd $\w, \s3, \s3
  59.endm
  60
  61/*
  62 * __morus640_update: internal ABI
  63 * input:
  64 *   STATE[0-4] - input state
  65 *   MSG        - message block
  66 * output:
  67 *   STATE[0-4] - output state
  68 * changed:
  69 *   T0
  70 */
  71__morus640_update:
  72        morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
  73        pxor MSG, STATE1
  74        morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
  75        pxor MSG, STATE2
  76        morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
  77        pxor MSG, STATE3
  78        morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
  79        pxor MSG, STATE4
  80        morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
  81        ret
  82ENDPROC(__morus640_update)
  83
  84
  85/*
  86 * __morus640_update_zero: internal ABI
  87 * input:
  88 *   STATE[0-4] - input state
  89 * output:
  90 *   STATE[0-4] - output state
  91 * changed:
  92 *   T0
  93 */
  94__morus640_update_zero:
  95        morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
  96        morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
  97        morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
  98        morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
  99        morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
 100        ret
 101ENDPROC(__morus640_update_zero)
 102
 103/*
 104 * __load_partial: internal ABI
 105 * input:
 106 *   %rsi - src
 107 *   %rcx - bytes
 108 * output:
 109 *   MSG  - message block
 110 * changed:
 111 *   T0
 112 *   %r8
 113 *   %r9
 114 */
 115__load_partial:
 116        xor %r9d, %r9d
 117        pxor MSG, MSG
 118
 119        mov %rcx, %r8
 120        and $0x1, %r8
 121        jz .Lld_partial_1
 122
 123        mov %rcx, %r8
 124        and $0x1E, %r8
 125        add %rsi, %r8
 126        mov (%r8), %r9b
 127
 128.Lld_partial_1:
 129        mov %rcx, %r8
 130        and $0x2, %r8
 131        jz .Lld_partial_2
 132
 133        mov %rcx, %r8
 134        and $0x1C, %r8
 135        add %rsi, %r8
 136        shl $16, %r9
 137        mov (%r8), %r9w
 138
 139.Lld_partial_2:
 140        mov %rcx, %r8
 141        and $0x4, %r8
 142        jz .Lld_partial_4
 143
 144        mov %rcx, %r8
 145        and $0x18, %r8
 146        add %rsi, %r8
 147        shl $32, %r9
 148        mov (%r8), %r8d
 149        xor %r8, %r9
 150
 151.Lld_partial_4:
 152        movq %r9, MSG
 153
 154        mov %rcx, %r8
 155        and $0x8, %r8
 156        jz .Lld_partial_8
 157
 158        mov %rcx, %r8
 159        and $0x10, %r8
 160        add %rsi, %r8
 161        pslldq $8, MSG
 162        movq (%r8), T0
 163        pxor T0, MSG
 164
 165.Lld_partial_8:
 166        ret
 167ENDPROC(__load_partial)
 168
 169/*
 170 * __store_partial: internal ABI
 171 * input:
 172 *   %rdx - dst
 173 *   %rcx - bytes
 174 * output:
 175 *   T0   - message block
 176 * changed:
 177 *   %r8
 178 *   %r9
 179 *   %r10
 180 */
 181__store_partial:
 182        mov %rcx, %r8
 183        mov %rdx, %r9
 184
 185        movq T0, %r10
 186
 187        cmp $8, %r8
 188        jl .Lst_partial_8
 189
 190        mov %r10, (%r9)
 191        psrldq $8, T0
 192        movq T0, %r10
 193
 194        sub $8, %r8
 195        add $8, %r9
 196
 197.Lst_partial_8:
 198        cmp $4, %r8
 199        jl .Lst_partial_4
 200
 201        mov %r10d, (%r9)
 202        shr $32, %r10
 203
 204        sub $4, %r8
 205        add $4, %r9
 206
 207.Lst_partial_4:
 208        cmp $2, %r8
 209        jl .Lst_partial_2
 210
 211        mov %r10w, (%r9)
 212        shr $16, %r10
 213
 214        sub $2, %r8
 215        add $2, %r9
 216
 217.Lst_partial_2:
 218        cmp $1, %r8
 219        jl .Lst_partial_1
 220
 221        mov %r10b, (%r9)
 222
 223.Lst_partial_1:
 224        ret
 225ENDPROC(__store_partial)
 226
 227/*
 228 * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
 229 */
 230ENTRY(crypto_morus640_sse2_init)
 231        FRAME_BEGIN
 232
 233        /* load IV: */
 234        movdqu (%rdx), STATE0
 235        /* load key: */
 236        movdqu (%rsi), KEY
 237        movdqa KEY, STATE1
 238        /* load all ones: */
 239        pcmpeqd STATE2, STATE2
 240        /* load the constants: */
 241        movdqa .Lmorus640_const_0, STATE3
 242        movdqa .Lmorus640_const_1, STATE4
 243
 244        /* update 16 times with zero: */
 245        call __morus640_update_zero
 246        call __morus640_update_zero
 247        call __morus640_update_zero
 248        call __morus640_update_zero
 249        call __morus640_update_zero
 250        call __morus640_update_zero
 251        call __morus640_update_zero
 252        call __morus640_update_zero
 253        call __morus640_update_zero
 254        call __morus640_update_zero
 255        call __morus640_update_zero
 256        call __morus640_update_zero
 257        call __morus640_update_zero
 258        call __morus640_update_zero
 259        call __morus640_update_zero
 260        call __morus640_update_zero
 261
 262        /* xor-in the key again after updates: */
 263        pxor KEY, STATE1
 264
 265        /* store the state: */
 266        movdqu STATE0, (0 * 16)(%rdi)
 267        movdqu STATE1, (1 * 16)(%rdi)
 268        movdqu STATE2, (2 * 16)(%rdi)
 269        movdqu STATE3, (3 * 16)(%rdi)
 270        movdqu STATE4, (4 * 16)(%rdi)
 271
 272        FRAME_END
 273        ret
 274ENDPROC(crypto_morus640_sse2_init)
 275
 276/*
 277 * void crypto_morus640_sse2_ad(void *state, const void *data,
 278 *                              unsigned int length);
 279 */
 280ENTRY(crypto_morus640_sse2_ad)
 281        FRAME_BEGIN
 282
 283        cmp $16, %rdx
 284        jb .Lad_out
 285
 286        /* load the state: */
 287        movdqu (0 * 16)(%rdi), STATE0
 288        movdqu (1 * 16)(%rdi), STATE1
 289        movdqu (2 * 16)(%rdi), STATE2
 290        movdqu (3 * 16)(%rdi), STATE3
 291        movdqu (4 * 16)(%rdi), STATE4
 292
 293        mov %rsi, %r8
 294        and $0xF, %r8
 295        jnz .Lad_u_loop
 296
 297.align 4
 298.Lad_a_loop:
 299        movdqa (%rsi), MSG
 300        call __morus640_update
 301        sub $16, %rdx
 302        add $16, %rsi
 303        cmp $16, %rdx
 304        jge .Lad_a_loop
 305
 306        jmp .Lad_cont
 307.align 4
 308.Lad_u_loop:
 309        movdqu (%rsi), MSG
 310        call __morus640_update
 311        sub $16, %rdx
 312        add $16, %rsi
 313        cmp $16, %rdx
 314        jge .Lad_u_loop
 315
 316.Lad_cont:
 317        /* store the state: */
 318        movdqu STATE0, (0 * 16)(%rdi)
 319        movdqu STATE1, (1 * 16)(%rdi)
 320        movdqu STATE2, (2 * 16)(%rdi)
 321        movdqu STATE3, (3 * 16)(%rdi)
 322        movdqu STATE4, (4 * 16)(%rdi)
 323
 324.Lad_out:
 325        FRAME_END
 326        ret
 327ENDPROC(crypto_morus640_sse2_ad)
 328
 329/*
 330 * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
 331 *                               unsigned int length);
 332 */
 333ENTRY(crypto_morus640_sse2_enc)
 334        FRAME_BEGIN
 335
 336        cmp $16, %rcx
 337        jb .Lenc_out
 338
 339        /* load the state: */
 340        movdqu (0 * 16)(%rdi), STATE0
 341        movdqu (1 * 16)(%rdi), STATE1
 342        movdqu (2 * 16)(%rdi), STATE2
 343        movdqu (3 * 16)(%rdi), STATE3
 344        movdqu (4 * 16)(%rdi), STATE4
 345
 346        mov %rsi, %r8
 347        or  %rdx, %r8
 348        and $0xF, %r8
 349        jnz .Lenc_u_loop
 350
 351.align 4
 352.Lenc_a_loop:
 353        movdqa (%rsi), MSG
 354        movdqa MSG, T0
 355        pxor STATE0, T0
 356        pshufd $MASK3, STATE1, T1
 357        pxor T1, T0
 358        movdqa STATE2, T1
 359        pand STATE3, T1
 360        pxor T1, T0
 361        movdqa T0, (%rdx)
 362
 363        call __morus640_update
 364        sub $16, %rcx
 365        add $16, %rsi
 366        add $16, %rdx
 367        cmp $16, %rcx
 368        jge .Lenc_a_loop
 369
 370        jmp .Lenc_cont
 371.align 4
 372.Lenc_u_loop:
 373        movdqu (%rsi), MSG
 374        movdqa MSG, T0
 375        pxor STATE0, T0
 376        pshufd $MASK3, STATE1, T1
 377        pxor T1, T0
 378        movdqa STATE2, T1
 379        pand STATE3, T1
 380        pxor T1, T0
 381        movdqu T0, (%rdx)
 382
 383        call __morus640_update
 384        sub $16, %rcx
 385        add $16, %rsi
 386        add $16, %rdx
 387        cmp $16, %rcx
 388        jge .Lenc_u_loop
 389
 390.Lenc_cont:
 391        /* store the state: */
 392        movdqu STATE0, (0 * 16)(%rdi)
 393        movdqu STATE1, (1 * 16)(%rdi)
 394        movdqu STATE2, (2 * 16)(%rdi)
 395        movdqu STATE3, (3 * 16)(%rdi)
 396        movdqu STATE4, (4 * 16)(%rdi)
 397
 398.Lenc_out:
 399        FRAME_END
 400        ret
 401ENDPROC(crypto_morus640_sse2_enc)
 402
 403/*
 404 * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
 405 *                                    unsigned int length);
 406 */
 407ENTRY(crypto_morus640_sse2_enc_tail)
 408        FRAME_BEGIN
 409
 410        /* load the state: */
 411        movdqu (0 * 16)(%rdi), STATE0
 412        movdqu (1 * 16)(%rdi), STATE1
 413        movdqu (2 * 16)(%rdi), STATE2
 414        movdqu (3 * 16)(%rdi), STATE3
 415        movdqu (4 * 16)(%rdi), STATE4
 416
 417        /* encrypt message: */
 418        call __load_partial
 419
 420        movdqa MSG, T0
 421        pxor STATE0, T0
 422        pshufd $MASK3, STATE1, T1
 423        pxor T1, T0
 424        movdqa STATE2, T1
 425        pand STATE3, T1
 426        pxor T1, T0
 427
 428        call __store_partial
 429
 430        call __morus640_update
 431
 432        /* store the state: */
 433        movdqu STATE0, (0 * 16)(%rdi)
 434        movdqu STATE1, (1 * 16)(%rdi)
 435        movdqu STATE2, (2 * 16)(%rdi)
 436        movdqu STATE3, (3 * 16)(%rdi)
 437        movdqu STATE4, (4 * 16)(%rdi)
 438
 439        FRAME_END
 440        ret
 441ENDPROC(crypto_morus640_sse2_enc_tail)
 442
 443/*
 444 * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
 445 *                               unsigned int length);
 446 */
 447ENTRY(crypto_morus640_sse2_dec)
 448        FRAME_BEGIN
 449
 450        cmp $16, %rcx
 451        jb .Ldec_out
 452
 453        /* load the state: */
 454        movdqu (0 * 16)(%rdi), STATE0
 455        movdqu (1 * 16)(%rdi), STATE1
 456        movdqu (2 * 16)(%rdi), STATE2
 457        movdqu (3 * 16)(%rdi), STATE3
 458        movdqu (4 * 16)(%rdi), STATE4
 459
 460        mov %rsi, %r8
 461        or  %rdx, %r8
 462        and $0xF, %r8
 463        jnz .Ldec_u_loop
 464
 465.align 4
 466.Ldec_a_loop:
 467        movdqa (%rsi), MSG
 468        pxor STATE0, MSG
 469        pshufd $MASK3, STATE1, T0
 470        pxor T0, MSG
 471        movdqa STATE2, T0
 472        pand STATE3, T0
 473        pxor T0, MSG
 474        movdqa MSG, (%rdx)
 475
 476        call __morus640_update
 477        sub $16, %rcx
 478        add $16, %rsi
 479        add $16, %rdx
 480        cmp $16, %rcx
 481        jge .Ldec_a_loop
 482
 483        jmp .Ldec_cont
 484.align 4
 485.Ldec_u_loop:
 486        movdqu (%rsi), MSG
 487        pxor STATE0, MSG
 488        pshufd $MASK3, STATE1, T0
 489        pxor T0, MSG
 490        movdqa STATE2, T0
 491        pand STATE3, T0
 492        pxor T0, MSG
 493        movdqu MSG, (%rdx)
 494
 495        call __morus640_update
 496        sub $16, %rcx
 497        add $16, %rsi
 498        add $16, %rdx
 499        cmp $16, %rcx
 500        jge .Ldec_u_loop
 501
 502.Ldec_cont:
 503        /* store the state: */
 504        movdqu STATE0, (0 * 16)(%rdi)
 505        movdqu STATE1, (1 * 16)(%rdi)
 506        movdqu STATE2, (2 * 16)(%rdi)
 507        movdqu STATE3, (3 * 16)(%rdi)
 508        movdqu STATE4, (4 * 16)(%rdi)
 509
 510.Ldec_out:
 511        FRAME_END
 512        ret
 513ENDPROC(crypto_morus640_sse2_dec)
 514
 515/*
 516 * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
 517 *                                    unsigned int length);
 518 */
 519ENTRY(crypto_morus640_sse2_dec_tail)
 520        FRAME_BEGIN
 521
 522        /* load the state: */
 523        movdqu (0 * 16)(%rdi), STATE0
 524        movdqu (1 * 16)(%rdi), STATE1
 525        movdqu (2 * 16)(%rdi), STATE2
 526        movdqu (3 * 16)(%rdi), STATE3
 527        movdqu (4 * 16)(%rdi), STATE4
 528
 529        /* decrypt message: */
 530        call __load_partial
 531
 532        pxor STATE0, MSG
 533        pshufd $MASK3, STATE1, T0
 534        pxor T0, MSG
 535        movdqa STATE2, T0
 536        pand STATE3, T0
 537        pxor T0, MSG
 538        movdqa MSG, T0
 539
 540        call __store_partial
 541
 542        /* mask with byte count: */
 543        movq %rcx, T0
 544        punpcklbw T0, T0
 545        punpcklbw T0, T0
 546        punpcklbw T0, T0
 547        punpcklbw T0, T0
 548        movdqa .Lmorus640_counter, T1
 549        pcmpgtb T1, T0
 550        pand T0, MSG
 551
 552        call __morus640_update
 553
 554        /* store the state: */
 555        movdqu STATE0, (0 * 16)(%rdi)
 556        movdqu STATE1, (1 * 16)(%rdi)
 557        movdqu STATE2, (2 * 16)(%rdi)
 558        movdqu STATE3, (3 * 16)(%rdi)
 559        movdqu STATE4, (4 * 16)(%rdi)
 560
 561        FRAME_END
 562        ret
 563ENDPROC(crypto_morus640_sse2_dec_tail)
 564
 565/*
 566 * void crypto_morus640_sse2_final(void *state, void *tag_xor,
 567 *                                 u64 assoclen, u64 cryptlen);
 568 */
 569ENTRY(crypto_morus640_sse2_final)
 570        FRAME_BEGIN
 571
 572        /* load the state: */
 573        movdqu (0 * 16)(%rdi), STATE0
 574        movdqu (1 * 16)(%rdi), STATE1
 575        movdqu (2 * 16)(%rdi), STATE2
 576        movdqu (3 * 16)(%rdi), STATE3
 577        movdqu (4 * 16)(%rdi), STATE4
 578
 579        /* xor state[0] into state[4]: */
 580        pxor STATE0, STATE4
 581
 582        /* prepare length block: */
 583        movq %rdx, MSG
 584        movq %rcx, T0
 585        pslldq $8, T0
 586        pxor T0, MSG
 587        psllq $3, MSG /* multiply by 8 (to get bit count) */
 588
 589        /* update state: */
 590        call __morus640_update
 591        call __morus640_update
 592        call __morus640_update
 593        call __morus640_update
 594        call __morus640_update
 595        call __morus640_update
 596        call __morus640_update
 597        call __morus640_update
 598        call __morus640_update
 599        call __morus640_update
 600
 601        /* xor tag: */
 602        movdqu (%rsi), MSG
 603
 604        pxor STATE0, MSG
 605        pshufd $MASK3, STATE1, T0
 606        pxor T0, MSG
 607        movdqa STATE2, T0
 608        pand STATE3, T0
 609        pxor T0, MSG
 610
 611        movdqu MSG, (%rsi)
 612
 613        FRAME_END
 614        ret
 615ENDPROC(crypto_morus640_sse2_final)
 616