/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km      0
#define kr      (16*4)
#define rr      ((16*4)+16)
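
/*
 * The offsets above are assumed to mirror the C-side key structure, i.e. a
 * layout like struct cast5_ctx in include/crypto/cast5.h:
 *
 *	struct cast5_ctx {
 *		u32 Km[16];	// masking subkeys   -> km, offset 0
 *		u8  Kr[16];	// rotation subkeys  -> kr, offset 16*4
 *		int rr;		// reduced rounds    -> rr, offset (16*4)+16
 *	};
 */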

/* s-boxes */
#define s1      cast_s1
#define s2      cast_s2
#define s3      cast_s3
#define s4      cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d

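/*
 * lookup_32bit: do the four 8-bit s-box lookups for the 32-bit word in the
 * low half of the general-purpose register 'src', folding s1..s4 into
 * 'dst' with the round-type specific op1/op2/op3 (mix of xor/sub/add).
 * 'interleave_op' lets the caller slip an extra instruction between the two
 * lookup pairs; with shr_next the register is shifted down so a second call
 * can process the upper 32-bit word.
 */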
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        shrq $16,       src;                     \
        movl            s1(, RID1, 4), dst ## d; \
        op1             s2(, RID2, 4), dst ## d; \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        interleave_op(il_reg);                   \
        op2             s3(, RID1, 4), dst ## d; \
        op3             s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
        shrq $16,       reg;

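/*
 * F_head computes I = (Km op0 D) rotated left by the current round's Kr
 * (RKRF is the left-shift count, RKRR = 32 - RKRF the matching right
 * shift) and moves the two 64-bit lanes into the GPRs gi1/gi2 for the
 * table lookups.  F_tail runs lookup_32bit over the four 32-bit words and
 * packs the four results back into the xmm register x.
 */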
#define F_head(a, x, gi1, gi2, op0) \
        op0     a,      RKM,  x;                 \
        vpslld  RKRF,   x,    RTMP;              \
        vpsrld  RKRR,   x,    x;                 \
        vpor    RTMP,   x,    x;                 \
        \
        vmovq           x,    gi1;               \
        vpextrq $1,     x,    gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
        lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
        lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
        \
        lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS2;                                      \
        orq             RFS1, RFS2;                                \
        lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS1;                                      \
        orq             RFS1, RFS3;                                \
        \
        vmovq           RFS2, x;                                   \
        vpinsrq $1,     RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
        F_head(b1, RX, RGI1, RGI2, op0);              \
        F_head(b2, RX, RGI3, RGI4, op0);              \
        \
        F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
        F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
        \
        vpxor           a1, RX,   a1;                 \
        vpxor           a2, RTMP, a2;

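/*
 * The three CAST-128 round function types (RFC 2144), applied to two
 * register pairs at a time:
 *	f1: I = ((Km + D) <<< Kr),  f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id]
 *	f2: I = ((Km ^ D) <<< Kr),  f = ((S1[Ia] - S2[Ib]) + S3[Ic]) ^ S4[Id]
 *	f3: I = ((Km - D) <<< Kr),  f = ((S1[Ia] + S2[Ib]) ^ S3[Ic]) - S4[Id]
 */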
#define F1_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define subround(a1, b1, a2, b2, f) \
        F ## f ## _2(a1, b1, a2, b2);

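/*
 * One cipher round over all 16 blocks: broadcast the 32-bit masking subkey
 * Km[n], pull the 5-bit rotation amount for this round from the low byte
 * of RKR (RKRF) together with its complement 32 - Kr (RKRR), then shift
 * RKR down a byte so the next round sees its own rotation value.
 */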
#define round(l, r, n, f) \
        vbroadcastss    (km+(4*n))(CTX), RKM;        \
        vpand           R1ST,            RKR,  RKRF; \
        vpsubq          RKRF,            R32,  RKRR; \
        vpsrldq $1,     RKR,             RKR;        \
        subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
        subround(l ## 3, r ## 3, l ## 4, r ## 4, f);

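/*
 * RKR caches all 16 one-byte rotation subkeys kr[0..15].  The extra 16
 * added to each rotation (mod 32) compensates for lookup_32bit fetching
 * the s-box index bytes in 1,0,3,2 order (via the bh/bl sub-registers)
 * rather than the 3,2,1,0 order of the specification.  dec_preload_rkr()
 * additionally reverses the byte order so decryption can consume the
 * rotations in descending round order.
 */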
#define enc_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR;

#define dec_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR; \
        vpshufb         .Lbswap128_mask,          RKR, RKR;

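/*
 * inpack_blocks byte-swaps the big-endian 32-bit words of the input and
 * transposes the register pair so that x0 ends up holding the four left
 * halves and x1 the four right halves of four consecutive blocks;
 * outunpack_blocks is the inverse transform.
 */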
#define transpose_2x4(x0, x1, t0, t1) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t1; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1;

#define inpack_blocks(x0, x1, t0, t1, rmask) \
        vpshufb rmask,  x0,     x0; \
        vpshufb rmask,  x1,     x1; \
        \
        transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
        transpose_2x4(x0, x1, t0, t1) \
        \
        vpshufb rmask,  x0, x0;           \
        vpshufb rmask,  x1, x1;

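/*
 * .Lbswap_mask byte-swaps each 32-bit word, .Lbswap128_mask reverses a
 * whole 16-byte register, and .Lbswap_iv_mask byte-reverses the low
 * 64-bit IV into both halves.  .L16_mask feeds the rotation fixup in
 * *_preload_rkr(), .L32_mask supplies the 32 used to derive the right
 * shift count, and .Lfirst_mask (0x1f) isolates the 5-bit rotation amount.
 */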
.section        .rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section        .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
        .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section        .rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
        .byte 16, 16, 16, 16
.section        .rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
        .byte 32, 0, 0, 0
.section        .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
        .byte 0x1f, 0, 0, 0

.text

.align 16
SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
        /* input:
         *      %rdi: ctx
         *      RL1: blocks 1 and 2
         *      RR1: blocks 3 and 4
         *      RL2: blocks 5 and 6
         *      RR2: blocks 7 and 8
         *      RL3: blocks 9 and 10
         *      RR3: blocks 11 and 12
         *      RL4: blocks 13 and 14
         *      RR4: blocks 15 and 16
         * output:
         *      RL1: encrypted blocks 1 and 2
         *      RR1: encrypted blocks 3 and 4
         *      RL2: encrypted blocks 5 and 6
         *      RR2: encrypted blocks 7 and 8
         *      RL3: encrypted blocks 9 and 10
         *      RR3: encrypted blocks 11 and 12
         *      RL4: encrypted blocks 13 and 14
         *      RR4: encrypted blocks 15 and 16
         */

        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        enc_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

        round(RL, RR, 0, 1);
        round(RR, RL, 1, 2);
        round(RL, RR, 2, 3);
        round(RR, RL, 3, 1);
        round(RL, RR, 4, 2);
        round(RR, RL, 5, 3);
        round(RL, RR, 6, 1);
        round(RR, RL, 7, 2);
        round(RL, RR, 8, 3);
        round(RR, RL, 9, 1);
        round(RL, RR, 10, 2);
        round(RR, RL, 11, 3);

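        /*
         * rr is nonzero for the reduced 12-round variant used with short
         * (<= 80 bit) keys; in that case rounds 12..15 are skipped.
         */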
        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_enc;

        round(RL, RR, 12, 1);
        round(RR, RL, 13, 2);
        round(RL, RR, 14, 3);
        round(RR, RL, 15, 1);

.L__skip_enc:
        popq %rbx;
        popq %r15;

        vmovdqa .Lbswap_mask, RKM;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;
SYM_FUNC_END(__cast5_enc_blk16)

.align 16
SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
        /* input:
         *      %rdi: ctx
         *      RL1: encrypted blocks 1 and 2
         *      RR1: encrypted blocks 3 and 4
         *      RL2: encrypted blocks 5 and 6
         *      RR2: encrypted blocks 7 and 8
         *      RL3: encrypted blocks 9 and 10
         *      RR3: encrypted blocks 11 and 12
         *      RL4: encrypted blocks 13 and 14
         *      RR4: encrypted blocks 15 and 16
         * output:
         *      RL1: decrypted blocks 1 and 2
         *      RR1: decrypted blocks 3 and 4
         *      RL2: decrypted blocks 5 and 6
         *      RR2: decrypted blocks 7 and 8
         *      RL3: decrypted blocks 9 and 10
         *      RR3: decrypted blocks 11 and 12
         *      RL4: decrypted blocks 13 and 14
         *      RR4: decrypted blocks 15 and 16
         */

        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        dec_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_dec;

        round(RL, RR, 15, 1);
        round(RR, RL, 14, 3);
        round(RL, RR, 13, 2);
        round(RR, RL, 12, 1);

.L__dec_tail:
        round(RL, RR, 11, 3);
        round(RR, RL, 10, 2);
        round(RL, RR, 9, 1);
        round(RR, RL, 8, 3);
        round(RL, RR, 7, 2);
        round(RR, RL, 6, 1);
        round(RL, RR, 5, 3);
        round(RR, RL, 4, 2);
        round(RL, RR, 3, 1);
        round(RR, RL, 2, 3);
        round(RL, RR, 1, 2);
        round(RR, RL, 0, 1);

        vmovdqa .Lbswap_mask, RKM;
        popq %rbx;
        popq %r15;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;

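/*
 * Reduced 12-round decryption: rounds 15..12 are skipped, so also drop
 * their four rotation bytes from RKR before joining the common tail.
 */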
.L__skip_dec:
        vpsrldq $4, RKR, RKR;
        jmp .L__dec_tail;
SYM_FUNC_END(__cast5_dec_blk16)

SYM_FUNC_START(cast5_ecb_enc_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_enc_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
SYM_FUNC_END(cast5_ecb_enc_16way)

SYM_FUNC_START(cast5_ecb_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_dec_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
SYM_FUNC_END(cast5_ecb_dec_16way)

SYM_FUNC_START(cast5_cbc_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vmovdqu (0*16)(%rdx), RL1;
        vmovdqu (1*16)(%rdx), RR1;
        vmovdqu (2*16)(%rdx), RL2;
        vmovdqu (3*16)(%rdx), RR2;
        vmovdqu (4*16)(%rdx), RL3;
        vmovdqu (5*16)(%rdx), RR3;
        vmovdqu (6*16)(%rdx), RL4;
        vmovdqu (7*16)(%rdx), RR4;

        call __cast5_dec_blk16;

        /* xor with src */
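        /*
         * Move the first 8 ciphertext bytes into the high qword of RX (the
         * low qword becomes zero), so only block 2 is xored with C1 here;
         * the chaining xor of the IV into block 1 is left to the caller.
         */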
        vmovq (%r12), RX;
        vpshufd $0x4f, RX, RX;
        vpxor RX, RR1, RR1;
        vpxor 0*16+8(%r12), RL1, RL1;
        vpxor 1*16+8(%r12), RR2, RR2;
        vpxor 2*16+8(%r12), RL2, RL2;
        vpxor 3*16+8(%r12), RR3, RR3;
        vpxor 4*16+8(%r12), RL3, RL3;
        vpxor 5*16+8(%r12), RR4, RR4;
        vpxor 6*16+8(%r12), RL4, RL4;

        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
SYM_FUNC_END(cast5_cbc_dec_16way)

SYM_FUNC_START(cast5_ctr_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (big endian, 64bit)
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vpcmpeqd RTMP, RTMP, RTMP;
        vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

        vpcmpeqd RKR, RKR, RKR;
        vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
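        /*
         * RTMP and RKR are per-qword counter increments: subtracting RTMP
         * bumps only the low counter by one, subtracting RKR bumps both
         * 64-bit counters by two.
         */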
        vmovdqa .Lbswap_iv_mask, R1ST;
        vmovdqa .Lbswap128_mask, RKM;

        /* load IV and byteswap */
        vmovq (%rcx), RX;
        vpshufb R1ST, RX, RX;

        /* construct IVs */
        vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
        vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

        /* store last IV */
        vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
        vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
        vmovq RX, (%rcx);

        call __cast5_enc_blk16;

        /* dst = src ^ iv */
        vpxor (0*16)(%r12), RR1, RR1;
        vpxor (1*16)(%r12), RL1, RL1;
        vpxor (2*16)(%r12), RR2, RR2;
        vpxor (3*16)(%r12), RL2, RL2;
        vpxor (4*16)(%r12), RR3, RR3;
        vpxor (5*16)(%r12), RL3, RL3;
        vpxor (6*16)(%r12), RR4, RR4;
        vpxor (7*16)(%r12), RL4, RL4;
        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
SYM_FUNC_END(cast5_ctr_16way)