linux/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km      0
#define kr      (16*4)
#define rr      ((16*4)+16)
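/*
 * These offsets mirror the C-side context (struct cast5_ctx in
 * <crypto/cast5.h>), which is laid out roughly as:
 *
 *	struct cast5_ctx {
 *		u32 Km[16];	masking subkeys     -> offset km = 0
 *		u8 Kr[16];	rotation subkeys    -> offset kr = 16*4
 *		int rr;		reduced-round flag  -> offset rr = 16*4 + 16
 *	};
 */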

/* s-boxes */
#define s1      cast_s1
#define s2      cast_s2
#define s3      cast_s3
#define s4      cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        shrq $16,       src;                     \
        movl            s1(, RID1, 4), dst ## d; \
        op1             s2(, RID2, 4), dst ## d; \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        interleave_op(il_reg);                   \
        op2             s3(, RID1, 4), dst ## d; \
        op3             s4(, RID2, 4), dst ## d;
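
/*
 * lookup_32bit() consumes the 32-bit value in the low half of a 64-bit
 * GPR: it indexes s1..s4 with its four bytes (the low 16 bits first,
 * then the high 16 bits after the shrq) and folds the looked-up words
 * together with the caller-supplied op1..op3.  interleave_op lets the
 * caller slip in the shift that exposes the next 32-bit value (see
 * shr_next below).
 */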

#define dummy(d) /* do nothing */

#define shr_next(reg) \
        shrq $16,       reg;

#define F_head(a, x, gi1, gi2, op0) \
        op0     a,      RKM,  x;                 \
        vpslld  RKRF,   x,    RTMP;              \
        vpsrld  RKRR,   x,    x;                 \
        vpor    RTMP,   x,    x;                 \
        \
        vmovq           x,    gi1;               \
        vpextrq $1,     x,    gi2;

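/*
 * F_head() starts a round on four blocks at once: it combines the input
 * half-block vector with the masking key RKM (op0 selects add/xor/sub
 * per round type), rotates the result left by the per-round amount
 * (RKRF holds Kr, RKRR holds 32 - Kr), and spills the four rotated
 * 32-bit values into two GPRs for the s-box lookups done by F_tail().
 */
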
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
        lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
        lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
        \
        lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS2;                                      \
        orq             RFS1, RFS2;                                \
        lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS1;                                      \
        orq             RFS1, RFS3;                                \
        \
        vmovq           RFS2, x;                                   \
        vpinsrq $1,     RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
        F_head(b1, RX, RGI1, RGI2, op0);              \
        F_head(b2, RX, RGI3, RGI4, op0);              \
        \
        F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
        F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
        \
        vpxor           a1, RX,   a1;                 \
        vpxor           a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

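/*
 * F1/F2/F3 correspond to the three CAST-128 round function types from
 * RFC 2144.  With I = ((Km op0 D) <<< Kr) split into bytes Ia..Id
 * (most significant first):
 *
 *	type 1:  I = ((Km + D) <<< Kr),  f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id]
 *	type 2:  I = ((Km ^ D) <<< Kr),  f = ((S1[Ia] - S2[Ib]) + S3[Ic]) ^ S4[Id]
 *	type 3:  I = ((Km - D) <<< Kr),  f = ((S1[Ia] + S2[Ib]) ^ S3[Ic]) - S4[Id]
 *
 * op0 implements the first combine on four blocks at once
 * (vpaddd/vpxor/vpsubd); op1..op3 are the scalar xorl/subl/addl applied
 * to the s-box values by lookup_32bit().
 */
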
#define subround(a1, b1, a2, b2, f) \
        F ## f ## _2(a1, b1, a2, b2);

#define round(l, r, n, f) \
        vbroadcastss    (km+(4*n))(CTX), RKM;        \
        vpand           R1ST,            RKR,  RKRF; \
        vpsubq          RKRF,            R32,  RKRR; \
        vpsrldq $1,     RKR,             RKR;        \
        subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
        subround(l ## 3, r ## 3, l ## 4, r ## 4, f);

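/*
 * round() prepares the two subkeys for round n and then runs all eight
 * xmm registers through the round function: RKM is Km[n] broadcast to
 * all lanes, RKRF is the low rotation byte of RKR masked to 5 bits
 * (R1ST = 0x1f), RKRR is 32 - RKRF so that vpslld/vpsrld/vpor in
 * F_head() form a 32-bit rotate, and vpsrldq advances RKR to the next
 * round's rotation byte.
 */
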
#define enc_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR;

#define dec_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR; \
        vpshufb         .Lbswap128_mask,          RKR, RKR;

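/*
 * Xoring each Kr byte with 16 adds 16 to that rotation count mod 32;
 * this folds in the extra 16-bit rotation that compensates for
 * lookup_32bit() reading the two low bytes of each 32-bit value before
 * the two high bytes, so the s-boxes still see the byte order required
 * by RFC 2144.  The decryption variant additionally byte-reverses RKR
 * so the rotation amounts are consumed from round 15 down to round 0.
 */
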
#define transpose_2x4(x0, x1, t0, t1) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t1; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1;

#define inpack_blocks(x0, x1, t0, t1, rmask) \
        vpshufb rmask,  x0,     x0; \
        vpshufb rmask,  x1,     x1; \
        \
        transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
        transpose_2x4(x0, x1, t0, t1) \
        \
        vpshufb rmask,  x0, x0;           \
        vpshufb rmask,  x1, x1;

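/*
 * x0 and x1 enter inpack_blocks() holding two 64-bit blocks each, laid
 * out as (L, R) dword pairs.  After the byte swap to host order and the
 * 2x4 transpose, x0 carries the four left halves and x1 the four right
 * halves, which is the layout the round macros above operate on;
 * outunpack_blocks() is the exact inverse.
 */
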
.section        .rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section        .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
        .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section        .rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
        .byte 16, 16, 16, 16
.section        .rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
        .byte 32, 0, 0, 0
.section        .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
        .byte 0x1f, 0, 0, 0

.text

.align 16
__cast5_enc_blk16:
        /* input:
         *      %rdi: ctx
         *      RL1: blocks 1 and 2
         *      RR1: blocks 3 and 4
         *      RL2: blocks 5 and 6
         *      RR2: blocks 7 and 8
         *      RL3: blocks 9 and 10
         *      RR3: blocks 11 and 12
         *      RL4: blocks 13 and 14
         *      RR4: blocks 15 and 16
         * output:
         *      RL1: encrypted blocks 1 and 2
         *      RR1: encrypted blocks 3 and 4
         *      RL2: encrypted blocks 5 and 6
         *      RR2: encrypted blocks 7 and 8
         *      RL3: encrypted blocks 9 and 10
         *      RR3: encrypted blocks 11 and 12
         *      RL4: encrypted blocks 13 and 14
         *      RR4: encrypted blocks 15 and 16
         */

        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        enc_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

        round(RL, RR, 0, 1);
        round(RR, RL, 1, 2);
        round(RL, RR, 2, 3);
        round(RR, RL, 3, 1);
        round(RL, RR, 4, 2);
        round(RR, RL, 5, 3);
        round(RL, RR, 6, 1);
        round(RR, RL, 7, 2);
        round(RL, RR, 8, 3);
        round(RR, RL, 9, 1);
        round(RL, RR, 10, 2);
        round(RR, RL, 11, 3);

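        /*
         * ctx->rr is nonzero for keys of 80 bits or less, which per
         * RFC 2144 use only 12 rounds; skip the last four rounds in
         * that case.
         */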
        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_enc;

        round(RL, RR, 12, 1);
        round(RR, RL, 13, 2);
        round(RL, RR, 14, 3);
        round(RR, RL, 15, 1);

.L__skip_enc:
        popq %rbx;
        popq %r15;

        vmovdqa .Lbswap_mask, RKM;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;
ENDPROC(__cast5_enc_blk16)

.align 16
__cast5_dec_blk16:
        /* input:
         *      %rdi: ctx
         *      RL1: encrypted blocks 1 and 2
         *      RR1: encrypted blocks 3 and 4
         *      RL2: encrypted blocks 5 and 6
         *      RR2: encrypted blocks 7 and 8
         *      RL3: encrypted blocks 9 and 10
         *      RR3: encrypted blocks 11 and 12
         *      RL4: encrypted blocks 13 and 14
         *      RR4: encrypted blocks 15 and 16
         * output:
         *      RL1: decrypted blocks 1 and 2
         *      RR1: decrypted blocks 3 and 4
         *      RL2: decrypted blocks 5 and 6
         *      RR2: decrypted blocks 7 and 8
         *      RL3: decrypted blocks 9 and 10
         *      RR3: decrypted blocks 11 and 12
         *      RL4: decrypted blocks 13 and 14
         *      RR4: decrypted blocks 15 and 16
         */

        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        dec_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_dec;

        round(RL, RR, 15, 1);
        round(RR, RL, 14, 3);
        round(RL, RR, 13, 2);
        round(RR, RL, 12, 1);

.L__dec_tail:
        round(RL, RR, 11, 3);
        round(RR, RL, 10, 2);
        round(RL, RR, 9, 1);
        round(RR, RL, 8, 3);
        round(RL, RR, 7, 2);
        round(RR, RL, 6, 1);
        round(RL, RR, 5, 3);
        round(RR, RL, 4, 2);
        round(RL, RR, 3, 1);
        round(RR, RL, 2, 3);
        round(RL, RR, 1, 2);
        round(RR, RL, 0, 1);

        vmovdqa .Lbswap_mask, RKM;
        popq %rbx;
        popq %r15;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;

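/*
 * 12-round keys skip rounds 15..12, whose rotation bytes sit in the low
 * four bytes of the reversed RKR; drop them so that .L__dec_tail starts
 * with round 11's rotation in place.
 */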
.L__skip_dec:
        vpsrldq $4, RKR, RKR;
        jmp .L__dec_tail;
ENDPROC(__cast5_dec_blk16)

ENTRY(cast5_ecb_enc_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_enc_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_dec_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vmovdqu (0*16)(%rdx), RL1;
        vmovdqu (1*16)(%rdx), RR1;
        vmovdqu (2*16)(%rdx), RL2;
        vmovdqu (3*16)(%rdx), RR2;
        vmovdqu (4*16)(%rdx), RL3;
        vmovdqu (5*16)(%rdx), RR3;
        vmovdqu (6*16)(%rdx), RL4;
        vmovdqu (7*16)(%rdx), RR4;

        call __cast5_dec_blk16;

        /* xor with src */
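        /*
         * Each decrypted block is xored with the ciphertext block that
         * precedes it in the source buffer (hence the +8 byte offsets,
         * and the shuffle that leaves the low 64 bits of RX zero:
         * block 1 has no in-buffer predecessor, so its IV xor is done
         * outside this routine).
         */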
        vmovq (%r12), RX;
        vpshufd $0x4f, RX, RX;
        vpxor RX, RR1, RR1;
        vpxor 0*16+8(%r12), RL1, RL1;
        vpxor 1*16+8(%r12), RR2, RR2;
        vpxor 2*16+8(%r12), RL2, RL2;
        vpxor 3*16+8(%r12), RR3, RR3;
        vpxor 4*16+8(%r12), RL3, RL3;
        vpxor 5*16+8(%r12), RR4, RR4;
        vpxor 6*16+8(%r12), RL4, RL4;

        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
ENDPROC(cast5_cbc_dec_16way)

ENTRY(cast5_ctr_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (big endian, 64bit)
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vpcmpeqd RTMP, RTMP, RTMP;
        vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

        vpcmpeqd RKR, RKR, RKR;
        vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
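
        /*
         * Counters are built by subtracting these negative constants:
         * RTMP bumps only the low lane by one (splitting the IV into
         * two consecutive counters), and each later vpsubq of RKR
         * advances both 64-bit lanes by two, yielding IV0..IV15 for
         * the 16 blocks before the keystream is generated.
         */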
        vmovdqa .Lbswap_iv_mask, R1ST;
        vmovdqa .Lbswap128_mask, RKM;

        /* load IV and byteswap */
        vmovq (%rcx), RX;
        vpshufb R1ST, RX, RX;

        /* construct IVs */
        vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
        vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

        /* store last IV */
        vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
        vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
        vmovq RX, (%rcx);

        call __cast5_enc_blk16;

        /* dst = src ^ iv */
        vpxor (0*16)(%r12), RR1, RR1;
        vpxor (1*16)(%r12), RL1, RL1;
        vpxor (2*16)(%r12), RR2, RR2;
        vpxor (3*16)(%r12), RL2, RL2;
        vpxor (4*16)(%r12), RR3, RR3;
        vpxor (5*16)(%r12), RL3, RL3;
        vpxor (6*16)(%r12), RR4, RR4;
        vpxor (7*16)(%r12), RL4, RL4;
        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
ENDPROC(cast5_ctr_16way)