/*
 * x86_64/AVX2 assembler optimized version of Blowfish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>

.file "blowfish-avx2-asm_64.S"

.data
.align 32

.Lprefetch_mask:
.long 0*64
.long 1*64
.long 2*64
.long 3*64
.long 4*64
.long 5*64
.long 6*64
.long 7*64

.Lbswap32_mask:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f

.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:
        .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.text
/* structure of crypto context */
#define p       0
#define s0      ((16 + 2) * 4)
#define s1      ((16 + 2 + (1 * 256)) * 4)
#define s2      ((16 + 2 + (2 * 256)) * 4)
#define s3      ((16 + 2 + (3 * 256)) * 4)
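/*
 * These offsets are a sketch of the assumed context layout: the generic
 * bf_ctx from <crypto/blowfish.h>, i.e. 18 P-array words followed by four
 * 256-entry s-boxes, all 32-bit:
 *
 *	struct bf_ctx {
 *		u32 p[16 + 2];
 *		u32 s[4 * 256];
 *	};
 */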

/* register macros */
#define CTX     %rdi
#define RIO     %rdx

#define RS0     %rax
#define RS1     %r8
#define RS2     %r9
#define RS3     %r10

#define RLOOP   %r11
#define RLOOPd  %r11d

#define RXr0    %ymm8
#define RXr1    %ymm9
#define RXr2    %ymm10
#define RXr3    %ymm11
#define RXl0    %ymm12
#define RXl1    %ymm13
#define RXl2    %ymm14
#define RXl3    %ymm15

/* temp regs */
#define RT0     %ymm0
#define RT0x    %xmm0
#define RT1     %ymm1
#define RT1x    %xmm1
#define RIDX0   %ymm2
#define RIDX1   %ymm3
#define RIDX1x  %xmm3
#define RIDX2   %ymm4
#define RIDX3   %ymm5

/* vpgatherdd mask and '-1' */
#define RNOT    %ymm6

/* byte mask, (-1 >> 24) */
#define RBYTE   %ymm7

/***********************************************************************
 * 32-way AVX2 blowfish
 ***********************************************************************/
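/*
 * F() below vectorizes the Blowfish round function over eight blocks per
 * ymm register, doing the four s-box lookups with vpgatherdd.  For
 * reference, the scalar equivalent is (a sketch, using the flat s-box
 * layout from the context defines above):
 *
 *	u32 F(const struct bf_ctx *c, u32 x)
 *	{
 *		u32 a = c->s[0 * 256 + (x >> 24)];
 *		u32 b = c->s[1 * 256 + ((x >> 16) & 0xff)];
 *		u32 d = c->s[2 * 256 + ((x >> 8) & 0xff)];
 *		u32 e = c->s[3 * 256 + (x & 0xff)];
 *
 *		return ((a + b) ^ d) + e;
 *	}
 *
 * Note that vpgatherdd clears its mask register to zero on completion,
 * which is why the masks are re-armed with vpcmpeqd between gathers.
 */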
#define F(xl, xr) \
        vpsrld $24, xl, RIDX0; \
        vpsrld $16, xl, RIDX1; \
        vpsrld $8, xl, RIDX2; \
        vpand RBYTE, RIDX1, RIDX1; \
        vpand RBYTE, RIDX2, RIDX2; \
        vpand RBYTE, xl, RIDX3; \
        \
        vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
        vpcmpeqd RNOT, RNOT, RNOT; \
        vpcmpeqd RIDX0, RIDX0, RIDX0; \
        \
        vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
        vpcmpeqd RIDX1, RIDX1, RIDX1; \
        vpaddd RT0, RT1, RT0; \
        \
        vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
        vpxor RT0, RT1, RT0; \
        \
        vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
        vpcmpeqd RNOT, RNOT, RNOT; \
        vpaddd RT0, RT1, RT0; \
        \
        vpxor RT0, xr, xr;

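/*
 * add_roundkey() broadcasts a single 32-bit P-array word to all eight
 * dword lanes and XORs it into all four named vectors, i.e. into the
 * corresponding half of all 32 blocks at once.
 */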
#define add_roundkey(xl, nmem) \
        vpbroadcastd nmem, RT0; \
        vpxor RT0, xl ## 0, xl ## 0; \
        vpxor RT0, xl ## 1, xl ## 1; \
        vpxor RT0, xl ## 2, xl ## 2; \
        vpxor RT0, xl ## 3, xl ## 3;

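/*
 * One round_enc() covers two Feistel rounds for all 32 blocks: P[RLOOP]
 * is mixed into the right halves before F() updates them from the left
 * halves, then P[RLOOP + 1] and F() do the same in the other direction.
 */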
#define round_enc() \
        add_roundkey(RXr, p(CTX,RLOOP,4)); \
        F(RXl0, RXr0); \
        F(RXl1, RXr1); \
        F(RXl2, RXr2); \
        F(RXl3, RXr3); \
        \
        add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
        F(RXr0, RXl0); \
        F(RXr1, RXl1); \
        F(RXr2, RXl2); \
        F(RXr3, RXl3);

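/*
 * round_dec() is the same pair of rounds with the P-array walked
 * downwards: with RLOOP stepping 14, 12, ..., 0, it applies
 * P[RLOOP + 2] to the right halves and P[RLOOP + 1] to the left halves.
 */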
#define round_dec() \
        add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
        F(RXl0, RXr0); \
        F(RXl1, RXr1); \
        F(RXl2, RXr2); \
        F(RXl3, RXr3); \
        \
        add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
        F(RXr0, RXl0); \
        F(RXr1, RXl1); \
        F(RXr2, RXl2); \
        F(RXr3, RXl3);

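/*
 * Set up the loop invariants: RNOT = all-ones (the vpgatherdd mask),
 * RBYTE = 0x000000ff in every dword, and RS0..RS3 = s-box base
 * addresses inside the context.
 */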
#define init_round_constants() \
        vpcmpeqd RNOT, RNOT, RNOT; \
        leaq s0(CTX), RS0; \
        leaq s1(CTX), RS1; \
        leaq s2(CTX), RS2; \
        leaq s3(CTX), RS3; \
        vpsrld $24, RNOT, RBYTE;

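/*
 * 2x2 transpose, per 128-bit lane: after read_block() applies this, the
 * xl registers hold only left block halves and the xr registers only
 * right block halves; write_block() applies it again to interleave the
 * halves back into whole blocks.
 */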
#define transpose_2x2(x0, x1, t0) \
        vpunpckldq x0, x1, t0; \
        vpunpckhdq x0, x1, x1; \
        \
        vpunpcklqdq t0, x1, x0; \
        vpunpckhqdq t0, x1, x1;

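/*
 * Blowfish operates on big-endian 32-bit words, so read_block()
 * byte-swaps every dword on load and then splits the 64-bit blocks into
 * separate left-half and right-half vectors with transpose_2x2().
 */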
#define read_block(xl, xr) \
        vbroadcasti128 .Lbswap32_mask, RT1; \
        \
        vpshufb RT1, xl ## 0, xl ## 0; \
        vpshufb RT1, xr ## 0, xr ## 0; \
        vpshufb RT1, xl ## 1, xl ## 1; \
        vpshufb RT1, xr ## 1, xr ## 1; \
        vpshufb RT1, xl ## 2, xl ## 2; \
        vpshufb RT1, xr ## 2, xr ## 2; \
        vpshufb RT1, xl ## 3, xl ## 3; \
        vpshufb RT1, xr ## 3, xr ## 3; \
        \
        transpose_2x2(xl ## 0, xr ## 0, RT0); \
        transpose_2x2(xl ## 1, xr ## 1, RT0); \
        transpose_2x2(xl ## 2, xr ## 2, RT0); \
        transpose_2x2(xl ## 3, xr ## 3, RT0);

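/*
 * write_block() is the inverse: re-interleave the halves and swap the
 * words back to big endian.
 */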
#define write_block(xl, xr) \
        vbroadcasti128 .Lbswap32_mask, RT1; \
        \
        transpose_2x2(xl ## 0, xr ## 0, RT0); \
        transpose_2x2(xl ## 1, xr ## 1, RT0); \
        transpose_2x2(xl ## 2, xr ## 2, RT0); \
        transpose_2x2(xl ## 3, xr ## 3, RT0); \
        \
        vpshufb RT1, xl ## 0, xl ## 0; \
        vpshufb RT1, xr ## 0, xr ## 0; \
        vpshufb RT1, xl ## 1, xl ## 1; \
        vpshufb RT1, xr ## 1, xr ## 1; \
        vpshufb RT1, xl ## 2, xl ## 2; \
        vpshufb RT1, xr ## 2, xr ## 2; \
        vpshufb RT1, xl ## 3, xl ## 3; \
        vpshufb RT1, xr ## 3, xr ## 3;

.align 8
__blowfish_enc_blk32:
        /* input:
         *      %rdi: ctx, CTX
         *      RXl0..3, RXr0..3: plaintext
         * output:
         *      RXl0..3, RXr0..3: ciphertext (RXl <=> RXr swapped)
         */
        init_round_constants();

        read_block(RXl, RXr);

        movl $1, RLOOPd;
        add_roundkey(RXl, p+4*(0)(CTX));

.align 4
.L__enc_loop:
        round_enc();

        leal 2(RLOOPd), RLOOPd;
        cmpl $17, RLOOPd;
        jne .L__enc_loop;

        add_roundkey(RXr, p+4*(17)(CTX));

        write_block(RXl, RXr);

        ret;
ENDPROC(__blowfish_enc_blk32)

.align 8
__blowfish_dec_blk32:
        /* input:
         *      %rdi: ctx, CTX
         *      RXl0..3, RXr0..3: ciphertext
         * output:
         *      RXl0..3, RXr0..3: plaintext (RXl <=> RXr swapped)
         */
        init_round_constants();

        read_block(RXl, RXr);

        movl $14, RLOOPd;
        add_roundkey(RXl, p+4*(17)(CTX));

.align 4
.L__dec_loop:
        round_dec();

        addl $-2, RLOOPd;
        jns .L__dec_loop;

        add_roundkey(RXr, p+4*(0)(CTX));

        write_block(RXl, RXr);

        ret;
ENDPROC(__blowfish_dec_blk32)

ENTRY(blowfish_ecb_enc_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        vmovdqu 0*32(%rdx), RXl0;
        vmovdqu 1*32(%rdx), RXr0;
        vmovdqu 2*32(%rdx), RXl1;
        vmovdqu 3*32(%rdx), RXr1;
        vmovdqu 4*32(%rdx), RXl2;
        vmovdqu 5*32(%rdx), RXr2;
        vmovdqu 6*32(%rdx), RXl3;
        vmovdqu 7*32(%rdx), RXr3;

        call __blowfish_enc_blk32;

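        /* the cipher leaves the halves swapped, so store Xr before Xl */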
        vmovdqu RXr0, 0*32(%rsi);
        vmovdqu RXl0, 1*32(%rsi);
        vmovdqu RXr1, 2*32(%rsi);
        vmovdqu RXl1, 3*32(%rsi);
        vmovdqu RXr2, 4*32(%rsi);
        vmovdqu RXl2, 5*32(%rsi);
        vmovdqu RXr3, 6*32(%rsi);
        vmovdqu RXl3, 7*32(%rsi);

        vzeroupper;

        ret;
ENDPROC(blowfish_ecb_enc_32way)

ENTRY(blowfish_ecb_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        vmovdqu 0*32(%rdx), RXl0;
        vmovdqu 1*32(%rdx), RXr0;
        vmovdqu 2*32(%rdx), RXl1;
        vmovdqu 3*32(%rdx), RXr1;
        vmovdqu 4*32(%rdx), RXl2;
        vmovdqu 5*32(%rdx), RXr2;
        vmovdqu 6*32(%rdx), RXl3;
        vmovdqu 7*32(%rdx), RXr3;

        call __blowfish_dec_blk32;

        vmovdqu RXr0, 0*32(%rsi);
        vmovdqu RXl0, 1*32(%rsi);
        vmovdqu RXr1, 2*32(%rsi);
        vmovdqu RXl1, 3*32(%rsi);
        vmovdqu RXr2, 4*32(%rsi);
        vmovdqu RXl2, 5*32(%rsi);
        vmovdqu RXr3, 6*32(%rsi);
        vmovdqu RXl3, 7*32(%rsi);

        vzeroupper;

        ret;
ENDPROC(blowfish_ecb_dec_32way)

ENTRY(blowfish_cbc_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        vmovdqu 0*32(%rdx), RXl0;
        vmovdqu 1*32(%rdx), RXr0;
        vmovdqu 2*32(%rdx), RXl1;
        vmovdqu 3*32(%rdx), RXr1;
        vmovdqu 4*32(%rdx), RXl2;
        vmovdqu 5*32(%rdx), RXr2;
        vmovdqu 6*32(%rdx), RXl3;
        vmovdqu 7*32(%rdx), RXr3;

        call __blowfish_dec_blk32;

        /* xor with src */
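        /*
         * CBC decryption: plaintext block i is XORed with ciphertext
         * block i - 1, still available in src.  Block 0 is XORed with
         * zero here; presumably the caller XORs it with the IV, as in
         * the other AVX-accelerated cipher glue code.
         */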
        vmovq (%rdx), RT0x;
        vpshufd $0x4f, RT0x, RT0x;
        vinserti128 $1, 8(%rdx), RT0, RT0;
        vpxor RT0, RXr0, RXr0;
        vpxor 0*32+24(%rdx), RXl0, RXl0;
        vpxor 1*32+24(%rdx), RXr1, RXr1;
        vpxor 2*32+24(%rdx), RXl1, RXl1;
        vpxor 3*32+24(%rdx), RXr2, RXr2;
        vpxor 4*32+24(%rdx), RXl2, RXl2;
        vpxor 5*32+24(%rdx), RXr3, RXr3;
        vpxor 6*32+24(%rdx), RXl3, RXl3;

        vmovdqu RXr0, (0*32)(%rsi);
        vmovdqu RXl0, (1*32)(%rsi);
        vmovdqu RXr1, (2*32)(%rsi);
        vmovdqu RXl1, (3*32)(%rsi);
        vmovdqu RXr2, (4*32)(%rsi);
        vmovdqu RXl2, (5*32)(%rsi);
        vmovdqu RXr3, (6*32)(%rsi);
        vmovdqu RXl3, (7*32)(%rsi);

        vzeroupper;

        ret;
ENDPROC(blowfish_cbc_dec_32way)

ENTRY(blowfish_ctr_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (big endian, 64bit)
         */

        vzeroupper;

        vpcmpeqd RT0, RT0, RT0;
        vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */

        vpcmpeqd RT1x, RT1x, RT1x;
        vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
        vpxor RIDX0, RIDX0, RIDX0;
        vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */

        vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */

        vpcmpeqd RT1, RT1, RT1;
        vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
        vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */

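        /*
         * Counter construction: RT1 will hold the little-endian counter
         * replicated in all four qwords.  Subtracting RT0 = {-1, 0, -3, -2}
         * yields {ctr+1, ctr+0, ctr+3, ctr+2}; the per-lane 16-byte
         * .Lbswap128_mask shuffle then swaps qword order while converting
         * back to big endian, so each ymm ends up holding four consecutive
         * counter blocks in memory order.  RIDX2 = {-4, -4, -4, -4}
         * advances RT1 by four blocks per step.
         */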
        vbroadcasti128 .Lbswap_iv_mask, RIDX0;
        vbroadcasti128 .Lbswap128_mask, RIDX1;

        /* load IV and byteswap */
        vmovq (%rcx), RT1x;
        vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
        vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */

        /* construct IVs */
        vpsubq RT0, RT1, RT1;           /* a: le1, b: le0, c: le3, d: le2 */
        vpshufb RIDX1, RT1, RXl0;       /* a: be0, b: be1, c: be2, d: be3 */
        vpsubq RIDX2, RT1, RT1;         /* le5, le4, le7, le6 */
        vpshufb RIDX1, RT1, RXr0;       /* be4, be5, be6, be7 */
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXl1;
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXr1;
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXl2;
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXr2;
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXl3;
        vpsubq RIDX2, RT1, RT1;
        vpshufb RIDX1, RT1, RXr3;

        /* store last IV */
        vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
        vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
        vmovq RT1x, (%rcx);

        call __blowfish_enc_blk32;

        /* dst = src ^ keystream (the encrypted counter blocks) */
        vpxor 0*32(%rdx), RXr0, RXr0;
        vpxor 1*32(%rdx), RXl0, RXl0;
        vpxor 2*32(%rdx), RXr1, RXr1;
        vpxor 3*32(%rdx), RXl1, RXl1;
        vpxor 4*32(%rdx), RXr2, RXr2;
        vpxor 5*32(%rdx), RXl2, RXl2;
        vpxor 6*32(%rdx), RXr3, RXr3;
        vpxor 7*32(%rdx), RXl3, RXl3;
        vmovdqu RXr0, (0*32)(%rsi);
        vmovdqu RXl0, (1*32)(%rsi);
        vmovdqu RXr1, (2*32)(%rsi);
        vmovdqu RXl1, (3*32)(%rsi);
        vmovdqu RXr2, (4*32)(%rsi);
        vmovdqu RXl2, (5*32)(%rsi);
        vmovdqu RXr3, (6*32)(%rsi);
        vmovdqu RXl3, (7*32)(%rsi);

        vzeroupper;

        ret;
ENDPROC(blowfish_ctr_32way)