linux/arch/x86/crypto/twofish-avx2-asm_64.S
<<
>>
Prefs
/*
 * x86_64/AVX2 assembler optimized version of Twofish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */
  12
  13#include <linux/linkage.h>
  14#include "glue_helper-asm-avx2.S"
  15
  16.file "twofish-avx2-asm_64.S"
  17
  18.data
  19.align 16
  20
  21.Lvpshufb_mask0:
  22.long 0x80808000
  23.long 0x80808004
  24.long 0x80808008
  25.long 0x8080800c
  26
  27.Lbswap128_mask:
  28        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  29.Lxts_gf128mul_and_shl1_mask_0:
  30        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
  31.Lxts_gf128mul_and_shl1_mask_1:
  32        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
  33
  34.text
  35
  36/* structure of crypto context */
  37#define s0      0
  38#define s1      1024
  39#define s2      2048
  40#define s3      3072
  41#define w       4096
  42#define k       4128
  43
  44/* register macros */
  45#define CTX     %rdi
  46
  47#define RS0     CTX
  48#define RS1     %r8
  49#define RS2     %r9
  50#define RS3     %r10
  51#define RK      %r11
  52#define RW      %rax
  53#define RROUND  %r12
  54#define RROUNDd %r12d
  55
  56#define RA0     %ymm8
  57#define RB0     %ymm9
  58#define RC0     %ymm10
  59#define RD0     %ymm11
  60#define RA1     %ymm12
  61#define RB1     %ymm13
  62#define RC1     %ymm14
  63#define RD1     %ymm15
  64
  65/* temp regs */
  66#define RX0     %ymm0
  67#define RY0     %ymm1
  68#define RX1     %ymm2
  69#define RY1     %ymm3
  70#define RT0     %ymm4
  71#define RIDX    %ymm5
  72
  73#define RX0x    %xmm0
  74#define RY0x    %xmm1
  75#define RX1x    %xmm2
  76#define RY1x    %xmm3
  77#define RT0x    %xmm4
  78
  79/* vpgatherdd mask and '-1' */
  80#define RNOT    %ymm6
  81
  82/* byte mask, (-1 >> 24) */
  83#define RBYTE   %ymm7
  84
  85/**********************************************************************
  86  16-way AVX2 twofish
  87 **********************************************************************/
  88#define init_round_constants() \
  89        vpcmpeqd RNOT, RNOT, RNOT; \
  90        vpsrld $24, RNOT, RBYTE; \
  91        leaq k(CTX), RK; \
  92        leaq w(CTX), RW; \
  93        leaq s1(CTX), RS1; \
  94        leaq s2(CTX), RS2; \
  95        leaq s3(CTX), RS3; \
  96
  97#define g16(ab, rs0, rs1, rs2, rs3, xy) \
  98        vpand RBYTE, ab ## 0, RIDX; \
  99        vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
 100        vpcmpeqd RNOT, RNOT, RNOT; \
 101                \
 102                vpand RBYTE, ab ## 1, RIDX; \
 103                vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
 104                vpcmpeqd RNOT, RNOT, RNOT; \
 105        \
 106        vpsrld $8, ab ## 0, RIDX; \
 107        vpand RBYTE, RIDX, RIDX; \
 108        vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
 109        vpcmpeqd RNOT, RNOT, RNOT; \
 110        vpxor RT0, xy ## 0, xy ## 0; \
 111                \
 112                vpsrld $8, ab ## 1, RIDX; \
 113                vpand RBYTE, RIDX, RIDX; \
 114                vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
 115                vpcmpeqd RNOT, RNOT, RNOT; \
 116                vpxor RT0, xy ## 1, xy ## 1; \
 117        \
 118        vpsrld $16, ab ## 0, RIDX; \
 119        vpand RBYTE, RIDX, RIDX; \
 120        vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
 121        vpcmpeqd RNOT, RNOT, RNOT; \
 122        vpxor RT0, xy ## 0, xy ## 0; \
 123                \
 124                vpsrld $16, ab ## 1, RIDX; \
 125                vpand RBYTE, RIDX, RIDX; \
 126                vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
 127                vpcmpeqd RNOT, RNOT, RNOT; \
 128                vpxor RT0, xy ## 1, xy ## 1; \
 129        \
 130        vpsrld $24, ab ## 0, RIDX; \
 131        vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
 132        vpcmpeqd RNOT, RNOT, RNOT; \
 133        vpxor RT0, xy ## 0, xy ## 0; \
 134                \
 135                vpsrld $24, ab ## 1, RIDX; \
 136                vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
 137                vpcmpeqd RNOT, RNOT, RNOT; \
 138                vpxor RT0, xy ## 1, xy ## 1;
 139
 140#define g1_16(a, x) \
 141        g16(a, RS0, RS1, RS2, RS3, x);
 142
 143#define g2_16(b, y) \
 144        g16(b, RS1, RS2, RS3, RS0, y);
 145
 146#define encrypt_round_end16(a, b, c, d, nk) \
 147        vpaddd RY0, RX0, RX0; \
 148        vpaddd RX0, RY0, RY0; \
 149        vpbroadcastd nk(RK,RROUND,8), RT0; \
 150        vpaddd RT0, RX0, RX0; \
 151        vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
 152        vpaddd RT0, RY0, RY0; \
 153        \
 154        vpxor RY0, d ## 0, d ## 0; \
 155        \
 156        vpxor RX0, c ## 0, c ## 0; \
 157        vpsrld $1, c ## 0, RT0; \
 158        vpslld $31, c ## 0, c ## 0; \
 159        vpor RT0, c ## 0, c ## 0; \
 160        \
 161                vpaddd RY1, RX1, RX1; \
 162                vpaddd RX1, RY1, RY1; \
 163                vpbroadcastd nk(RK,RROUND,8), RT0; \
 164                vpaddd RT0, RX1, RX1; \
 165                vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
 166                vpaddd RT0, RY1, RY1; \
 167                \
 168                vpxor RY1, d ## 1, d ## 1; \
 169                \
 170                vpxor RX1, c ## 1, c ## 1; \
 171                vpsrld $1, c ## 1, RT0; \
 172                vpslld $31, c ## 1, c ## 1; \
 173                vpor RT0, c ## 1, c ## 1; \
 174
 175#define encrypt_round16(a, b, c, d, nk) \
 176        g2_16(b, RY); \
 177        \
 178        vpslld $1, b ## 0, RT0; \
 179        vpsrld $31, b ## 0, b ## 0; \
 180        vpor RT0, b ## 0, b ## 0; \
 181        \
 182                vpslld $1, b ## 1, RT0; \
 183                vpsrld $31, b ## 1, b ## 1; \
 184                vpor RT0, b ## 1, b ## 1; \
 185        \
 186        g1_16(a, RX); \
 187        \
 188        encrypt_round_end16(a, b, c, d, nk);
 189
 190#define encrypt_round_first16(a, b, c, d, nk) \
 191        vpslld $1, d ## 0, RT0; \
 192        vpsrld $31, d ## 0, d ## 0; \
 193        vpor RT0, d ## 0, d ## 0; \
 194        \
 195                vpslld $1, d ## 1, RT0; \
 196                vpsrld $31, d ## 1, d ## 1; \
 197                vpor RT0, d ## 1, d ## 1; \
 198        \
 199        encrypt_round16(a, b, c, d, nk);
 200
 201#define encrypt_round_last16(a, b, c, d, nk) \
 202        g2_16(b, RY); \
 203        \
 204        g1_16(a, RX); \
 205        \
 206        encrypt_round_end16(a, b, c, d, nk);
 207
 208#define decrypt_round_end16(a, b, c, d, nk) \
 209        vpaddd RY0, RX0, RX0; \
 210        vpaddd RX0, RY0, RY0; \
 211        vpbroadcastd nk(RK,RROUND,8), RT0; \
 212        vpaddd RT0, RX0, RX0; \
 213        vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
 214        vpaddd RT0, RY0, RY0; \
 215        \
 216        vpxor RX0, c ## 0, c ## 0; \
 217        \
 218        vpxor RY0, d ## 0, d ## 0; \
 219        vpsrld $1, d ## 0, RT0; \
 220        vpslld $31, d ## 0, d ## 0; \
 221        vpor RT0, d ## 0, d ## 0; \
 222        \
 223                vpaddd RY1, RX1, RX1; \
 224                vpaddd RX1, RY1, RY1; \
 225                vpbroadcastd nk(RK,RROUND,8), RT0; \
 226                vpaddd RT0, RX1, RX1; \
 227                vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
 228                vpaddd RT0, RY1, RY1; \
 229                \
 230                vpxor RX1, c ## 1, c ## 1; \
 231                \
 232                vpxor RY1, d ## 1, d ## 1; \
 233                vpsrld $1, d ## 1, RT0; \
 234                vpslld $31, d ## 1, d ## 1; \
 235                vpor RT0, d ## 1, d ## 1;
 236
 237#define decrypt_round16(a, b, c, d, nk) \
 238        g1_16(a, RX); \
 239        \
 240        vpslld $1, a ## 0, RT0; \
 241        vpsrld $31, a ## 0, a ## 0; \
 242        vpor RT0, a ## 0, a ## 0; \
 243        \
 244                vpslld $1, a ## 1, RT0; \
 245                vpsrld $31, a ## 1, a ## 1; \
 246                vpor RT0, a ## 1, a ## 1; \
 247        \
 248        g2_16(b, RY); \
 249        \
 250        decrypt_round_end16(a, b, c, d, nk);
 251
 252#define decrypt_round_first16(a, b, c, d, nk) \
 253        vpslld $1, c ## 0, RT0; \
 254        vpsrld $31, c ## 0, c ## 0; \
 255        vpor RT0, c ## 0, c ## 0; \
 256        \
 257                vpslld $1, c ## 1, RT0; \
 258                vpsrld $31, c ## 1, c ## 1; \
 259                vpor RT0, c ## 1, c ## 1; \
 260        \
 261        decrypt_round16(a, b, c, d, nk)
 262
 263#define decrypt_round_last16(a, b, c, d, nk) \
 264        g1_16(a, RX); \
 265        \
 266        g2_16(b, RY); \
 267        \
 268        decrypt_round_end16(a, b, c, d, nk);
 269
 270#define encrypt_cycle16() \
 271        encrypt_round16(RA, RB, RC, RD, 0); \
 272        encrypt_round16(RC, RD, RA, RB, 8);
 273
 274#define encrypt_cycle_first16() \
 275        encrypt_round_first16(RA, RB, RC, RD, 0); \
 276        encrypt_round16(RC, RD, RA, RB, 8);
 277
 278#define encrypt_cycle_last16() \
 279        encrypt_round16(RA, RB, RC, RD, 0); \
 280        encrypt_round_last16(RC, RD, RA, RB, 8);
 281
 282#define decrypt_cycle16(n) \
 283        decrypt_round16(RC, RD, RA, RB, 8); \
 284        decrypt_round16(RA, RB, RC, RD, 0);
 285
 286#define decrypt_cycle_first16(n) \
 287        decrypt_round_first16(RC, RD, RA, RB, 8); \
 288        decrypt_round16(RA, RB, RC, RD, 0);
 289
 290#define decrypt_cycle_last16(n) \
 291        decrypt_round16(RC, RD, RA, RB, 8); \
 292        decrypt_round_last16(RA, RB, RC, RD, 0);
 293
 294#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
 295        vpunpckhdq x1, x0, t2; \
 296        vpunpckldq x1, x0, x0; \
 297        \
 298        vpunpckldq x3, x2, t1; \
 299        vpunpckhdq x3, x2, x2; \
 300        \
 301        vpunpckhqdq t1, x0, x1; \
 302        vpunpcklqdq t1, x0, x0; \
 303        \
 304        vpunpckhqdq x2, t2, x3; \
 305        vpunpcklqdq x2, t2, x2;
 306
 307#define read_blocks8(offs,a,b,c,d) \
 308        transpose_4x4(a, b, c, d, RX0, RY0);
 309
 310#define write_blocks8(offs,a,b,c,d) \
 311        transpose_4x4(a, b, c, d, RX0, RY0);
 312
 313#define inpack_enc8(a,b,c,d) \
 314        vpbroadcastd 4*0(RW), RT0; \
 315        vpxor RT0, a, a; \
 316        \
 317        vpbroadcastd 4*1(RW), RT0; \
 318        vpxor RT0, b, b; \
 319        \
 320        vpbroadcastd 4*2(RW), RT0; \
 321        vpxor RT0, c, c; \
 322        \
 323        vpbroadcastd 4*3(RW), RT0; \
 324        vpxor RT0, d, d;
 325
 326#define outunpack_enc8(a,b,c,d) \
 327        vpbroadcastd 4*4(RW), RX0; \
 328        vpbroadcastd 4*5(RW), RY0; \
 329        vpxor RX0, c, RX0; \
 330        vpxor RY0, d, RY0; \
 331        \
 332        vpbroadcastd 4*6(RW), RT0; \
 333        vpxor RT0, a, c; \
 334        vpbroadcastd 4*7(RW), RT0; \
 335        vpxor RT0, b, d; \
 336        \
 337        vmovdqa RX0, a; \
 338        vmovdqa RY0, b;
 339
 340#define inpack_dec8(a,b,c,d) \
 341        vpbroadcastd 4*4(RW), RX0; \
 342        vpbroadcastd 4*5(RW), RY0; \
 343        vpxor RX0, a, RX0; \
 344        vpxor RY0, b, RY0; \
 345        \
 346        vpbroadcastd 4*6(RW), RT0; \
 347        vpxor RT0, c, a; \
 348        vpbroadcastd 4*7(RW), RT0; \
 349        vpxor RT0, d, b; \
 350        \
 351        vmovdqa RX0, c; \
 352        vmovdqa RY0, d;
 353
 354#define outunpack_dec8(a,b,c,d) \
 355        vpbroadcastd 4*0(RW), RT0; \
 356        vpxor RT0, a, a; \
 357        \
 358        vpbroadcastd 4*1(RW), RT0; \
 359        vpxor RT0, b, b; \
 360        \
 361        vpbroadcastd 4*2(RW), RT0; \
 362        vpxor RT0, c, c; \
 363        \
 364        vpbroadcastd 4*3(RW), RT0; \
 365        vpxor RT0, d, d;
 366
 367#define read_blocks16(a,b,c,d) \
 368        read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
 369        read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
 370
 371#define write_blocks16(a,b,c,d) \
 372        write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
 373        write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
 374
 375#define xor_blocks16(a,b,c,d) \
 376        xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
 377        xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
 378
 379#define inpack_enc16(a,b,c,d) \
 380        inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
 381        inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
 382
 383#define outunpack_enc16(a,b,c,d) \
 384        outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
 385        outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
 386
 387#define inpack_dec16(a,b,c,d) \
 388        inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
 389        inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
 390
 391#define outunpack_dec16(a,b,c,d) \
 392        outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
 393        outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
 394
 395.align 8
 396__twofish_enc_blk16:
 397        /* input:
 398         *      %rdi: ctx, CTX
 399         *      RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
 400         * output:
 401         *      RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
 402         */
 403        init_round_constants();
 404
 405        read_blocks16(RA, RB, RC, RD);
 406        inpack_enc16(RA, RB, RC, RD);
 407
 408        xorl RROUNDd, RROUNDd;
 409        encrypt_cycle_first16();
 410        movl $2, RROUNDd;
 411
 412.align 4
 413.L__enc_loop:
 414        encrypt_cycle16();
 415
 416        addl $2, RROUNDd;
 417        cmpl $14, RROUNDd;
 418        jne .L__enc_loop;
 419
 420        encrypt_cycle_last16();
 421
 422        outunpack_enc16(RA, RB, RC, RD);
 423        write_blocks16(RA, RB, RC, RD);
 424
 425        ret;
 426ENDPROC(__twofish_enc_blk16)
 427
 428.align 8
 429__twofish_dec_blk16:
 430        /* input:
 431         *      %rdi: ctx, CTX
 432         *      RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
 433         * output:
 434         *      RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
 435         */
 436        init_round_constants();
 437
 438        read_blocks16(RA, RB, RC, RD);
 439        inpack_dec16(RA, RB, RC, RD);
 440
 441        movl $14, RROUNDd;
 442        decrypt_cycle_first16();
 443        movl $12, RROUNDd;
 444
 445.align 4
 446.L__dec_loop:
 447        decrypt_cycle16();
 448
 449        addl $-2, RROUNDd;
 450        jnz .L__dec_loop;
 451
 452        decrypt_cycle_last16();
 453
 454        outunpack_dec16(RA, RB, RC, RD);
 455        write_blocks16(RA, RB, RC, RD);
 456
 457        ret;
 458ENDPROC(__twofish_dec_blk16)
 459
 460ENTRY(twofish_ecb_enc_16way)
 461        /* input:
 462         *      %rdi: ctx, CTX
 463         *      %rsi: dst
 464         *      %rdx: src
 465         */
 466
 467        vzeroupper;
 468        pushq %r12;
 469
 470        load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 471
 472        call __twofish_enc_blk16;
 473
 474        store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 475
 476        popq %r12;
 477        vzeroupper;
 478
 479        ret;
 480ENDPROC(twofish_ecb_enc_16way)
 481
 482ENTRY(twofish_ecb_dec_16way)
 483        /* input:
 484         *      %rdi: ctx, CTX
 485         *      %rsi: dst
 486         *      %rdx: src
 487         */
 488
 489        vzeroupper;
 490        pushq %r12;
 491
 492        load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 493
 494        call __twofish_dec_blk16;
 495
 496        store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 497
 498        popq %r12;
 499        vzeroupper;
 500
 501        ret;
 502ENDPROC(twofish_ecb_dec_16way)
 503
 504ENTRY(twofish_cbc_dec_16way)
 505        /* input:
 506         *      %rdi: ctx, CTX
 507         *      %rsi: dst
 508         *      %rdx: src
 509         */
 510
 511        vzeroupper;
 512        pushq %r12;
 513
 514        load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 515
 516        call __twofish_dec_blk16;
 517
 518        store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
 519                        RX0);
 520
 521        popq %r12;
 522        vzeroupper;
 523
 524        ret;
 525ENDPROC(twofish_cbc_dec_16way)
 526
 527ENTRY(twofish_ctr_16way)
 528        /* input:
 529         *      %rdi: ctx, CTX
 530         *      %rsi: dst (16 blocks)
 531         *      %rdx: src (16 blocks)
 532         *      %rcx: iv (little endian, 128bit)
 533         */
 534
 535        vzeroupper;
 536        pushq %r12;
 537
 538        load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
 539                       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
 540                       RBYTE);
 541
 542        call __twofish_enc_blk16;
 543
 544        store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 545
 546        popq %r12;
 547        vzeroupper;
 548
 549        ret;
 550ENDPROC(twofish_ctr_16way)
 551
 552.align 8
 553twofish_xts_crypt_16way:
 554        /* input:
 555         *      %rdi: ctx, CTX
 556         *      %rsi: dst (16 blocks)
 557         *      %rdx: src (16 blocks)
 558         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 559         *      %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
 560         */
 561
 562        vzeroupper;
 563        pushq %r12;
 564
 565        load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
 566                       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
 567                       .Lxts_gf128mul_and_shl1_mask_0,
 568                       .Lxts_gf128mul_and_shl1_mask_1);
 569
 570        call *%r8;
 571
 572        store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
 573
 574        popq %r12;
 575        vzeroupper;
 576
 577        ret;
 578ENDPROC(twofish_xts_crypt_16way)
 579
 580ENTRY(twofish_xts_enc_16way)
 581        /* input:
 582         *      %rdi: ctx, CTX
 583         *      %rsi: dst (16 blocks)
 584         *      %rdx: src (16 blocks)
 585         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 586         */
 587        leaq __twofish_enc_blk16, %r8;
 588        jmp twofish_xts_crypt_16way;
 589ENDPROC(twofish_xts_enc_16way)
 590
 591ENTRY(twofish_xts_dec_16way)
 592        /* input:
 593         *      %rdi: ctx, CTX
 594         *      %rsi: dst (16 blocks)
 595         *      %rdx: src (16 blocks)
 596         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 597         */
 598        leaq __twofish_dec_blk16, %r8;
 599        jmp twofish_xts_crypt_16way;
 600ENDPROC(twofish_xts_dec_16way)
 601