linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Twofish Cipher 3-way parallel algorithm (x86_64)
   4 *
   5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6 */
   7
   8#include <linux/linkage.h>
   9
  10.file "twofish-x86_64-asm-3way.S"
  11.text
  12
  13/* structure of crypto context */
  /*
   * Byte offsets, relative to CTX, of the context fields this file touches:
   *   s0..s3 - four lookup tables spaced 1024 bytes apart; do16bit_ror
   *            indexes them with a zero-extended byte scaled by 4, i.e.
   *            256 entries of 32 bits each
   *   w      - 32 bytes (k - w), read as 64-bit pairs by inpack3 and
   *            outunpack3 at w+4*m for m = 0, 2, 4, 6
   *   k      - 32-bit words read by enc_round_end/dec_round_end at
   *            k+4*(2*n) and k+4*(2*n+1), with n = 0..15 in this file
   * NOTE(review): these values must agree with the C-side context structure,
   * which is not visible in this file - confirm against its definition.
   */
  14#define s0      0
  15#define s1      1024
  16#define s2      2048
  17#define s3      3072
  18#define w       4096
  19#define k       4128
  20
  21/**********************************************************************
  22  3-way twofish
  23 **********************************************************************/
  /*
   * Register roles for the 3-way code. The trailing digit 0/1/2 selects
   * which of the three parallel blocks a register belongs to; the macros
   * below build these names by token pasting (e.g. RAB ## 0, RX0 ## d).
   *
   *   CTX        context pointer, base for every s0..s3 / w / k access
   *   RIO        I/O pointer used by the inpack/outunpack macros; it is the
   *              same register as RT0, so it is not preserved across rounds
   *   RABn       64-bit working register per block, with d (low 32 bits),
   *              bl (byte 0) and bh (byte 1) aliases used for table indexing
   *   CDn        three 8-byte stack slots at 0/8/16(%rsp); they hold the
   *              values stored by push_cd() until pop_cd() removes them
   *   RCDn       r8..r10 outside the rounds
   *   RXn        the same r8..r10, reused as temporaries during the rounds
   *   RYn        r11..r13 (r12/r13 and RAB1's %rbx are callee-saved under
   *              the System V AMD64 ABI; both functions push/pop them)
   *   RT0, RT1   scratch registers (%rdx, %rsi) for table indices and swaps
   */
  24#define CTX %rdi
  25#define RIO %rdx
  26
  27#define RAB0 %rax
  28#define RAB1 %rbx
  29#define RAB2 %rcx
  30
  31#define RAB0d %eax
  32#define RAB1d %ebx
  33#define RAB2d %ecx
  34
  35#define RAB0bh %ah
  36#define RAB1bh %bh
  37#define RAB2bh %ch
  38
  39#define RAB0bl %al
  40#define RAB1bl %bl
  41#define RAB2bl %cl
  42
  43#define CD0 0x0(%rsp)
  44#define CD1 0x8(%rsp)
  45#define CD2 0x10(%rsp)
  46
  47# used only before/after all rounds
  48#define RCD0 %r8
  49#define RCD1 %r9
  50#define RCD2 %r10
  51
  52# used only during rounds
  53#define RX0 %r8
  54#define RX1 %r9
  55#define RX2 %r10
  56
  57#define RX0d %r8d
  58#define RX1d %r9d
  59#define RX2d %r10d
  60
  61#define RY0 %r11
  62#define RY1 %r12
  63#define RY2 %r13
  64
  65#define RY0d %r11d
  66#define RY1d %r12d
  67#define RY2d %r13d
  68
  69#define RT0 %rdx
  70#define RT1 %rsi
  71
  72#define RT0d %edx
  73#define RT1d %esi
  74
  75#define RT1bl %sil
  76
  /*
   * do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst)
   *
   * Use the two lowest bytes of `ab` as indices into two lookup tables,
   * combine the two 32-bit table entries into `dst`, and rotate `ab` right
   * by `rot` bits so a later invocation sees a different pair of bytes.
   *
   *   rot        - rotate count applied to the full 64-bit register ab
   *   op1, op2   - mnemonic stems (this file passes mov or xor); the "l"
   *                suffix is pasted on, so both act on 32-bit operands
   *   T0, T1     - table offsets from CTX; T0 is indexed by byte 0 of ab,
   *                T1 by byte 1, each index scaled by 4
   *   tmp1, tmp2 - register name stems that receive the zero-extended index
   *                bytes (tmp1 <- ab's bh alias, tmp2 <- ab's bl alias)
   *   ab         - register name stem with bl and bh aliases defined
   *   dst        - register name stem whose d alias is the destination
   *
   * Both index bytes are extracted before the rotate, so the rotate does
   * not affect the current lookups. tmp2 may name the same register as dst
   * when op1 is mov (g1g2_3 does this): the index is consumed by the load
   * that overwrites it, and the second lookup indexes with tmp1.
   * Clobbers tmp1, tmp2 and the flags (rorq, and xorl when used).
   */
  77#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
  78        movzbl ab ## bl,                tmp2 ## d; \
  79        movzbl ab ## bh,                tmp1 ## d; \
  80        rorq $(rot),                    ab; \
  81        op1##l T0(CTX, tmp2, 4),        dst ## d; \
  82        op2##l T1(CTX, tmp1, 4),        dst ## d;
  83
  /*
   * swap_ab_with_cd(ab, cd, tmp)
   *
   * Exchange the 64-bit values held in `ab` and `cd`, going through `tmp`.
   * In this file `ab` is a RABn register and `cd` is a CDn stack slot, so
   * the sequence is load-slot / store-register / move-temp. `tmp` is left
   * holding the old `cd` value; flags are untouched (only movq is used).
   */
  84#define swap_ab_with_cd(ab, cd, tmp)    \
  85        movq cd, tmp;                   \
  86        movq ab, cd;                    \
  87        movq tmp, ab;
  88
  89/*
  90 * Combined G1 & G2 function. Reordered with help of rotates to have moves
  91 * at beginning.
  92 */
  /*
   * g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y)
   *
   * ab, cd, x and y are name stems; the block digit 0/1/2 is pasted on here.
   * With b0..b7 denoting the bytes of ab<i> on entry (b0 = lowest), each
   * block i ends up with:
   *
   *   x<i>d = Tx0[b0] ^ Tx1[b1] ^ Tx2[b2] ^ Tx3[b3]
   *   y<i>d = Ty1[b4] ^ Ty2[b5] ^ Ty3[b6] ^ Ty0[b7]
   *
   * The rotate counts per block are 32, 48, 32 and 16, which sum to 128, so
   * ab<i> is back at its entry value when the lookups finish. It is then
   * exchanged with cd<i>: on exit the register ab<i> holds the old cd<i>
   * and the cd<i> location holds the old ab<i>.
   *
   * The first phase passes x<i>/y<i> as both index temporary and
   * destination (op1 is mov); the second phase uses RT1 for that index.
   * Clobbers RT0, RT1 and the flags.
   */
  93#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
  94        /* G1,1 && G2,1 */ \
  95        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
  96        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
  97        \
  98        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
  99        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
 100        \
 101        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
 102        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
 103        \
 104        /* G1,2 && G2,2 */ \
 105        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
 106        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
 107        swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
 108        \
 109        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
 110        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
 111        swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
 112        \
 113        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
 114        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
 115        swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
 116
 /*
  * enc_round_end(ab, x, y, n)
  *
  * Finish one encryption round for a single block. `ab` is a 64-bit
  * register holding two 32-bit halves; x and y are register names with d
  * aliases holding the 32-bit results of g1g2_3; n is the round number.
  * All arithmetic is 32-bit (mod 2^32):
  *
  *   x += y;  y += x                 (y ends as old_x + 2 * old_y)
  *   x += k[2n];  y += k[2n+1]       (32-bit words at CTX + k)
  *   low  half of ab = ror32(x ^ low half of ab, 1)
  *   high half of ab = rol32(high half of ab, 1) ^ y
  *
  * The high half is processed by shifting it down to the low 32 bits and
  * back up, which leaves the low 32 bits of ab zero; rorl writes only x's
  * 32-bit alias, which clears the upper 32 bits of x, so the final orq
  * merges the two halves without overlap. Clobbers x, y and the flags.
  */
 117#define enc_round_end(ab, x, y, n) \
 118        addl y ## d,                    x ## d; \
 119        addl x ## d,                    y ## d; \
 120        addl k+4*(2*(n))(CTX),          x ## d; \
 121        xorl ab ## d,                   x ## d; \
 122        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 123        shrq $32,                       ab; \
 124        roll $1,                        ab ## d; \
 125        xorl y ## d,                    ab ## d; \
 126        shlq $32,                       ab; \
 127        rorl $1,                        x ## d; \
 128        orq x,                          ab;
 129
 /*
  * dec_round_end(ba, x, y, n)
  *
  * Finish one decryption round for a single block. Same structure as
  * enc_round_end, but x and y trade places in the final mixing. All
  * arithmetic is 32-bit (mod 2^32):
  *
  *   x += y;  y += x                 (y ends as old_x + 2 * old_y)
  *   x += k[2n];  y += k[2n+1]       (32-bit words at CTX + k)
  *   low  half of ba = ror32(y ^ low half of ba, 1)
  *   high half of ba = rol32(high half of ba, 1) ^ x
  *
  * After shlq the low 32 bits of ba are zero, and rorl on y's 32-bit alias
  * clears the upper 32 bits of y, so orq merges the halves without overlap.
  * Clobbers x, y and the flags.
  */
 130#define dec_round_end(ba, x, y, n) \
 131        addl y ## d,                    x ## d; \
 132        addl x ## d,                    y ## d; \
 133        addl k+4*(2*(n))(CTX),          x ## d; \
 134        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 135        xorl ba ## d,                   y ## d; \
 136        shrq $32,                       ba; \
 137        roll $1,                        ba ## d; \
 138        xorl x ## d,                    ba ## d; \
 139        shlq $32,                       ba; \
 140        rorl $1,                        y ## d; \
 141        orq y,                          ba;
 142
 /*
  * encrypt_round3(ab, cd, n)
  *
  * One encryption round, number n, over all three blocks. g1g2_3 computes
  * (b0..b7 = bytes of ab<i> on entry, b0 lowest):
  *
  *   RX<i>d = s0[b0] ^ s1[b1] ^ s2[b2] ^ s3[b3]
  *   RY<i>d = s1[b4] ^ s2[b5] ^ s3[b6] ^ s0[b7]
  *
  * and then exchanges ab<i> with cd<i>. The enc_round_end calls therefore
  * update the value that was in cd<i> before the round, while the previous
  * ab<i> now sits in the cd<i> slot untouched.
  */
 143#define encrypt_round3(ab, cd, n) \
 144        g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
 145        \
 146        enc_round_end(ab ## 0, RX0, RY0, n); \
 147        enc_round_end(ab ## 1, RX1, RY1, n); \
 148        enc_round_end(ab ## 2, RX2, RY2, n);
 149
 /*
  * decrypt_round3(ba, dc, n)
  *
  * One decryption round, number n, over all three blocks. Compared with
  * encrypt_round3 the table arguments are shifted by one position and the
  * RX/RY stems are passed in the opposite order, so g1g2_3 computes
  * (b0..b7 = bytes of ba<i> on entry, b0 lowest):
  *
  *   RY<i>d = s1[b0] ^ s2[b1] ^ s3[b2] ^ s0[b3]
  *   RX<i>d = s0[b4] ^ s1[b5] ^ s2[b6] ^ s3[b7]
  *
  * and then exchanges ba<i> with dc<i>. The dec_round_end calls update the
  * value that was in dc<i> before the round.
  */
 150#define decrypt_round3(ba, dc, n) \
 151        g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
 152        \
 153        dec_round_end(ba ## 0, RX0, RY0, n); \
 154        dec_round_end(ba ## 1, RX1, RY1, n); \
 155        dec_round_end(ba ## 2, RX2, RY2, n);
 156
 /*
  * encrypt_cycle3(ab, cd, n)
  *
  * Cycle n = two consecutive encryption rounds, numbered 2n and then 2n+1,
  * over all three blocks. Because each round exchanges ab<i> with cd<i>,
  * the register/stack-slot assignment is back where it started after the
  * pair. The argument n is parenthesised so that an expression argument
  * still yields the intended round numbers.
  */
 157#define encrypt_cycle3(ab, cd, n) \
 158        encrypt_round3(ab, cd, (n)*2); \
 159        encrypt_round3(ab, cd, ((n)*2)+1);
 160
 /*
  * decrypt_cycle3(ba, dc, n)
  *
  * Cycle n = two consecutive decryption rounds over all three blocks, run
  * in the reverse order of encrypt_cycle3: round 2n+1 first, then 2n.
  * The argument n is parenthesised so that an expression argument still
  * yields the intended round numbers.
  */
 161#define decrypt_cycle3(ba, dc, n) \
 162        decrypt_round3(ba, dc, ((n)*2)+1); \
 163        decrypt_round3(ba, dc, ((n)*2));
 164
 /*
  * push_cd()
  *
  * Push RCD2, RCD1, RCD0 in that order, so that RCD0 ends up at 0(%rsp),
  * RCD1 at 8(%rsp) and RCD2 at 0x10(%rsp) - the locations named CD0..CD2.
  * This also frees r8..r10 for use under their RX0..RX2 names. %rsp must
  * not move again until pop_cd(), or the CDn operands would point elsewhere.
  */
 165#define push_cd()       \
 166        pushq RCD2;     \
 167        pushq RCD1;     \
 168        pushq RCD0;
 169
 /*
  * pop_cd()
  *
  * Inverse of push_cd(): reload RCD0, RCD1, RCD2 from the CD0..CD2 stack
  * slots (in that order) and release the 24 bytes of stack.
  */
 170#define pop_cd()        \
 171        popq RCD0;      \
 172        popq RCD1;      \
 173        popq RCD2;
 174
 /*
  * inpack3(in, n, xy, m)
  *
  * Load one 64-bit half of each of three consecutive 16-byte blocks and
  * XOR it with the 64-bit value at CTX + w + 4*m.
  *
  *   in - register holding the input pointer
  *   n  - 32-bit word offset of the half inside a block (0 or 2 here);
  *        blocks 0/1/2 start at word offsets 0, 4 and 8
  *   xy - register name stem; xy0, xy1, xy2 receive the three blocks
  *   m  - 32-bit word index into w (parenthesised so that an expression
  *        argument is scaled as a whole)
  *
  * Clobbers the flags (xorq).
  */
 175#define inpack3(in, n, xy, m) \
 176        movq 4*(n)(in),                 xy ## 0; \
 177        xorq w+4*(m)(CTX),              xy ## 0; \
 178        \
 179        movq 4*(4+(n))(in),             xy ## 1; \
 180        xorq w+4*(m)(CTX),              xy ## 1; \
 181        \
 182        movq 4*(8+(n))(in),             xy ## 2; \
 183        xorq w+4*(m)(CTX),              xy ## 2;
 184
 /*
  * outunpack3(op, out, n, xy, m)
  *
  * XOR each of xy0, xy1, xy2 with the 64-bit value at CTX + w + 4*m and
  * write it to the matching half of three consecutive 16-byte blocks.
  *
  *   op  - mnemonic stem, "q" is pasted on: mov stores the value, xor
  *         XORs it into what the destination memory already holds
  *   out - register holding the output pointer
  *   n   - 32-bit word offset of the half inside a block; blocks 0/1/2
  *         start at word offsets 0, 4 and 8
  *   xy  - register name stem of the three source registers (modified)
  *   m   - 32-bit word index into w (parenthesised so that an expression
  *         argument is scaled as a whole)
  *
  * Clobbers the flags (xorq).
  */
 185#define outunpack3(op, out, n, xy, m) \
 186        xorq w+4*(m)(CTX),              xy ## 0; \
 187        op ## q xy ## 0,                4*(n)(out); \
 188        \
 189        xorq w+4*(m)(CTX),              xy ## 1; \
 190        op ## q xy ## 1,                4*(4+(n))(out); \
 191        \
 192        xorq w+4*(m)(CTX),              xy ## 2; \
 193        op ## q xy ## 2,                4*(8+(n))(out);
 194
 /*
  * inpack_enc3()
  *
  * Load three input blocks from RIO for encryption: RABn takes 32-bit
  * words 0-1 of block n XORed with w[0..1], RCDn takes words 2-3 XORed
  * with w[2..3]. Overwrites RAB0..2 (%rax, %rbx, %rcx) and RCD0..2.
  */
 195#define inpack_enc3() \
 196        inpack3(RIO, 0, RAB, 0); \
 197        inpack3(RIO, 2, RCD, 2);
 198
 /*
  * outunpack_enc3(op)
  *
  * Write the three encrypted blocks through RIO with the halves exchanged
  * relative to inpack_enc3: RABn is XORed with w[6..7] and written to words
  * 2-3 of block n, RCDn is XORed with w[4..5] and written to words 0-1.
  * `op` is mov to store the result or xor to XOR it into the destination.
  */
 199#define outunpack_enc3(op) \
 200        outunpack3(op, RIO, 2, RAB, 6); \
 201        outunpack3(op, RIO, 0, RCD, 4);
 202
 /*
  * inpack_dec3()
  *
  * Load three input blocks from RIO for decryption: RABn takes words 0-1
  * of block n XORed with w[4..5], RCDn takes words 2-3 XORed with w[6..7].
  * Every register is then rotated by 32 bits, i.e. its two 32-bit halves
  * are exchanged, which is the layout the decrypt round macros work on.
  */
 203#define inpack_dec3() \
 204        inpack3(RIO, 0, RAB, 4); \
 205        rorq $32,                       RAB0; \
 206        rorq $32,                       RAB1; \
 207        rorq $32,                       RAB2; \
 208        inpack3(RIO, 2, RCD, 6); \
 209        rorq $32,                       RCD0; \
 210        rorq $32,                       RCD1; \
 211        rorq $32,                       RCD2;
 212
 /*
  * outunpack_dec3()
  *
  * Store the three decrypted blocks through RIO. Each register first has
  * its 32-bit halves exchanged back (rotate by 32), then RCDn is XORed with
  * w[0..1] and stored to words 0-1 of block n, and RABn is XORed with
  * w[2..3] and stored to words 2-3. The store is always a plain mov.
  */
 213#define outunpack_dec3() \
 214        rorq $32,                       RCD0; \
 215        rorq $32,                       RCD1; \
 216        rorq $32,                       RCD2; \
 217        outunpack3(mov, RIO, 0, RCD, 0); \
 218        rorq $32,                       RAB0; \
 219        rorq $32,                       RAB1; \
 220        rorq $32,                       RAB2; \
 221        outunpack3(mov, RIO, 2, RAB, 2);
 222
 /*
  * __twofish_enc_blk_3way
  *
  * Encrypt three consecutive 16-byte blocks (48 bytes read from src, 48
  * bytes written to dst) in parallel. Runs 8 cycles = 16 rounds, then
  * either stores the result to dst or, when the bool argument is non-zero,
  * XORs the result into dst.
  *
  * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi,
  * %r8-%r11 and the flags are overwritten; no return value is set up.
  * Only the low byte of the bool argument is tested.
  * NOTE(review): some kernel trees use a RET macro from <linux/linkage.h>
  * instead of a bare ret - confirm what the target tree expects.
  */
 223SYM_FUNC_START(__twofish_enc_blk_3way)
 224        /* input:
 225         *      %rdi: ctx, CTX
 226         *      %rsi: dst
 227         *      %rdx: src, RIO
 228         *      %rcx: bool, if true: xor output
 229         */
        /* callee-saved registers used as RY2, RY1 and RAB1 */
 230        pushq %r13;
 231        pushq %r12;
 232        pushq %rbx;
 233
        /* %rcx is RAB2 and %rsi is RT1: both get overwritten below */
 234        pushq %rcx; /* bool xor */
 235        pushq %rsi; /* dst */
 236
        /* last use of src: RIO (%rdx) doubles as scratch RT0 in the rounds */
 237        inpack_enc3();
 238
        /* park RCD0..2 in the CD0..CD2 stack slots; r8..r10 become RX0..2 */
 239        push_cd();
 240        encrypt_cycle3(RAB, CD, 0);
 241        encrypt_cycle3(RAB, CD, 1);
 242        encrypt_cycle3(RAB, CD, 2);
 243        encrypt_cycle3(RAB, CD, 3);
 244        encrypt_cycle3(RAB, CD, 4);
 245        encrypt_cycle3(RAB, CD, 5);
 246        encrypt_cycle3(RAB, CD, 6);
 247        encrypt_cycle3(RAB, CD, 7);
 248        pop_cd();
 249
        /* the output macros address memory through RIO, so dst goes there */
 250        popq RIO; /* dst */
 251        popq RT1; /* bool xor */
 252
        /* non-zero low byte selects the XOR-into-dst variant */
 253        testb RT1bl, RT1bl;
 254        jnz .L__enc_xor3;
 255
 256        outunpack_enc3(mov);
 257
 258        popq %rbx;
 259        popq %r12;
 260        popq %r13;
 261        ret;
 262
 263.L__enc_xor3:
        /* same as above, but dst ^= result instead of dst = result */
 264        outunpack_enc3(xor);
 265
 266        popq %rbx;
 267        popq %r12;
 268        popq %r13;
 269        ret;
 270SYM_FUNC_END(__twofish_enc_blk_3way)
 271
 /*
  * twofish_dec_blk_3way
  *
  * Decrypt three consecutive 16-byte blocks (48 bytes read from src, 48
  * bytes written to dst) in parallel. Runs the 8 cycles in descending
  * order (7 down to 0), each cycle doing round 2n+1 before round 2n, and
  * always stores the result with mov.
  *
  * %rbx, %r12 and %r13 are saved and restored. %rax, %rcx, %rdx, %rsi,
  * %r8-%r11 and the flags are overwritten; no return value is set up.
  * NOTE(review): some kernel trees use a RET macro from <linux/linkage.h>
  * instead of a bare ret - confirm what the target tree expects.
  */
 272SYM_FUNC_START(twofish_dec_blk_3way)
 273        /* input:
 274         *      %rdi: ctx, CTX
 275         *      %rsi: dst
 276         *      %rdx: src, RIO
 277         */
        /* callee-saved registers used as RY2, RY1 and RAB1 */
 278        pushq %r13;
 279        pushq %r12;
 280        pushq %rbx;
 281
        /* %rsi is RT1 and gets overwritten during the rounds */
 282        pushq %rsi; /* dst */
 283
        /* last use of src: RIO (%rdx) doubles as scratch RT0 in the rounds */
 284        inpack_dec3();
 285
        /* park RCD0..2 in the CD0..CD2 stack slots; r8..r10 become RX0..2 */
 286        push_cd();
 287        decrypt_cycle3(RAB, CD, 7);
 288        decrypt_cycle3(RAB, CD, 6);
 289        decrypt_cycle3(RAB, CD, 5);
 290        decrypt_cycle3(RAB, CD, 4);
 291        decrypt_cycle3(RAB, CD, 3);
 292        decrypt_cycle3(RAB, CD, 2);
 293        decrypt_cycle3(RAB, CD, 1);
 294        decrypt_cycle3(RAB, CD, 0);
 295        pop_cd();
 296
        /* the output macro addresses memory through RIO, so dst goes there */
 297        popq RIO; /* dst */
 298
 299        outunpack_dec3();
 300
 301        popq %rbx;
 302        popq %r12;
 303        popq %r13;
 304        ret;
 305SYM_FUNC_END(twofish_dec_blk_3way)
 306