linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
<<
>>
Prefs
   1/*
   2 * Twofish Cipher 3-way parallel algorithm (x86_64)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  19 * USA
  20 *
  21 */
  22
  23.file "twofish-x86_64-asm-3way.S"
  24.text
  25
  26/* structure of crypto context */
/*
 * Byte offsets into the context addressed through CTX:
 *   s0..s3  four lookup tables placed 1024 bytes apart; the round code
 *           indexes them with a zero-extended byte scaled by 4, i.e. as
 *           tables of 32-bit entries.
 *   w       eight 32-bit words (w .. w+31), XORed into the data 64 bits at
 *           a time when blocks are loaded and stored.
 *   k       32-bit words read by the round-end macros, two per round, at
 *           k+4*(2*n) and k+4*(2*n+1).
 * NOTE(review): presumably this mirrors the layout of the C-side Twofish
 * context structure -- confirm against its definition, which is not visible
 * in this file.
 */
  27#define s0      0
  28#define s1      1024
  29#define s2      2048
  30#define s3      3072
  31#define w       4096
  32#define k       4128
  33
  34/**********************************************************************
  35  3-way twofish
  36 **********************************************************************/
/*
 * Register allocation. Names ending in 0/1/2 belong to the first, second
 * and third block being processed; the macros below build these names by
 * token pasting (e.g. ab ## 0, x ## d), so the suffix scheme is part of the
 * interface between the macros.
 *
 * CTX is the base register for every table and key access.
 * RIO is the pointer the pack/unpack macros read from and write to. It
 * shares %rdx with the temporary RT0, so it is only valid outside the
 * round macros.
 */
  37#define CTX %rdi
  38#define RIO %rdx
  39
/*
 * RABn: one 64-bit half of block n (two 32-bit words). These must be
 * %rax/%rbx/%rcx because do16bit_ror needs both the low-byte (bl) and the
 * second-byte (bh) alias of the register.
 */
  40#define RAB0 %rax
  41#define RAB1 %rbx
  42#define RAB2 %rcx
  43
/* 32-bit views of RABn. */
  44#define RAB0d %eax
  45#define RAB1d %ebx
  46#define RAB2d %ecx
  47
/* Bits 8..15 of RABn. */
  48#define RAB0bh %ah
  49#define RAB1bh %bh
  50#define RAB2bh %ch
  51
/* Bits 0..7 of RABn. */
  52#define RAB0bl %al
  53#define RAB1bl %bl
  54#define RAB2bl %cl
  55
/*
 * RCDn: the other 64-bit half of block n. No byte aliases are defined for
 * these; g1g2_3 exchanges RCDn with RABn at the end of each round so that
 * the table lookups always read from RABn.
 */
  56#define RCD0 %r8
  57#define RCD1 %r9
  58#define RCD2 %r10
  59
/* 32-bit views of RCDn. */
  60#define RCD0d %r8d
  61#define RCD1d %r9d
  62#define RCD2d %r10d
  63
/* RXn / RYn: per-block 32-bit accumulators for the table-lookup results. */
  64#define RX0 %rbp
  65#define RX1 %r11
  66#define RX2 %r12
  67
  68#define RX0d %ebp
  69#define RX1d %r11d
  70#define RX2d %r12d
  71
  72#define RY0 %r13
  73#define RY1 %r14
  74#define RY2 %r15
  75
  76#define RY0d %r13d
  77#define RY1d %r14d
  78#define RY2d %r15d
  79
/*
 * RT0 / RT1: scratch registers for table indices. RT0 is the same register
 * as RIO (%rdx); RT1 is %rsi, which carries the dst argument on function
 * entry, so the entry points push %rsi before the first round.
 */
  80#define RT0 %rdx
  81#define RT1 %rsi
  82
  83#define RT0d %edx
  84#define RT1d %esi
  85
/*
 * do16bit_ror: use the low 16 bits of `ab` as two table indices, then
 * rotate `ab` so the next 16 bits of interest move into place.
 *
 *   tmp2 = bits 0..7 of ab, tmp1 = bits 8..15 of ab (zero-extended)
 *   ab   = ab rotated right by `rot` bits (64-bit rotate)
 *   dst  = dst <op1> T0[tmp2]
 *   dst  = dst <op2> T1[tmp1]
 *
 * op1/op2 are mnemonic stems (mov or xor in this file) that get an `l`
 * suffix appended, so with op1 = mov the first lookup loads dst instead of
 * combining into it. T0/T1 are table offsets relative to CTX; entries are
 * 32 bits wide (index scaled by 4). tmp1 and tmp2 are clobbered. tmp2 may
 * name the same register as dst, as the first lookups in g1g2_3 do: the
 * index is read by the very instruction that overwrites it. `ab` must be a
 * name for which both the bl and bh aliases are defined (RAB0..RAB2).
 */
  86#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
  87        movzbl ab ## bl,                tmp2 ## d; \
  88        movzbl ab ## bh,                tmp1 ## d; \
  89        rorq $(rot),                    ab; \
  90        op1##l T0(CTX, tmp2, 4),        dst ## d; \
  91        op2##l T1(CTX, tmp1, 4),        dst ## d;
  92
  93/*
  94 * Combined G1 & G2 function. Reordered with help of rotates to have moves
  95 * at beginning.
  96 */
/*
 * g1g2_3: for each of the three blocks, derive two 32-bit values from the
 * eight bytes of register ab<n> (byte 0 = least significant):
 *
 *   x<n> = Tx0[byte0] ^ Tx1[byte1] ^ Tx2[byte2] ^ Tx3[byte3]
 *   y<n> = Ty1[byte4] ^ Ty2[byte5] ^ Ty3[byte6] ^ Ty0[byte7]
 *
 * `ab`/`cd` are register-name prefixes (RAB/RCD), `x`/`y` are accumulator
 * name prefixes (RX/RY); the block number 0/1/2 is pasted on here.
 *
 * The rotate counts applied to each ab<n> (32, 48, 32, 16) add up to 128, a
 * multiple of 64, so ab<n> holds its original value again once the lookups
 * are done. It is then exchanged with cd<n>: on exit the ab<n> registers
 * hold what was in cd<n> and vice versa.
 *
 * The first half uses the destination accumulator itself as the low-byte
 * index register, so only RT0 is needed as scratch there; the second half
 * must preserve the partial sums and therefore uses RT0 and RT1. Both are
 * clobbered (RT0 is the same register as RIO).
 */
  97#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
  98        /* G1,1 && G2,1 */ \
  99        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
 100        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
 101        \
 102        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
 103        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
 104        \
 105        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
 106        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
 107        \
 108        /* G1,2 && G2,2 */ \
 109        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
 110        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
 111        xchgq cd ## 0, ab ## 0; \
 112        \
 113        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
 114        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
 115        xchgq cd ## 1, ab ## 1; \
 116        \
 117        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
 118        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
 119        xchgq cd ## 2, ab ## 2;
 120
/*
 * enc_round_end: finish one encryption round for a single block.
 *
 *   ab    64-bit register to update; lo = bits 0..31, hi = bits 32..63
 *   x, y  64-bit names of the registers holding the two 32-bit lookup
 *         results (their 32-bit views are formed by pasting `d`)
 *   n     round number; selects the key words k[2n] and k[2n+1]
 *
 * With t0 = x + y and t1 = t0 + y (32-bit, wrapping):
 *
 *   lo' = ror32((t0 + k[2n]) ^ lo, 1)
 *   hi' = rol32(hi, 1) ^ (t1 + k[2n+1])
 *   ab  = (hi' << 32) | lo'
 *
 * The final orq depends on the upper half of x being zero, which holds
 * because x was last written through 32-bit operations. x and y are
 * clobbered.
 */
 121#define enc_round_end(ab, x, y, n) \
 122        addl y ## d,                    x ## d; \
 123        addl x ## d,                    y ## d; \
 124        addl k+4*(2*(n))(CTX),          x ## d; \
 125        xorl ab ## d,                   x ## d; \
 126        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 127        shrq $32,                       ab; \
 128        roll $1,                        ab ## d; \
 129        xorl y ## d,                    ab ## d; \
 130        shlq $32,                       ab; \
 131        rorl $1,                        x ## d; \
 132        orq x,                          ab;
 133
/*
 * dec_round_end: finish one decryption round for a single block.
 *
 *   ba    64-bit register to update; lo = bits 0..31, hi = bits 32..63
 *   x, y  64-bit names of the registers holding the two 32-bit lookup
 *         results (their 32-bit views are formed by pasting `d`)
 *   n     round number; selects the key words k[2n] and k[2n+1]
 *
 * With t0 = x + y and t1 = t0 + y (32-bit, wrapping):
 *
 *   lo' = ror32((t1 + k[2n+1]) ^ lo, 1)
 *   hi' = rol32(hi, 1) ^ (t0 + k[2n])
 *   ba  = (hi' << 32) | lo'
 *
 * Compared with enc_round_end the roles of x and y in the two halves are
 * exchanged. The final orq depends on the upper half of y being zero, which
 * holds because y was last written through 32-bit operations. x and y are
 * clobbered.
 */
 134#define dec_round_end(ba, x, y, n) \
 135        addl y ## d,                    x ## d; \
 136        addl x ## d,                    y ## d; \
 137        addl k+4*(2*(n))(CTX),          x ## d; \
 138        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 139        xorl ba ## d,                   y ## d; \
 140        shrq $32,                       ba; \
 141        roll $1,                        ba ## d; \
 142        xorl x ## d,                    ba ## d; \
 143        shlq $32,                       ba; \
 144        rorl $1,                        y ## d; \
 145        orq y,                          ba;
 146
/*
 * encrypt_round3: one encryption round (number n) on all three blocks.
 *
 * g1g2_3 computes the lookup sums into RX<n>/RY<n> using tables s0..s3 in
 * natural order, and leaves ab<n> and cd<n> exchanged. The enc_round_end
 * calls therefore update the value that was in cd<n> before this macro ran,
 * which now lives in the ab<n> registers.
 */
 147#define encrypt_round3(ab, cd, n) \
 148        g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
 149        \
 150        enc_round_end(ab ## 0, RX0, RY0, n); \
 151        enc_round_end(ab ## 1, RX1, RY1, n); \
 152        enc_round_end(ab ## 2, RX2, RY2, n);
 153
/*
 * decrypt_round3: one decryption round (number n) on all three blocks.
 *
 * Here g1g2_3 is given rotated table orders and swapped accumulators: its
 * "x" sum (from the low dword of ba<n>) goes to RY<n> using s1,s2,s3,s0 for
 * bytes 0..3, and its "y" sum (from the high dword) goes to RX<n>, which
 * with Ty0..Ty3 = s3,s0,s1,s2 works out to s0,s1,s2,s3 for bytes 4..7.
 * As in the encrypt case, g1g2_3 leaves ba<n> and dc<n> exchanged, so
 * dec_round_end updates the value that was in dc<n> on entry.
 */
 154#define decrypt_round3(ba, dc, n) \
 155        g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
 156        \
 157        dec_round_end(ba ## 0, RX0, RY0, n); \
 158        dec_round_end(ba ## 1, RX1, RY1, n); \
 159        dec_round_end(ba ## 2, RX2, RY2, n);
 160
/*
 * encrypt_cycle3: two consecutive encryption rounds, 2n and 2n+1.
 *
 * Each round exchanges the ab and cd registers, so after the pair they are
 * back in their original roles and the same (ab, cd) arguments can be used
 * for the next cycle. `n` is pasted without parentheses; pass a plain
 * constant.
 */
 161#define encrypt_cycle3(ab, cd, n) \
 162        encrypt_round3(ab, cd, n*2); \
 163        encrypt_round3(ab, cd, (n*2)+1);
 164
/*
 * decrypt_cycle3: two consecutive decryption rounds, 2n+1 first and then
 * 2n, i.e. the round keys are consumed in descending order.
 *
 * Each round exchanges the ba and dc registers, so after the pair they are
 * back in their original roles. `n` is pasted without parentheses; pass a
 * plain constant.
 */
 165#define decrypt_cycle3(ba, dc, n) \
 166        decrypt_round3(ba, dc, (n*2)+1); \
 167        decrypt_round3(ba, dc, (n*2));
 168
/*
 * inpack3: load one 64-bit half of each of the three blocks and XOR it with
 * a 64-bit key value from the `w` area.
 *
 *   in   register holding the source pointer
 *   n    offset of the half inside a block, in 32-bit words
 *   xy   register-name prefix (RAB or RCD); 0/1/2 is pasted on
 *   m    offset into `w`, in 32-bit words (pasted without parentheses)
 *
 * Blocks are 16 bytes apart: block i is read at byte offset 4*(4*i + n).
 * The same key value is applied to all three blocks.
 */
 169#define inpack3(in, n, xy, m) \
 170        movq 4*(n)(in),                 xy ## 0; \
 171        xorq w+4*m(CTX),                xy ## 0; \
 172        \
 173        movq 4*(4+(n))(in),             xy ## 1; \
 174        xorq w+4*m(CTX),                xy ## 1; \
 175        \
 176        movq 4*(8+(n))(in),             xy ## 2; \
 177        xorq w+4*m(CTX),                xy ## 2;
 178
/*
 * outunpack3: XOR one 64-bit half of each of the three blocks with a 64-bit
 * key value from the `w` area and write it out.
 *
 *   op   mnemonic stem for the write, `q` is appended: mov stores the
 *        result, xor combines it with what is already at the destination
 *   out  register holding the destination pointer
 *   n    offset of the half inside a block, in 32-bit words
 *   xy   register-name prefix (RAB or RCD); 0/1/2 is pasted on
 *   m    offset into `w`, in 32-bit words (pasted without parentheses)
 *
 * Blocks are 16 bytes apart: block i is written at byte offset 4*(4*i + n).
 * The xy registers are modified (they hold the keyed value afterwards).
 */
 179#define outunpack3(op, out, n, xy, m) \
 180        xorq w+4*m(CTX),                xy ## 0; \
 181        op ## q xy ## 0,                4*(n)(out); \
 182        \
 183        xorq w+4*m(CTX),                xy ## 1; \
 184        op ## q xy ## 1,                4*(4+(n))(out); \
 185        \
 186        xorq w+4*m(CTX),                xy ## 2; \
 187        op ## q xy ## 2,                4*(8+(n))(out);
 188
/*
 * inpack_enc3: load three blocks from RIO for encryption. Bytes 0..7 of
 * each block go to RAB<n> keyed with w[0..1]; bytes 8..15 go to RCD<n>
 * keyed with w[2..3].
 */
 189#define inpack_enc3() \
 190        inpack3(RIO, 0, RAB, 0); \
 191        inpack3(RIO, 2, RCD, 2);
 192
/*
 * outunpack_enc3: write three encrypted blocks to RIO using `op` (mov to
 * store, xor to combine with the destination contents). The halves are
 * written crosswise relative to inpack_enc3: RAB<n>, keyed with w[6..7],
 * goes to bytes 8..15 and RCD<n>, keyed with w[4..5], goes to bytes 0..7.
 */
 193#define outunpack_enc3(op) \
 194        outunpack3(op, RIO, 2, RAB, 6); \
 195        outunpack3(op, RIO, 0, RCD, 4);
 196
/*
 * inpack_dec3: load three blocks from RIO for decryption. Bytes 0..7 of
 * each block go to RAB<n> keyed with w[4..5]; bytes 8..15 go to RCD<n>
 * keyed with w[6..7]. Each register is then rotated by 32 bits, which
 * swaps its two 32-bit words into the order the decrypt rounds operate on.
 */
 197#define inpack_dec3() \
 198        inpack3(RIO, 0, RAB, 4); \
 199        rorq $32,                       RAB0; \
 200        rorq $32,                       RAB1; \
 201        rorq $32,                       RAB2; \
 202        inpack3(RIO, 2, RCD, 6); \
 203        rorq $32,                       RCD0; \
 204        rorq $32,                       RCD1; \
 205        rorq $32,                       RCD2;
 206
/*
 * outunpack_dec3: write three decrypted blocks to RIO. Each register is
 * first rotated by 32 bits to undo the word swap done by inpack_dec3, then
 * keyed and stored: RCD<n> with w[0..1] to bytes 0..7, RAB<n> with w[2..3]
 * to bytes 8..15. Output is always stored (mov); there is no xor variant.
 */
 207#define outunpack_dec3() \
 208        rorq $32,                       RCD0; \
 209        rorq $32,                       RCD1; \
 210        rorq $32,                       RCD2; \
 211        outunpack3(mov, RIO, 0, RCD, 0); \
 212        rorq $32,                       RAB0; \
 213        rorq $32,                       RAB1; \
 214        rorq $32,                       RAB2; \
 215        outunpack3(mov, RIO, 2, RAB, 2);
 216
 217.align 8
 218.global __twofish_enc_blk_3way
 219.type   __twofish_enc_blk_3way,@function;
 220
 221__twofish_enc_blk_3way:
 222        /* input:
 223         *      %rdi: ctx, CTX
 224         *      %rsi: dst
 225         *      %rdx: src, RIO
 226         *      %rcx: bool, if true: xor output
 227         */
 228        pushq %r15;
 229        pushq %r14;
 230        pushq %r13;
 231        pushq %r12;
 232        pushq %rbp;
 233        pushq %rbx;
 234
 235        pushq %rcx; /* bool xor */
 236        pushq %rsi; /* dst */
 237
 238        inpack_enc3();
 239
 240        encrypt_cycle3(RAB, RCD, 0);
 241        encrypt_cycle3(RAB, RCD, 1);
 242        encrypt_cycle3(RAB, RCD, 2);
 243        encrypt_cycle3(RAB, RCD, 3);
 244        encrypt_cycle3(RAB, RCD, 4);
 245        encrypt_cycle3(RAB, RCD, 5);
 246        encrypt_cycle3(RAB, RCD, 6);
 247        encrypt_cycle3(RAB, RCD, 7);
 248
 249        popq RIO; /* dst */
 250        popq %rbp; /* bool xor */
 251
 252        testb %bpl, %bpl;
 253        jnz __enc_xor3;
 254
 255        outunpack_enc3(mov);
 256
 257        popq %rbx;
 258        popq %rbp;
 259        popq %r12;
 260        popq %r13;
 261        popq %r14;
 262        popq %r15;
 263        ret;
 264
 265__enc_xor3:
 266        outunpack_enc3(xor);
 267
 268        popq %rbx;
 269        popq %rbp;
 270        popq %r12;
 271        popq %r13;
 272        popq %r14;
 273        popq %r15;
 274        ret;
 275
 276.global twofish_dec_blk_3way
 277.type   twofish_dec_blk_3way,@function;
 278
 279twofish_dec_blk_3way:
 280        /* input:
 281         *      %rdi: ctx, CTX
 282         *      %rsi: dst
 283         *      %rdx: src, RIO
 284         */
 285        pushq %r15;
 286        pushq %r14;
 287        pushq %r13;
 288        pushq %r12;
 289        pushq %rbp;
 290        pushq %rbx;
 291
 292        pushq %rsi; /* dst */
 293
 294        inpack_dec3();
 295
 296        decrypt_cycle3(RAB, RCD, 7);
 297        decrypt_cycle3(RAB, RCD, 6);
 298        decrypt_cycle3(RAB, RCD, 5);
 299        decrypt_cycle3(RAB, RCD, 4);
 300        decrypt_cycle3(RAB, RCD, 3);
 301        decrypt_cycle3(RAB, RCD, 2);
 302        decrypt_cycle3(RAB, RCD, 1);
 303        decrypt_cycle3(RAB, RCD, 0);
 304
 305        popq RIO; /* dst */
 306
 307        outunpack_dec3();
 308
 309        popq %rbx;
 310        popq %rbp;
 311        popq %r12;
 312        popq %r13;
 313        popq %r14;
 314        popq %r15;
 315        ret;
 316
 317