linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
<<
>>
Prefs
   1/*
   2 * Twofish Cipher 3-way parallel algorithm (x86_64)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  19 * USA
  20 *
  21 */
  22
  23#include <linux/linkage.h>
  24
  25.file "twofish-x86_64-asm-3way.S"
  26.text
  27
  28/* structure of crypto context */
  29#define s0      0
  30#define s1      1024
  31#define s2      2048
  32#define s3      3072
  33#define w       4096
  34#define k       4128
  35
  36/**********************************************************************
  37  3-way twofish
  38 **********************************************************************/
  39#define CTX %rdi
  40#define RIO %rdx
  41
  42#define RAB0 %rax
  43#define RAB1 %rbx
  44#define RAB2 %rcx
  45
  46#define RAB0d %eax
  47#define RAB1d %ebx
  48#define RAB2d %ecx
  49
  50#define RAB0bh %ah
  51#define RAB1bh %bh
  52#define RAB2bh %ch
  53
  54#define RAB0bl %al
  55#define RAB1bl %bl
  56#define RAB2bl %cl
  57
  58#define RCD0 %r8
  59#define RCD1 %r9
  60#define RCD2 %r10
  61
  62#define RCD0d %r8d
  63#define RCD1d %r9d
  64#define RCD2d %r10d
  65
  66#define RX0 %rbp
  67#define RX1 %r11
  68#define RX2 %r12
  69
  70#define RX0d %ebp
  71#define RX1d %r11d
  72#define RX2d %r12d
  73
  74#define RY0 %r13
  75#define RY1 %r14
  76#define RY2 %r15
  77
  78#define RY0d %r13d
  79#define RY1d %r14d
  80#define RY2d %r15d
  81
  82#define RT0 %rdx
  83#define RT1 %rsi
  84
  85#define RT0d %edx
  86#define RT1d %esi
  87
  88#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
  89        movzbl ab ## bl,                tmp2 ## d; \
  90        movzbl ab ## bh,                tmp1 ## d; \
  91        rorq $(rot),                    ab; \
  92        op1##l T0(CTX, tmp2, 4),        dst ## d; \
  93        op2##l T1(CTX, tmp1, 4),        dst ## d;
  94
  95/*
  96 * Combined G1 & G2 function. Reordered with help of rotates to have moves
  97 * at begining.
  98 */
  99#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
 100        /* G1,1 && G2,1 */ \
 101        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
 102        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
 103        \
 104        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
 105        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
 106        \
 107        do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
 108        do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
 109        \
 110        /* G1,2 && G2,2 */ \
 111        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
 112        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
 113        xchgq cd ## 0, ab ## 0; \
 114        \
 115        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
 116        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
 117        xchgq cd ## 1, ab ## 1; \
 118        \
 119        do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
 120        do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
 121        xchgq cd ## 2, ab ## 2;
 122
 123#define enc_round_end(ab, x, y, n) \
 124        addl y ## d,                    x ## d; \
 125        addl x ## d,                    y ## d; \
 126        addl k+4*(2*(n))(CTX),          x ## d; \
 127        xorl ab ## d,                   x ## d; \
 128        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 129        shrq $32,                       ab; \
 130        roll $1,                        ab ## d; \
 131        xorl y ## d,                    ab ## d; \
 132        shlq $32,                       ab; \
 133        rorl $1,                        x ## d; \
 134        orq x,                          ab;
 135
 136#define dec_round_end(ba, x, y, n) \
 137        addl y ## d,                    x ## d; \
 138        addl x ## d,                    y ## d; \
 139        addl k+4*(2*(n))(CTX),          x ## d; \
 140        addl k+4*(2*(n)+1)(CTX),        y ## d; \
 141        xorl ba ## d,                   y ## d; \
 142        shrq $32,                       ba; \
 143        roll $1,                        ba ## d; \
 144        xorl x ## d,                    ba ## d; \
 145        shlq $32,                       ba; \
 146        rorl $1,                        y ## d; \
 147        orq y,                          ba;
 148
 149#define encrypt_round3(ab, cd, n) \
 150        g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
 151        \
 152        enc_round_end(ab ## 0, RX0, RY0, n); \
 153        enc_round_end(ab ## 1, RX1, RY1, n); \
 154        enc_round_end(ab ## 2, RX2, RY2, n);
 155
 156#define decrypt_round3(ba, dc, n) \
 157        g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
 158        \
 159        dec_round_end(ba ## 0, RX0, RY0, n); \
 160        dec_round_end(ba ## 1, RX1, RY1, n); \
 161        dec_round_end(ba ## 2, RX2, RY2, n);
 162
 163#define encrypt_cycle3(ab, cd, n) \
 164        encrypt_round3(ab, cd, n*2); \
 165        encrypt_round3(ab, cd, (n*2)+1);
 166
 167#define decrypt_cycle3(ba, dc, n) \
 168        decrypt_round3(ba, dc, (n*2)+1); \
 169        decrypt_round3(ba, dc, (n*2));
 170
 171#define inpack3(in, n, xy, m) \
 172        movq 4*(n)(in),                 xy ## 0; \
 173        xorq w+4*m(CTX),                xy ## 0; \
 174        \
 175        movq 4*(4+(n))(in),             xy ## 1; \
 176        xorq w+4*m(CTX),                xy ## 1; \
 177        \
 178        movq 4*(8+(n))(in),             xy ## 2; \
 179        xorq w+4*m(CTX),                xy ## 2;
 180
 181#define outunpack3(op, out, n, xy, m) \
 182        xorq w+4*m(CTX),                xy ## 0; \
 183        op ## q xy ## 0,                4*(n)(out); \
 184        \
 185        xorq w+4*m(CTX),                xy ## 1; \
 186        op ## q xy ## 1,                4*(4+(n))(out); \
 187        \
 188        xorq w+4*m(CTX),                xy ## 2; \
 189        op ## q xy ## 2,                4*(8+(n))(out);
 190
 191#define inpack_enc3() \
 192        inpack3(RIO, 0, RAB, 0); \
 193        inpack3(RIO, 2, RCD, 2);
 194
 195#define outunpack_enc3(op) \
 196        outunpack3(op, RIO, 2, RAB, 6); \
 197        outunpack3(op, RIO, 0, RCD, 4);
 198
 199#define inpack_dec3() \
 200        inpack3(RIO, 0, RAB, 4); \
 201        rorq $32,                       RAB0; \
 202        rorq $32,                       RAB1; \
 203        rorq $32,                       RAB2; \
 204        inpack3(RIO, 2, RCD, 6); \
 205        rorq $32,                       RCD0; \
 206        rorq $32,                       RCD1; \
 207        rorq $32,                       RCD2;
 208
 209#define outunpack_dec3() \
 210        rorq $32,                       RCD0; \
 211        rorq $32,                       RCD1; \
 212        rorq $32,                       RCD2; \
 213        outunpack3(mov, RIO, 0, RCD, 0); \
 214        rorq $32,                       RAB0; \
 215        rorq $32,                       RAB1; \
 216        rorq $32,                       RAB2; \
 217        outunpack3(mov, RIO, 2, RAB, 2);
 218
 219ENTRY(__twofish_enc_blk_3way)
 220        /* input:
 221         *      %rdi: ctx, CTX
 222         *      %rsi: dst
 223         *      %rdx: src, RIO
 224         *      %rcx: bool, if true: xor output
 225         */
 226        pushq %r15;
 227        pushq %r14;
 228        pushq %r13;
 229        pushq %r12;
 230        pushq %rbp;
 231        pushq %rbx;
 232
 233        pushq %rcx; /* bool xor */
 234        pushq %rsi; /* dst */
 235
 236        inpack_enc3();
 237
 238        encrypt_cycle3(RAB, RCD, 0);
 239        encrypt_cycle3(RAB, RCD, 1);
 240        encrypt_cycle3(RAB, RCD, 2);
 241        encrypt_cycle3(RAB, RCD, 3);
 242        encrypt_cycle3(RAB, RCD, 4);
 243        encrypt_cycle3(RAB, RCD, 5);
 244        encrypt_cycle3(RAB, RCD, 6);
 245        encrypt_cycle3(RAB, RCD, 7);
 246
 247        popq RIO; /* dst */
 248        popq %rbp; /* bool xor */
 249
 250        testb %bpl, %bpl;
 251        jnz .L__enc_xor3;
 252
 253        outunpack_enc3(mov);
 254
 255        popq %rbx;
 256        popq %rbp;
 257        popq %r12;
 258        popq %r13;
 259        popq %r14;
 260        popq %r15;
 261        ret;
 262
 263.L__enc_xor3:
 264        outunpack_enc3(xor);
 265
 266        popq %rbx;
 267        popq %rbp;
 268        popq %r12;
 269        popq %r13;
 270        popq %r14;
 271        popq %r15;
 272        ret;
 273ENDPROC(__twofish_enc_blk_3way)
 274
 275ENTRY(twofish_dec_blk_3way)
 276        /* input:
 277         *      %rdi: ctx, CTX
 278         *      %rsi: dst
 279         *      %rdx: src, RIO
 280         */
 281        pushq %r15;
 282        pushq %r14;
 283        pushq %r13;
 284        pushq %r12;
 285        pushq %rbp;
 286        pushq %rbx;
 287
 288        pushq %rsi; /* dst */
 289
 290        inpack_dec3();
 291
 292        decrypt_cycle3(RAB, RCD, 7);
 293        decrypt_cycle3(RAB, RCD, 6);
 294        decrypt_cycle3(RAB, RCD, 5);
 295        decrypt_cycle3(RAB, RCD, 4);
 296        decrypt_cycle3(RAB, RCD, 3);
 297        decrypt_cycle3(RAB, RCD, 2);
 298        decrypt_cycle3(RAB, RCD, 1);
 299        decrypt_cycle3(RAB, RCD, 0);
 300
 301        popq RIO; /* dst */
 302
 303        outunpack_dec3();
 304
 305        popq %rbx;
 306        popq %rbp;
 307        popq %r12;
 308        popq %r13;
 309        popq %r14;
 310        popq %r15;
 311        ret;
 312ENDPROC(twofish_dec_blk_3way)
 313