/* linux/arch/x86/crypto/blowfish-x86_64-asm_64.S */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */
  22
  23.file "blowfish-x86_64-asm.S"
  24.text
  25
  26/* structure of crypto context */
  27#define p       0
  28#define s0      ((16 + 2) * 4)
  29#define s1      ((16 + 2 + (1 * 256)) * 4)
  30#define s2      ((16 + 2 + (2 * 256)) * 4)
  31#define s3      ((16 + 2 + (3 * 256)) * 4)
  32
  33/* register macros */
  34#define CTX %rdi
  35#define RIO %rsi
  36
  37#define RX0 %rax
  38#define RX1 %rbx
  39#define RX2 %rcx
  40#define RX3 %rdx
  41
  42#define RX0d %eax
  43#define RX1d %ebx
  44#define RX2d %ecx
  45#define RX3d %edx
  46
  47#define RX0bl %al
  48#define RX1bl %bl
  49#define RX2bl %cl
  50#define RX3bl %dl
  51
  52#define RX0bh %ah
  53#define RX1bh %bh
  54#define RX2bh %ch
  55#define RX3bh %dh
  56
  57#define RT0 %rbp
  58#define RT1 %rsi
  59#define RT2 %r8
  60#define RT3 %r9
  61
  62#define RT0d %ebp
  63#define RT1d %esi
  64#define RT2d %r8d
  65#define RT3d %r9d
  66
  67#define RKEY %r10
  68
  69/***********************************************************************
  70 * 1-way blowfish
  71 ***********************************************************************/
  72#define F() \
  73        rorq $16,               RX0; \
  74        movzbl RX0bh,           RT0d; \
  75        movzbl RX0bl,           RT1d; \
  76        rolq $16,               RX0; \
  77        movl s0(CTX,RT0,4),     RT0d; \
  78        addl s1(CTX,RT1,4),     RT0d; \
  79        movzbl RX0bh,           RT1d; \
  80        movzbl RX0bl,           RT2d; \
  81        rolq $32,               RX0; \
  82        xorl s2(CTX,RT1,4),     RT0d; \
  83        addl s3(CTX,RT2,4),     RT0d; \
  84        xorq RT0,               RX0;
  85
  86#define add_roundkey_enc(n) \
  87        xorq p+4*(n)(CTX),      RX0;
  88
  89#define round_enc(n) \
  90        add_roundkey_enc(n); \
  91        \
  92        F(); \
  93        F();
  94
  95#define add_roundkey_dec(n) \
  96        movq p+4*(n-1)(CTX),    RT0; \
  97        rorq $32,               RT0; \
  98        xorq RT0,               RX0;
  99
 100#define round_dec(n) \
 101        add_roundkey_dec(n); \
 102        \
 103        F(); \
 104        F(); \
 105
 106#define read_block() \
 107        movq (RIO),             RX0; \
 108        rorq $32,               RX0; \
 109        bswapq                  RX0;
 110
 111#define write_block() \
 112        bswapq                  RX0; \
 113        movq RX0,               (RIO);
 114
 115#define xor_block() \
 116        bswapq                  RX0; \
 117        xorq RX0,               (RIO);
 118
 119.align 8
 120.global __blowfish_enc_blk
 121.type   __blowfish_enc_blk,@function;
 122
 123__blowfish_enc_blk:
 124        /* input:
 125         *      %rdi: ctx, CTX
 126         *      %rsi: dst
 127         *      %rdx: src
 128         *      %rcx: bool, if true: xor output
 129         */
 130        movq %rbp, %r11;
 131
 132        movq %rsi, %r10;
 133        movq %rdx, RIO;
 134
 135        read_block();
 136
 137        round_enc(0);
 138        round_enc(2);
 139        round_enc(4);
 140        round_enc(6);
 141        round_enc(8);
 142        round_enc(10);
 143        round_enc(12);
 144        round_enc(14);
 145        add_roundkey_enc(16);
 146
 147        movq %r11, %rbp;
 148
 149        movq %r10, RIO;
 150        test %cl, %cl;
 151        jnz __enc_xor;
 152
 153        write_block();
 154        ret;
 155__enc_xor:
 156        xor_block();
 157        ret;
 158
 159.align 8
 160.global blowfish_dec_blk
 161.type   blowfish_dec_blk,@function;
 162
 163blowfish_dec_blk:
 164        /* input:
 165         *      %rdi: ctx, CTX
 166         *      %rsi: dst
 167         *      %rdx: src
 168         */
 169        movq %rbp, %r11;
 170
 171        movq %rsi, %r10;
 172        movq %rdx, RIO;
 173
 174        read_block();
 175
 176        round_dec(17);
 177        round_dec(15);
 178        round_dec(13);
 179        round_dec(11);
 180        round_dec(9);
 181        round_dec(7);
 182        round_dec(5);
 183        round_dec(3);
 184        add_roundkey_dec(1);
 185
 186        movq %r10, RIO;
 187        write_block();
 188
 189        movq %r11, %rbp;
 190
 191        ret;
 192
 193/**********************************************************************
 194  4-way blowfish, four blocks parallel
 195 **********************************************************************/
 196
 197/* F() for 4-way. Slower when used alone/1-way, but faster when used
 198 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 199 */
 200#define F4(x) \
 201        movzbl x ## bh,         RT1d; \
 202        movzbl x ## bl,         RT3d; \
 203        rorq $16,               x; \
 204        movzbl x ## bh,         RT0d; \
 205        movzbl x ## bl,         RT2d; \
 206        rorq $16,               x; \
 207        movl s0(CTX,RT0,4),     RT0d; \
 208        addl s1(CTX,RT2,4),     RT0d; \
 209        xorl s2(CTX,RT1,4),     RT0d; \
 210        addl s3(CTX,RT3,4),     RT0d; \
 211        xorq RT0,               x;
 212
 213#define add_preloaded_roundkey4() \
 214        xorq RKEY,              RX0; \
 215        xorq RKEY,              RX1; \
 216        xorq RKEY,              RX2; \
 217        xorq RKEY,              RX3;
 218
 219#define preload_roundkey_enc(n) \
 220        movq p+4*(n)(CTX),      RKEY;
 221
 222#define add_roundkey_enc4(n) \
 223        add_preloaded_roundkey4(); \
 224        preload_roundkey_enc(n + 2);
 225
 226#define round_enc4(n) \
 227        add_roundkey_enc4(n); \
 228        \
 229        F4(RX0); \
 230        F4(RX1); \
 231        F4(RX2); \
 232        F4(RX3); \
 233        \
 234        F4(RX0); \
 235        F4(RX1); \
 236        F4(RX2); \
 237        F4(RX3);
 238
 239#define preload_roundkey_dec(n) \
 240        movq p+4*((n)-1)(CTX),  RKEY; \
 241        rorq $32,               RKEY;
 242
 243#define add_roundkey_dec4(n) \
 244        add_preloaded_roundkey4(); \
 245        preload_roundkey_dec(n - 2);
 246
 247#define round_dec4(n) \
 248        add_roundkey_dec4(n); \
 249        \
 250        F4(RX0); \
 251        F4(RX1); \
 252        F4(RX2); \
 253        F4(RX3); \
 254        \
 255        F4(RX0); \
 256        F4(RX1); \
 257        F4(RX2); \
 258        F4(RX3);
 259
 260#define read_block4() \
 261        movq (RIO),             RX0; \
 262        rorq $32,               RX0; \
 263        bswapq                  RX0; \
 264        \
 265        movq 8(RIO),            RX1; \
 266        rorq $32,               RX1; \
 267        bswapq                  RX1; \
 268        \
 269        movq 16(RIO),           RX2; \
 270        rorq $32,               RX2; \
 271        bswapq                  RX2; \
 272        \
 273        movq 24(RIO),           RX3; \
 274        rorq $32,               RX3; \
 275        bswapq                  RX3;
 276
 277#define write_block4() \
 278        bswapq                  RX0; \
 279        movq RX0,               (RIO); \
 280        \
 281        bswapq                  RX1; \
 282        movq RX1,               8(RIO); \
 283        \
 284        bswapq                  RX2; \
 285        movq RX2,               16(RIO); \
 286        \
 287        bswapq                  RX3; \
 288        movq RX3,               24(RIO);
 289
 290#define xor_block4() \
 291        bswapq                  RX0; \
 292        xorq RX0,               (RIO); \
 293        \
 294        bswapq                  RX1; \
 295        xorq RX1,               8(RIO); \
 296        \
 297        bswapq                  RX2; \
 298        xorq RX2,               16(RIO); \
 299        \
 300        bswapq                  RX3; \
 301        xorq RX3,               24(RIO);
 302
 303.align 8
 304.global __blowfish_enc_blk_4way
 305.type   __blowfish_enc_blk_4way,@function;
 306
 307__blowfish_enc_blk_4way:
 308        /* input:
 309         *      %rdi: ctx, CTX
 310         *      %rsi: dst
 311         *      %rdx: src
 312         *      %rcx: bool, if true: xor output
 313         */
 314        pushq %rbp;
 315        pushq %rbx;
 316        pushq %rcx;
 317
 318        preload_roundkey_enc(0);
 319
 320        movq %rsi, %r11;
 321        movq %rdx, RIO;
 322
 323        read_block4();
 324
 325        round_enc4(0);
 326        round_enc4(2);
 327        round_enc4(4);
 328        round_enc4(6);
 329        round_enc4(8);
 330        round_enc4(10);
 331        round_enc4(12);
 332        round_enc4(14);
 333        add_preloaded_roundkey4();
 334
 335        popq %rbp;
 336        movq %r11, RIO;
 337
 338        test %bpl, %bpl;
 339        jnz __enc_xor4;
 340
 341        write_block4();
 342
 343        popq %rbx;
 344        popq %rbp;
 345        ret;
 346
 347__enc_xor4:
 348        xor_block4();
 349
 350        popq %rbx;
 351        popq %rbp;
 352        ret;
 353
 354.align 8
 355.global blowfish_dec_blk_4way
 356.type   blowfish_dec_blk_4way,@function;
 357
 358blowfish_dec_blk_4way:
 359        /* input:
 360         *      %rdi: ctx, CTX
 361         *      %rsi: dst
 362         *      %rdx: src
 363         */
 364        pushq %rbp;
 365        pushq %rbx;
 366        preload_roundkey_dec(17);
 367
 368        movq %rsi, %r11;
 369        movq %rdx, RIO;
 370
 371        read_block4();
 372
 373        round_dec4(17);
 374        round_dec4(15);
 375        round_dec4(13);
 376        round_dec4(11);
 377        round_dec4(9);
 378        round_dec4(7);
 379        round_dec4(5);
 380        round_dec4(3);
 381        add_preloaded_roundkey4();
 382
 383        movq %r11, RIO;
 384        write_block4();
 385
 386        popq %rbx;
 387        popq %rbp;
 388
 389        ret;
 390
 391