linux/arch/x86/crypto/blowfish-x86_64-asm_64.S
<<
>>
Prefs
   1/*
   2 * Blowfish Cipher Algorithm (x86_64)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  19 * USA
  20 *
  21 */
  22
  23#include <linux/linkage.h>
  24
  25.file "blowfish-x86_64-asm.S"
  26.text
  27
  28/* structure of crypto context */
  29#define p       0
  30#define s0      ((16 + 2) * 4)
  31#define s1      ((16 + 2 + (1 * 256)) * 4)
  32#define s2      ((16 + 2 + (2 * 256)) * 4)
  33#define s3      ((16 + 2 + (3 * 256)) * 4)
  34
  35/* register macros */
  36#define CTX %rdi
  37#define RIO %rsi
  38
  39#define RX0 %rax
  40#define RX1 %rbx
  41#define RX2 %rcx
  42#define RX3 %rdx
  43
  44#define RX0d %eax
  45#define RX1d %ebx
  46#define RX2d %ecx
  47#define RX3d %edx
  48
  49#define RX0bl %al
  50#define RX1bl %bl
  51#define RX2bl %cl
  52#define RX3bl %dl
  53
  54#define RX0bh %ah
  55#define RX1bh %bh
  56#define RX2bh %ch
  57#define RX3bh %dh
  58
  59#define RT0 %rbp
  60#define RT1 %rsi
  61#define RT2 %r8
  62#define RT3 %r9
  63
  64#define RT0d %ebp
  65#define RT1d %esi
  66#define RT2d %r8d
  67#define RT3d %r9d
  68
  69#define RKEY %r10
  70
  71/***********************************************************************
  72 * 1-way blowfish
  73 ***********************************************************************/
  74#define F() \
  75        rorq $16,               RX0; \
  76        movzbl RX0bh,           RT0d; \
  77        movzbl RX0bl,           RT1d; \
  78        rolq $16,               RX0; \
  79        movl s0(CTX,RT0,4),     RT0d; \
  80        addl s1(CTX,RT1,4),     RT0d; \
  81        movzbl RX0bh,           RT1d; \
  82        movzbl RX0bl,           RT2d; \
  83        rolq $32,               RX0; \
  84        xorl s2(CTX,RT1,4),     RT0d; \
  85        addl s3(CTX,RT2,4),     RT0d; \
  86        xorq RT0,               RX0;
  87
  88#define add_roundkey_enc(n) \
  89        xorq p+4*(n)(CTX),      RX0;
  90
  91#define round_enc(n) \
  92        add_roundkey_enc(n); \
  93        \
  94        F(); \
  95        F();
  96
  97#define add_roundkey_dec(n) \
  98        movq p+4*(n-1)(CTX),    RT0; \
  99        rorq $32,               RT0; \
 100        xorq RT0,               RX0;
 101
 102#define round_dec(n) \
 103        add_roundkey_dec(n); \
 104        \
 105        F(); \
 106        F(); \
 107
 108#define read_block() \
 109        movq (RIO),             RX0; \
 110        rorq $32,               RX0; \
 111        bswapq                  RX0;
 112
 113#define write_block() \
 114        bswapq                  RX0; \
 115        movq RX0,               (RIO);
 116
 117#define xor_block() \
 118        bswapq                  RX0; \
 119        xorq RX0,               (RIO);
 120
 121ENTRY(__blowfish_enc_blk)
 122        /* input:
 123         *      %rdi: ctx, CTX
 124         *      %rsi: dst
 125         *      %rdx: src
 126         *      %rcx: bool, if true: xor output
 127         */
 128        movq %rbp, %r11;
 129
 130        movq %rsi, %r10;
 131        movq %rdx, RIO;
 132
 133        read_block();
 134
 135        round_enc(0);
 136        round_enc(2);
 137        round_enc(4);
 138        round_enc(6);
 139        round_enc(8);
 140        round_enc(10);
 141        round_enc(12);
 142        round_enc(14);
 143        add_roundkey_enc(16);
 144
 145        movq %r11, %rbp;
 146
 147        movq %r10, RIO;
 148        test %cl, %cl;
 149        jnz .L__enc_xor;
 150
 151        write_block();
 152        ret;
 153.L__enc_xor:
 154        xor_block();
 155        ret;
 156ENDPROC(__blowfish_enc_blk)
 157
 158ENTRY(blowfish_dec_blk)
 159        /* input:
 160         *      %rdi: ctx, CTX
 161         *      %rsi: dst
 162         *      %rdx: src
 163         */
 164        movq %rbp, %r11;
 165
 166        movq %rsi, %r10;
 167        movq %rdx, RIO;
 168
 169        read_block();
 170
 171        round_dec(17);
 172        round_dec(15);
 173        round_dec(13);
 174        round_dec(11);
 175        round_dec(9);
 176        round_dec(7);
 177        round_dec(5);
 178        round_dec(3);
 179        add_roundkey_dec(1);
 180
 181        movq %r10, RIO;
 182        write_block();
 183
 184        movq %r11, %rbp;
 185
 186        ret;
 187ENDPROC(blowfish_dec_blk)
 188
 189/**********************************************************************
 190  4-way blowfish, four blocks parallel
 191 **********************************************************************/
 192
 193/* F() for 4-way. Slower when used alone/1-way, but faster when used
 194 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 195 */
 196#define F4(x) \
 197        movzbl x ## bh,         RT1d; \
 198        movzbl x ## bl,         RT3d; \
 199        rorq $16,               x; \
 200        movzbl x ## bh,         RT0d; \
 201        movzbl x ## bl,         RT2d; \
 202        rorq $16,               x; \
 203        movl s0(CTX,RT0,4),     RT0d; \
 204        addl s1(CTX,RT2,4),     RT0d; \
 205        xorl s2(CTX,RT1,4),     RT0d; \
 206        addl s3(CTX,RT3,4),     RT0d; \
 207        xorq RT0,               x;
 208
 209#define add_preloaded_roundkey4() \
 210        xorq RKEY,              RX0; \
 211        xorq RKEY,              RX1; \
 212        xorq RKEY,              RX2; \
 213        xorq RKEY,              RX3;
 214
 215#define preload_roundkey_enc(n) \
 216        movq p+4*(n)(CTX),      RKEY;
 217
 218#define add_roundkey_enc4(n) \
 219        add_preloaded_roundkey4(); \
 220        preload_roundkey_enc(n + 2);
 221
 222#define round_enc4(n) \
 223        add_roundkey_enc4(n); \
 224        \
 225        F4(RX0); \
 226        F4(RX1); \
 227        F4(RX2); \
 228        F4(RX3); \
 229        \
 230        F4(RX0); \
 231        F4(RX1); \
 232        F4(RX2); \
 233        F4(RX3);
 234
 235#define preload_roundkey_dec(n) \
 236        movq p+4*((n)-1)(CTX),  RKEY; \
 237        rorq $32,               RKEY;
 238
 239#define add_roundkey_dec4(n) \
 240        add_preloaded_roundkey4(); \
 241        preload_roundkey_dec(n - 2);
 242
 243#define round_dec4(n) \
 244        add_roundkey_dec4(n); \
 245        \
 246        F4(RX0); \
 247        F4(RX1); \
 248        F4(RX2); \
 249        F4(RX3); \
 250        \
 251        F4(RX0); \
 252        F4(RX1); \
 253        F4(RX2); \
 254        F4(RX3);
 255
 256#define read_block4() \
 257        movq (RIO),             RX0; \
 258        rorq $32,               RX0; \
 259        bswapq                  RX0; \
 260        \
 261        movq 8(RIO),            RX1; \
 262        rorq $32,               RX1; \
 263        bswapq                  RX1; \
 264        \
 265        movq 16(RIO),           RX2; \
 266        rorq $32,               RX2; \
 267        bswapq                  RX2; \
 268        \
 269        movq 24(RIO),           RX3; \
 270        rorq $32,               RX3; \
 271        bswapq                  RX3;
 272
 273#define write_block4() \
 274        bswapq                  RX0; \
 275        movq RX0,               (RIO); \
 276        \
 277        bswapq                  RX1; \
 278        movq RX1,               8(RIO); \
 279        \
 280        bswapq                  RX2; \
 281        movq RX2,               16(RIO); \
 282        \
 283        bswapq                  RX3; \
 284        movq RX3,               24(RIO);
 285
 286#define xor_block4() \
 287        bswapq                  RX0; \
 288        xorq RX0,               (RIO); \
 289        \
 290        bswapq                  RX1; \
 291        xorq RX1,               8(RIO); \
 292        \
 293        bswapq                  RX2; \
 294        xorq RX2,               16(RIO); \
 295        \
 296        bswapq                  RX3; \
 297        xorq RX3,               24(RIO);
 298
 299ENTRY(__blowfish_enc_blk_4way)
 300        /* input:
 301         *      %rdi: ctx, CTX
 302         *      %rsi: dst
 303         *      %rdx: src
 304         *      %rcx: bool, if true: xor output
 305         */
 306        pushq %rbp;
 307        pushq %rbx;
 308        pushq %rcx;
 309
 310        preload_roundkey_enc(0);
 311
 312        movq %rsi, %r11;
 313        movq %rdx, RIO;
 314
 315        read_block4();
 316
 317        round_enc4(0);
 318        round_enc4(2);
 319        round_enc4(4);
 320        round_enc4(6);
 321        round_enc4(8);
 322        round_enc4(10);
 323        round_enc4(12);
 324        round_enc4(14);
 325        add_preloaded_roundkey4();
 326
 327        popq %rbp;
 328        movq %r11, RIO;
 329
 330        test %bpl, %bpl;
 331        jnz .L__enc_xor4;
 332
 333        write_block4();
 334
 335        popq %rbx;
 336        popq %rbp;
 337        ret;
 338
 339.L__enc_xor4:
 340        xor_block4();
 341
 342        popq %rbx;
 343        popq %rbp;
 344        ret;
 345ENDPROC(__blowfish_enc_blk_4way)
 346
 347ENTRY(blowfish_dec_blk_4way)
 348        /* input:
 349         *      %rdi: ctx, CTX
 350         *      %rsi: dst
 351         *      %rdx: src
 352         */
 353        pushq %rbp;
 354        pushq %rbx;
 355        preload_roundkey_dec(17);
 356
 357        movq %rsi, %r11;
 358        movq %rdx, RIO;
 359
 360        read_block4();
 361
 362        round_dec4(17);
 363        round_dec4(15);
 364        round_dec4(13);
 365        round_dec4(11);
 366        round_dec4(9);
 367        round_dec4(7);
 368        round_dec4(5);
 369        round_dec4(3);
 370        add_preloaded_roundkey4();
 371
 372        movq %r11, RIO;
 373        write_block4();
 374
 375        popq %rbx;
 376        popq %rbp;
 377
 378        ret;
 379ENDPROC(blowfish_dec_blk_4way)
 380