linux/arch/x86/crypto/blowfish-x86_64-asm_64.S
<<
>>
Prefs
   1/*
   2 * Blowfish Cipher Algorithm (x86_64)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  19 * USA
  20 *
  21 */
  22
  23#include <linux/linkage.h>
  24
  25.file "blowfish-x86_64-asm.S"         /* NOTE(review): this name lacks the _64 suffix shown in the listing path -- confirm which is intended */
  26.text                                 /* the code below is emitted into the text section */
  27
  28/* structure of crypto context
     *
     * Byte offsets, relative to CTX, of the key material used below:
     *   p  - the 16 + 2 round-key words (4 bytes each), at the very start
     *   sN - lookup table N, 256 words of 4 bytes each; the four tables
     *        follow the round keys back to back
     */
  29#define p       0
  30#define s0      (4 * (16 + 2))
  31#define s1      (4 * (16 + 2) + 1 * 4 * 256)
  32#define s2      (4 * (16 + 2) + 2 * 4 * 256)
  33#define s3      (4 * (16 + 2) + 3 * 4 * 256)
  34
  35/* register macros
     *
     * CTX  - base register for every p / s0..s3 access.
     * RIO  - pointer dereferenced by the block load/store macros.  It is the
     *        same register as RT1 (%rsi), so every F()/F4() overwrites it;
     *        the entry points keep the destination pointer in a spare
     *        register and reload RIO before storing.
     * RXn  - the 64-bit block(s) being processed.  RXnd, RXnbl and RXnbh
     *        name the low dword, the low byte and the second byte of the
     *        same register; a/b/c/d are used because they have an
     *        addressable second byte (%ah..%dh).
     * RTn  - scratch for table indices and the F-function result.
     * RKEY - round-key pair preloaded by the 4-way macros.
     */
  36#define CTX %r12
  37#define RIO %rsi
  38
  39#define RX0 %rax
  40#define RX1 %rbx
  41#define RX2 %rcx
  42#define RX3 %rdx
  43
  44#define RX0d %eax
  45#define RX1d %ebx
  46#define RX2d %ecx
  47#define RX3d %edx
  48
  49#define RX0bl %al
  50#define RX1bl %bl
  51#define RX2bl %cl
  52#define RX3bl %dl
  53
  54#define RX0bh %ah
  55#define RX1bh %bh
  56#define RX2bh %ch
  57#define RX3bh %dh
  58
  59#define RT0 %rdi
  60#define RT1 %rsi                        /* same register as RIO */
  61#define RT2 %r8
  62#define RT3 %r9
  63
  64#define RT0d %edi
  65#define RT1d %esi
  66#define RT2d %r8d
  67#define RT3d %r9d
  68
  69#define RKEY %r10
  70
  71/***********************************************************************
  72 * 1-way blowfish
  73 ***********************************************************************/
  74#define F() /* one step on RX0: table-mix the low dword, swap the dwords, XOR the result into the new low dword; overwrites RT0-RT2 (RT1 is RIO) and flags */ \
  75        rorq $16,               RX0;  /* bring bits 16..31 down so bh:bl can reach them */ \
  76        movzbl RX0bh,           RT0d; /* RT0 = byte 3 (bits 24..31) */ \
  77        movzbl RX0bl,           RT1d; /* RT1 = byte 2 (bits 16..23) */ \
  78        rolq $16,               RX0;  /* undo the rotate */ \
  79        movl s0(CTX,RT0,4),     RT0d; /* RT0  = s0[byte 3] */ \
  80        addl s1(CTX,RT1,4),     RT0d; /* RT0 += s1[byte 2] */ \
  81        movzbl RX0bh,           RT1d; /* RT1 = byte 1 (bits 8..15) */ \
  82        movzbl RX0bl,           RT2d; /* RT2 = byte 0 (bits 0..7) */ \
  83        rolq $32,               RX0;  /* swap dwords: the old high dword is now low */ \
  84        xorl s2(CTX,RT1,4),     RT0d; /* RT0 ^= s2[byte 1] */ \
  85        addl s3(CTX,RT2,4),     RT0d; /* RT0 += s3[byte 0] */ \
  86        xorq RT0,               RX0;  /* upper half of RT0 is zero, so only the low dword changes */
  87
  88#define add_roundkey_enc(n) /* XOR two round-key words into RX0 with one 64-bit access: low dword ^= word n, high dword ^= word n+1 */ \
  89        xorq p+4*(n)(CTX),      RX0;
  90
  91#define round_enc(n) /* two encryption rounds: key words n and n+1 are applied at once, then two F() steps */ \
  92        add_roundkey_enc(n); \
  93        \
  94        F(); \
  95        F();
  96
  97#define add_roundkey_dec(n) /* XOR key words n and n-1 into RX0 in swapped order: low dword ^= word n, high dword ^= word n-1; overwrites RT0. The argument is parenthesised, as in preload_roundkey_dec, so expression arguments expand safely. */ \
  98        movq p+4*((n)-1)(CTX),  RT0; /* low dword = word n-1, high dword = word n */ \
  99        rorq $32,               RT0; /* swap the pair */ \
 100        xorq RT0,               RX0;
 101
 102#define round_dec(n) /* two decryption rounds: key words n and n-1 are applied at once, then two F() steps */ \
 103        add_roundkey_dec(n); \
 104        \
 105        F(); \
 106        F();  /* last line of the macro: no continuation here, so the following line is never spliced in */
 107
 108#define read_block() /* load the 8 bytes at (RIO) into RX0: low dword = first 4 bytes read big-endian, high dword = last 4 bytes read big-endian */ \
 109        movq (RIO),             RX0; \
 110        rorq $32,               RX0; /* pre-swap the dwords; bswapq below reverses all 8 bytes and swaps them back */ \
 111        bswapq                  RX0;
 112
 113#define write_block() /* store RX0 to the 8 bytes at (RIO): high dword first, then low dword, each big-endian; RX0 is left byte-reversed */ \
 114        bswapq                  RX0; \
 115        movq RX0,               (RIO);
 116
 117#define xor_block() /* same byte order as write_block(), but XORs into the 8 bytes already at (RIO) instead of overwriting them */ \
 118        bswapq                  RX0; \
 119        xorq RX0,               (RIO);
 120
 121ENTRY(__blowfish_enc_blk)
 122        /* input:
 123         *      %rdi: ctx
 124         *      %rsi: dst
 125         *      %rdx: src
 126         *      %rcx: bool, if true: xor output
             *
             * Encrypts the single 8-byte block at src.  The result is stored
             * to dst, or XORed into the 8 bytes already at dst when the low
             * byte of %rcx is non-zero.
             *
             * Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and flags.  %r12 is
             * used as CTX and restored before returning.  The stack is not
             * touched.
 127         */
 128        movq %r12, %r11;                /* park callee-saved %r12 in a scratch register */
 129
 130        movq %rdi, CTX;
 131        movq %rsi, %r10;                /* keep dst: RIO (%rsi) is also RT1 and is overwritten by F() */
 132        movq %rdx, RIO;                 /* read_block() loads through RIO, so point it at src */
 133
 134        read_block();
 135
 136        round_enc(0);                   /* sixteen rounds, two per macro, key words 0..15 */
 137        round_enc(2);
 138        round_enc(4);
 139        round_enc(6);
 140        round_enc(8);
 141        round_enc(10);
 142        round_enc(12);
 143        round_enc(14);
 144        add_roundkey_enc(16);           /* final key XOR with words 16 and 17 */
 145
 146        movq %r11, %r12;                /* CTX is no longer needed */
 147
 148        movq %r10, RIO;                 /* now point RIO at dst */
 149        test %cl, %cl;                  /* %rcx is not used by the 1-way macros, so the flag is still intact */
 150        jnz .L__enc_xor;
 151
 152        write_block();
 153        ret;
 154.L__enc_xor:
 155        xor_block();
 156        ret;
 157ENDPROC(__blowfish_enc_blk)
 158
 159ENTRY(blowfish_dec_blk)
 160        /* input:
 161         *      %rdi: ctx
 162         *      %rsi: dst
 163         *      %rdx: src
             *
             * Decrypts the single 8-byte block at src and stores the result
             * to dst.  The round keys are applied in descending order.
             *
             * Overwrites %rax, %rdi, %rsi, %r8, %r10, %r11 and flags.  %r12 is
             * used as CTX and restored before returning.  The stack is not
             * touched.
 164         */
 165        movq %r12, %r11;                /* park callee-saved %r12 in a scratch register */
 166
 167        movq %rdi, CTX;
 168        movq %rsi, %r10;                /* keep dst: RIO (%rsi) is also RT1 and is overwritten by F() */
 169        movq %rdx, RIO;                 /* read_block() loads through RIO, so point it at src */
 170
 171        read_block();
 172
 173        round_dec(17);                  /* sixteen rounds, two per macro, key words 17..2 */
 174        round_dec(15);
 175        round_dec(13);
 176        round_dec(11);
 177        round_dec(9);
 178        round_dec(7);
 179        round_dec(5);
 180        round_dec(3);
 181        add_roundkey_dec(1);            /* final key XOR with words 1 and 0 */
 182
 183        movq %r10, RIO;                 /* now point RIO at dst */
 184        write_block();
 185
 186        movq %r11, %r12;
 187
 188        ret;
 189ENDPROC(blowfish_dec_blk)
 190
 191/**********************************************************************
 192  4-way blowfish, four blocks parallel
 193 **********************************************************************/
 194
 195/* F() for 4-way. Slower when used alone/1-way, but faster when used
 196 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
     *
     * x must be one of the names RX0..RX3: the byte views are formed by
     * pasting "bh"/"bl" onto the argument, which then expands through the
     * RXnbh/RXnbl macros.  Same result as F() applied to x: the low dword
     * is table-mixed, the dwords are swapped, and the result is XORed into
     * the new low dword.  Overwrites RT0-RT3 (RT1 is RIO) and flags.
 197 */
 198#define F4(x) \
 199        movzbl x ## bh,         RT1d; /* RT1 = byte 1 (bits 8..15) */ \
 200        movzbl x ## bl,         RT3d; /* RT3 = byte 0 (bits 0..7) */ \
 201        rorq $16,               x; \
 202        movzbl x ## bh,         RT0d; /* RT0 = byte 3 (bits 24..31) */ \
 203        movzbl x ## bl,         RT2d; /* RT2 = byte 2 (bits 16..23) */ \
 204        rorq $16,               x;    /* 32 bits rotated in total: dwords are now swapped */ \
 205        movl s0(CTX,RT0,4),     RT0d; /* RT0  = s0[byte 3] */ \
 206        addl s1(CTX,RT2,4),     RT0d; /* RT0 += s1[byte 2] */ \
 207        xorl s2(CTX,RT1,4),     RT0d; /* RT0 ^= s2[byte 1] */ \
 208        addl s3(CTX,RT3,4),     RT0d; /* RT0 += s3[byte 0] */ \
 209        xorq RT0,               x;
 210
 211#define add_preloaded_roundkey4() /* XOR the key pair currently held in RKEY into all four blocks */ \
 212        xorq RKEY,              RX0; \
 213        xorq RKEY,              RX1; \
 214        xorq RKEY,              RX2; \
 215        xorq RKEY,              RX3;
 216
 217#define preload_roundkey_enc(n) /* RKEY = 8 bytes at key word n: low dword = word n, high dword = word n+1 */ \
 218        movq p+4*(n)(CTX),      RKEY;
 219
 220#define add_roundkey_enc4(n) /* apply the key pair already in RKEY, then fetch the pair for the next double round; n only selects the next load */ \
 221        add_preloaded_roundkey4(); \
 222        preload_roundkey_enc(n + 2);
 223
 224#define round_enc4(n) /* two encryption rounds on four blocks; RKEY must already hold the pair for words n, n+1 */ \
 225        add_roundkey_enc4(n); \
 226        \
 227        F4(RX0); /* first round, one block after another */ \
 228        F4(RX1); \
 229        F4(RX2); \
 230        F4(RX3); \
 231        \
 232        F4(RX0); /* second round */ \
 233        F4(RX1); \
 234        F4(RX2); \
 235        F4(RX3);
 236
 237#define preload_roundkey_dec(n) /* RKEY = key words n and n-1 in swapped order: low dword = word n, high dword = word n-1 */ \
 238        movq p+4*((n)-1)(CTX),  RKEY; \
 239        rorq $32,               RKEY;
 240
 241#define add_roundkey_dec4(n) /* apply the key pair already in RKEY, then fetch the pair for the next double round; n only selects the next load */ \
 242        add_preloaded_roundkey4(); \
 243        preload_roundkey_dec(n - 2);
 244
 245#define round_dec4(n) /* two decryption rounds on four blocks; RKEY must already hold the swapped pair for words n, n-1 */ \
 246        add_roundkey_dec4(n); \
 247        \
 248        F4(RX0); /* first round, one block after another */ \
 249        F4(RX1); \
 250        F4(RX2); \
 251        F4(RX3); \
 252        \
 253        F4(RX0); /* second round */ \
 254        F4(RX1); \
 255        F4(RX2); \
 256        F4(RX3);
 257
 258#define read_block4() /* load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each converted as in read_block() */ \
 259        movq (RIO),             RX0; \
 260        rorq $32,               RX0; \
 261        bswapq                  RX0; \
 262        \
 263        movq 8(RIO),            RX1; \
 264        rorq $32,               RX1; \
 265        bswapq                  RX1; \
 266        \
 267        movq 16(RIO),           RX2; \
 268        rorq $32,               RX2; \
 269        bswapq                  RX2; \
 270        \
 271        movq 24(RIO),           RX3; \
 272        rorq $32,               RX3; \
 273        bswapq                  RX3;
 274
 275#define write_block4() /* byte-reverse RX0..RX3 and store them to four consecutive 8-byte slots at (RIO) */ \
 276        bswapq                  RX0; \
 277        movq RX0,               (RIO); \
 278        \
 279        bswapq                  RX1; \
 280        movq RX1,               8(RIO); \
 281        \
 282        bswapq                  RX2; \
 283        movq RX2,               16(RIO); \
 284        \
 285        bswapq                  RX3; \
 286        movq RX3,               24(RIO);
 287
 288#define xor_block4() /* byte-reverse RX0..RX3 and XOR them into the four consecutive 8-byte slots at (RIO) */ \
 289        bswapq                  RX0; \
 290        xorq RX0,               (RIO); \
 291        \
 292        bswapq                  RX1; \
 293        xorq RX1,               8(RIO); \
 294        \
 295        bswapq                  RX2; \
 296        xorq RX2,               16(RIO); \
 297        \
 298        bswapq                  RX3; \
 299        xorq RX3,               24(RIO);
 300
 301ENTRY(__blowfish_enc_blk_4way)
 302        /* input:
 303         *      %rdi: ctx
 304         *      %rsi: dst
 305         *      %rdx: src
 306         *      %rcx: bool, if true: xor output
             *
             * Encrypts the four consecutive 8-byte blocks at src.  The results
             * are stored to dst, or XORed into the 32 bytes already at dst
             * when the low byte of %rcx is non-zero.
             *
             * Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and flags.
             * %rbx and %r12 are saved on the stack and restored.
 307         */
 308        pushq %r12;                     /* callee-saved, used as CTX */
 309        pushq %rbx;                     /* callee-saved, used as RX1 */
 310        pushq %rcx;                     /* xor flag: %rcx itself is needed as RX2 */
 311
 312        movq %rdi, CTX
 313        movq %rsi, %r11;                /* keep dst: RIO (%rsi) is also RT1 and is overwritten by F4() */
 314        movq %rdx, RIO;                 /* read_block4() loads through RIO, so point it at src */
 315
 316        preload_roundkey_enc(0);        /* round_enc4() expects its key pair in RKEY */
 317
 318        read_block4();
 319
 320        round_enc4(0);                  /* sixteen rounds, two per macro, key words 0..15 */
 321        round_enc4(2);
 322        round_enc4(4);
 323        round_enc4(6);
 324        round_enc4(8);
 325        round_enc4(10);
 326        round_enc4(12);
 327        round_enc4(14);
 328        add_preloaded_roundkey4();      /* final key XOR: words 16 and 17, loaded by round_enc4(14) */
 329
 330        popq %r12;                      /* the saved xor flag; CTX is no longer needed */
 331        movq %r11, RIO;                 /* now point RIO at dst */
 332
 333        test %r12b, %r12b;
 334        jnz .L__enc_xor4;
 335
 336        write_block4();
 337
 338        popq %rbx;
 339        popq %r12;
 340        ret;
 341
 342.L__enc_xor4:
 343        xor_block4();
 344
 345        popq %rbx;
 346        popq %r12;
 347        ret;
 348ENDPROC(__blowfish_enc_blk_4way)
 349
 350ENTRY(blowfish_dec_blk_4way)
 351        /* input:
 352         *      %rdi: ctx
 353         *      %rsi: dst
 354         *      %rdx: src
             *
             * Decrypts the four consecutive 8-byte blocks at src and stores
             * the results to dst.  The round keys are applied in descending
             * order.
             *
             * Overwrites %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and flags.
             * %rbx and %r12 are saved on the stack and restored.
 355         */
 356        pushq %r12;                     /* callee-saved, used as CTX */
 357        pushq %rbx;                     /* callee-saved, used as RX1 */
 358
 359        movq %rdi, CTX;
 360        movq %rsi, %r11                 /* keep dst: RIO (%rsi) is also RT1 and is overwritten by F4() */
 361        movq %rdx, RIO;                 /* read_block4() loads through RIO, so point it at src */
 362
 363        preload_roundkey_dec(17);       /* round_dec4() expects its key pair in RKEY */
 364        read_block4();
 365
 366        round_dec4(17);                 /* sixteen rounds, two per macro, key words 17..2 */
 367        round_dec4(15);
 368        round_dec4(13);
 369        round_dec4(11);
 370        round_dec4(9);
 371        round_dec4(7);
 372        round_dec4(5);
 373        round_dec4(3);
 374        add_preloaded_roundkey4();      /* final key XOR: words 1 and 0, loaded by round_dec4(3) */
 375
 376        movq %r11, RIO;                 /* now point RIO at dst */
 377        write_block4();
 378
 379        popq %rbx;
 380        popq %r12;
 381
 382        ret;
 383ENDPROC(blowfish_dec_blk_4way)
 384