linux/arch/x86/crypto/blowfish-x86_64-asm_64.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Blowfish Cipher Algorithm (x86_64)
   4 *
   5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6 */
   7
   8#include <linux/linkage.h>
   9
  10.file "blowfish-x86_64-asm.S"
  11.text
  12
  13/* structure of crypto context */
  14#define p       0
  15#define s0      ((16 + 2) * 4)
  16#define s1      ((16 + 2 + (1 * 256)) * 4)
  17#define s2      ((16 + 2 + (2 * 256)) * 4)
  18#define s3      ((16 + 2 + (3 * 256)) * 4)
  19
  20/* register macros */
  21#define CTX %r12
  22#define RIO %rsi
  23
  24#define RX0 %rax
  25#define RX1 %rbx
  26#define RX2 %rcx
  27#define RX3 %rdx
  28
  29#define RX0d %eax
  30#define RX1d %ebx
  31#define RX2d %ecx
  32#define RX3d %edx
  33
  34#define RX0bl %al
  35#define RX1bl %bl
  36#define RX2bl %cl
  37#define RX3bl %dl
  38
  39#define RX0bh %ah
  40#define RX1bh %bh
  41#define RX2bh %ch
  42#define RX3bh %dh
  43
  44#define RT0 %rdi
  45#define RT1 %rsi
  46#define RT2 %r8
  47#define RT3 %r9
  48
  49#define RT0d %edi
  50#define RT1d %esi
  51#define RT2d %r8d
  52#define RT3d %r9d
  53
  54#define RKEY %r10
  55
  56/***********************************************************************
  57 * 1-way blowfish
  58 ***********************************************************************/
  59#define F() \
  60        rorq $16,               RX0; \
  61        movzbl RX0bh,           RT0d; \
  62        movzbl RX0bl,           RT1d; \
  63        rolq $16,               RX0; \
  64        movl s0(CTX,RT0,4),     RT0d; \
  65        addl s1(CTX,RT1,4),     RT0d; \
  66        movzbl RX0bh,           RT1d; \
  67        movzbl RX0bl,           RT2d; \
  68        rolq $32,               RX0; \
  69        xorl s2(CTX,RT1,4),     RT0d; \
  70        addl s3(CTX,RT2,4),     RT0d; \
  71        xorq RT0,               RX0;
  72
  73#define add_roundkey_enc(n) \
  74        xorq p+4*(n)(CTX),      RX0;
  75
  76#define round_enc(n) \
  77        add_roundkey_enc(n); \
  78        \
  79        F(); \
  80        F();
  81
  82#define add_roundkey_dec(n) \
  83        movq p+4*(n-1)(CTX),    RT0; \
  84        rorq $32,               RT0; \
  85        xorq RT0,               RX0;
  86
  87#define round_dec(n) \
  88        add_roundkey_dec(n); \
  89        \
  90        F(); \
  91        F(); \
  92
  93#define read_block() \
  94        movq (RIO),             RX0; \
  95        rorq $32,               RX0; \
  96        bswapq                  RX0;
  97
  98#define write_block() \
  99        bswapq                  RX0; \
 100        movq RX0,               (RIO);
 101
 102#define xor_block() \
 103        bswapq                  RX0; \
 104        xorq RX0,               (RIO);
 105
 106SYM_FUNC_START(__blowfish_enc_blk)
 107        /* input:
 108         *      %rdi: ctx
 109         *      %rsi: dst
 110         *      %rdx: src
 111         *      %rcx: bool, if true: xor output
 112         */
 113        movq %r12, %r11;
 114
 115        movq %rdi, CTX;
 116        movq %rsi, %r10;
 117        movq %rdx, RIO;
 118
 119        read_block();
 120
 121        round_enc(0);
 122        round_enc(2);
 123        round_enc(4);
 124        round_enc(6);
 125        round_enc(8);
 126        round_enc(10);
 127        round_enc(12);
 128        round_enc(14);
 129        add_roundkey_enc(16);
 130
 131        movq %r11, %r12;
 132
 133        movq %r10, RIO;
 134        test %cl, %cl;
 135        jnz .L__enc_xor;
 136
 137        write_block();
 138        ret;
 139.L__enc_xor:
 140        xor_block();
 141        ret;
 142SYM_FUNC_END(__blowfish_enc_blk)
 143
 144SYM_FUNC_START(blowfish_dec_blk)
 145        /* input:
 146         *      %rdi: ctx
 147         *      %rsi: dst
 148         *      %rdx: src
 149         */
 150        movq %r12, %r11;
 151
 152        movq %rdi, CTX;
 153        movq %rsi, %r10;
 154        movq %rdx, RIO;
 155
 156        read_block();
 157
 158        round_dec(17);
 159        round_dec(15);
 160        round_dec(13);
 161        round_dec(11);
 162        round_dec(9);
 163        round_dec(7);
 164        round_dec(5);
 165        round_dec(3);
 166        add_roundkey_dec(1);
 167
 168        movq %r10, RIO;
 169        write_block();
 170
 171        movq %r11, %r12;
 172
 173        ret;
 174SYM_FUNC_END(blowfish_dec_blk)
 175
/**********************************************************************
 * 4-way blowfish, four blocks in parallel
 **********************************************************************/
 179
 180/* F() for 4-way. Slower when used alone/1-way, but faster when used
 181 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 182 */
 183#define F4(x) \
 184        movzbl x ## bh,         RT1d; \
 185        movzbl x ## bl,         RT3d; \
 186        rorq $16,               x; \
 187        movzbl x ## bh,         RT0d; \
 188        movzbl x ## bl,         RT2d; \
 189        rorq $16,               x; \
 190        movl s0(CTX,RT0,4),     RT0d; \
 191        addl s1(CTX,RT2,4),     RT0d; \
 192        xorl s2(CTX,RT1,4),     RT0d; \
 193        addl s3(CTX,RT3,4),     RT0d; \
 194        xorq RT0,               x;
 195
 196#define add_preloaded_roundkey4() \
 197        xorq RKEY,              RX0; \
 198        xorq RKEY,              RX1; \
 199        xorq RKEY,              RX2; \
 200        xorq RKEY,              RX3;
 201
 202#define preload_roundkey_enc(n) \
 203        movq p+4*(n)(CTX),      RKEY;
 204
 205#define add_roundkey_enc4(n) \
 206        add_preloaded_roundkey4(); \
 207        preload_roundkey_enc(n + 2);
 208
 209#define round_enc4(n) \
 210        add_roundkey_enc4(n); \
 211        \
 212        F4(RX0); \
 213        F4(RX1); \
 214        F4(RX2); \
 215        F4(RX3); \
 216        \
 217        F4(RX0); \
 218        F4(RX1); \
 219        F4(RX2); \
 220        F4(RX3);
 221
 222#define preload_roundkey_dec(n) \
 223        movq p+4*((n)-1)(CTX),  RKEY; \
 224        rorq $32,               RKEY;
 225
 226#define add_roundkey_dec4(n) \
 227        add_preloaded_roundkey4(); \
 228        preload_roundkey_dec(n - 2);
 229
 230#define round_dec4(n) \
 231        add_roundkey_dec4(n); \
 232        \
 233        F4(RX0); \
 234        F4(RX1); \
 235        F4(RX2); \
 236        F4(RX3); \
 237        \
 238        F4(RX0); \
 239        F4(RX1); \
 240        F4(RX2); \
 241        F4(RX3);
 242
 243#define read_block4() \
 244        movq (RIO),             RX0; \
 245        rorq $32,               RX0; \
 246        bswapq                  RX0; \
 247        \
 248        movq 8(RIO),            RX1; \
 249        rorq $32,               RX1; \
 250        bswapq                  RX1; \
 251        \
 252        movq 16(RIO),           RX2; \
 253        rorq $32,               RX2; \
 254        bswapq                  RX2; \
 255        \
 256        movq 24(RIO),           RX3; \
 257        rorq $32,               RX3; \
 258        bswapq                  RX3;
 259
 260#define write_block4() \
 261        bswapq                  RX0; \
 262        movq RX0,               (RIO); \
 263        \
 264        bswapq                  RX1; \
 265        movq RX1,               8(RIO); \
 266        \
 267        bswapq                  RX2; \
 268        movq RX2,               16(RIO); \
 269        \
 270        bswapq                  RX3; \
 271        movq RX3,               24(RIO);
 272
 273#define xor_block4() \
 274        bswapq                  RX0; \
 275        xorq RX0,               (RIO); \
 276        \
 277        bswapq                  RX1; \
 278        xorq RX1,               8(RIO); \
 279        \
 280        bswapq                  RX2; \
 281        xorq RX2,               16(RIO); \
 282        \
 283        bswapq                  RX3; \
 284        xorq RX3,               24(RIO);
 285
 286SYM_FUNC_START(__blowfish_enc_blk_4way)
 287        /* input:
 288         *      %rdi: ctx
 289         *      %rsi: dst
 290         *      %rdx: src
 291         *      %rcx: bool, if true: xor output
 292         */
 293        pushq %r12;
 294        pushq %rbx;
 295        pushq %rcx;
 296
 297        movq %rdi, CTX
 298        movq %rsi, %r11;
 299        movq %rdx, RIO;
 300
 301        preload_roundkey_enc(0);
 302
 303        read_block4();
 304
 305        round_enc4(0);
 306        round_enc4(2);
 307        round_enc4(4);
 308        round_enc4(6);
 309        round_enc4(8);
 310        round_enc4(10);
 311        round_enc4(12);
 312        round_enc4(14);
 313        add_preloaded_roundkey4();
 314
 315        popq %r12;
 316        movq %r11, RIO;
 317
 318        test %r12b, %r12b;
 319        jnz .L__enc_xor4;
 320
 321        write_block4();
 322
 323        popq %rbx;
 324        popq %r12;
 325        ret;
 326
 327.L__enc_xor4:
 328        xor_block4();
 329
 330        popq %rbx;
 331        popq %r12;
 332        ret;
 333SYM_FUNC_END(__blowfish_enc_blk_4way)
 334
 335SYM_FUNC_START(blowfish_dec_blk_4way)
 336        /* input:
 337         *      %rdi: ctx
 338         *      %rsi: dst
 339         *      %rdx: src
 340         */
 341        pushq %r12;
 342        pushq %rbx;
 343
 344        movq %rdi, CTX;
 345        movq %rsi, %r11
 346        movq %rdx, RIO;
 347
 348        preload_roundkey_dec(17);
 349        read_block4();
 350
 351        round_dec4(17);
 352        round_dec4(15);
 353        round_dec4(13);
 354        round_dec4(11);
 355        round_dec4(9);
 356        round_dec4(7);
 357        round_dec4(5);
 358        round_dec4(3);
 359        add_preloaded_roundkey4();
 360
 361        movq %r11, RIO;
 362        write_block4();
 363
 364        popq %rbx;
 365        popq %r12;
 366
 367        ret;
 368SYM_FUNC_END(blowfish_dec_blk_4way)
 369