linux/arch/x86/crypto/camellia-x86_64-asm_64.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Camellia Cipher Algorithm (x86_64)
   4 *
   5 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6 */
   7
   8#include <linux/linkage.h>
   9
  10.file "camellia-x86_64-asm_64.S"
  11.text
  12
  /*
   * Eight lookup tables that are defined outside this file. xor2ror16()
   * below indexes each of them with a zero-extended byte scaled by 8 and
   * reads a 64-bit value, i.e. they are accessed as arrays of quadwords.
   */
  13.extern camellia_sp10011110;
  14.extern camellia_sp22000222;
  15.extern camellia_sp03303033;
  16.extern camellia_sp00444404;
  17.extern camellia_sp02220222;
  18.extern camellia_sp30333033;
  19.extern camellia_sp44044404;
  20.extern camellia_sp11101110;
  21
  /* Shorter spellings of the table symbols, used as macro arguments below. */
  22#define sp10011110 camellia_sp10011110
  23#define sp22000222 camellia_sp22000222
  24#define sp03303033 camellia_sp03303033
  25#define sp00444404 camellia_sp00444404
  26#define sp02220222 camellia_sp02220222
  27#define sp30333033 camellia_sp30333033
  28#define sp44044404 camellia_sp44044404
  29#define sp11101110 camellia_sp11101110
  30
  /*
   * Byte offsets into the context structure that CTX points to. The code
   * below reads 64-bit subkeys at key_table + 8 * index and reads the key
   * length at offset CAMELLIA_TABLE_BYTE_LEN (272).
   * NOTE(review): the C definition of struct camellia_ctx is not visible
   * from this file - these offsets have to match it, confirm when changing.
   */
  31#define CAMELLIA_TABLE_BYTE_LEN 272
  32
  33/* struct camellia_ctx: */
  34#define key_table 0
  35#define key_length CAMELLIA_TABLE_BYTE_LEN
  36
  /*
   * Register assignments used by every macro and function in this file.
   * Names ending in d, bl and bh are the 32-bit, low-byte and second-byte
   * views of the same register.
   */
  37/* register macros */
  /* CTX: context pointer. RIO: pointer the pack/unpack macros read or write. */
  38#define CTX %rdi
  39#define RIO %rsi
  40#define RIOd %esi
  41
  /*
   * Cipher state. RAB0/RCD0 are the two 64-bit halves of block 0. RAB1/RCD1
   * are the halves of a second block and are only referenced by the 2-way
   * macros (the 1-way macros always paste a 0 suffix).
   */
  42#define RAB0 %rax
  43#define RCD0 %rcx
  44#define RAB1 %rbx
  45#define RCD1 %rdx
  46
  47#define RAB0d %eax
  48#define RCD0d %ecx
  49#define RAB1d %ebx
  50#define RCD1d %edx
  51
  52#define RAB0bl %al
  53#define RCD0bl %cl
  54#define RAB1bl %bl
  55#define RCD1bl %dl
  56
  57#define RAB0bh %ah
  58#define RCD0bh %ch
  59#define RAB1bh %bh
  60#define RCD1bh %dh
  61
  /*
   * Temporaries. RT0 is the same register as RIO (%rsi), so RIO is lost as
   * soon as a round or fls macro runs - the functions reload it from RDST
   * before writing output. RT1 is %r12 - the functions copy %r12 into RR12
   * on entry and copy it back before returning.
   */
  62#define RT0 %rsi
  63#define RT1 %r12
  64#define RT2 %r8
  65
  66#define RT0d %esi
  67#define RT1d %r12d
  68#define RT2d %r8d
  69
  70#define RT2bl %r8b
  71
  /*
   * RXOR: receives the xor flag argument in the encrypt functions. The
   *       decrypt functions use it as scratch, and camellia_dec_blk_2way
   *       also keeps the incoming %rbx in it.
   * RR12: holds the incoming value of %r12 for the duration of a function.
   * RDST: holds the destination pointer argument.
   */
  72#define RXOR %r9
  73#define RR12 %r10
  74#define RDST %r11
  75
  76#define RXORd %r9d
  77#define RXORbl %r9b
  78
  /*
   * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
   *
   * Consume the low 16 bits of ab:
   *   dst ^= T0[ab & 0xff]          (64-bit entries, index scaled by 8)
   *   dst ^= T1[(ab >> 8) & 0xff]
   *   ab   = ab rotated right by 16 bits
   *
   * T0, T1     table symbols
   * tmp1, tmp2 64-bit scratch register names - a d suffix is pasted on to
   *            obtain the 32-bit view. Both are clobbered.
   * ab         full register name - bl / bh suffixes are pasted on to read
   *            its two low bytes, so it must be one of the registers that
   *            has a bh alias defined above.
   * dst        64-bit register that accumulates the XORs.
   *
   * The table bases are formed with RIP-relative leaq instead of being used
   * as absolute displacements, so the code contains no absolute references
   * to the tables and stays position independent. RIP-relative addressing
   * cannot be combined with an index register, hence the separate leaq.
   * Each scratch register is reused only after its previous value (index or
   * base) has been consumed: tmp1 = base of T0, then index into T1 - tmp2 =
   * index into T0, then base of T1.
   */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
        leaq T0(%rip),                  tmp1; \
        movzbl ab ## bl,                tmp2 ## d; \
        xorq (tmp1, tmp2, 8),           dst; \
        leaq T1(%rip),                  tmp2; \
        movzbl ab ## bh,                tmp1 ## d; \
        rorq $16,                       ab; \
        xorq (tmp2, tmp1, 8),           dst;
  85
  86/**********************************************************************
  87  1-way camellia
  88 **********************************************************************/
  /*
   * roundsm(ab, subkey, cd): one round of the 1-way path.
   *
   * ab and cd are register-name prefixes (RAB or RCD) - the macro pastes a
   * 0 suffix to select the block-0 registers. The 64-bit subkey at byte
   * offset subkey * 8 of the key table is loaded into RT2. Four xor2ror16
   * steps then consume ab0 sixteen bits at a time, XORing table entries
   * alternately into cd0 and into RT2, and RT2 is finally XORed into cd0.
   * The four 16-bit rotations add up to 64 bits, so ab0 ends up holding
   * its original value. Clobbers RT0, RT1 and RT2.
   */
  89#define roundsm(ab, subkey, cd) \
  90        movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
  91        \
  92        xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
  93        xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
  94        xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
  95        xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
  96        \
  97        xorq RT2,                                       cd ## 0;
  98
  /*
   * fls(l, r, kl, kr): key-dependent mixing of the two halves of block 0.
   *
   * l and r are register-name prefixes (0 is pasted on) - kl and kr are
   * subkey indices, each subkey being the 64-bit value at byte offset
   * index * 8 of the key table. Writing lo/hi for the low/high 32 bits of
   * a 64-bit value, the four steps are, in order:
   *
   *   l.hi ^= rol32(l.lo & kl.lo, 1)
   *   r.lo ^= (r.hi | kr.hi)
   *   l.lo ^= (l.hi | kl.hi)      - uses the l.hi updated by the first step
   *   r.hi ^= rol32(r.lo & kr.lo, 1) - uses the r.lo updated by step two
   *
   * The movl loads fetch only the low 32 bits of a subkey - the movq loads
   * fetch all 64 bits and the following shrq keeps the high half.
   * NOTE(review): this presumably implements the FL / FL^-1 layer of the
   * Camellia specification - confirm against the reference before relying
   * on that reading. Clobbers RT0, RT1 and RT2.
   */
  99#define fls(l, r, kl, kr) \
 100        movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 101        andl l ## 0d,                                   RT0d; \
 102        roll $1,                                        RT0d; \
 103        shlq $32,                                       RT0; \
 104        xorq RT0,                                       l ## 0; \
 105        movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 106        orq r ## 0,                                     RT1; \
 107        shrq $32,                                       RT1; \
 108        xorq RT1,                                       r ## 0; \
 109        \
 110        movq (key_table + ((kl) * 2) * 4)(CTX),         RT2; \
 111        orq l ## 0,                                     RT2; \
 112        shrq $32,                                       RT2; \
 113        xorq RT2,                                       l ## 0; \
 114        movl (key_table + ((kr) * 2) * 4)(CTX),         RT0d; \
 115        andl r ## 0d,                                   RT0d; \
 116        roll $1,                                        RT0d; \
 117        shlq $32,                                       RT0; \
 118        xorq RT0,                                       r ## 0;
 119
 /*
  * enc_rounds(i): six consecutive rounds using subkeys i + 2 .. i + 7 in
  * ascending order. The roles of RAB and RCD swap on every round, so each
  * half is the one being updated three times.
  */
 120#define enc_rounds(i) \
 121        roundsm(RAB, i + 2, RCD); \
 122        roundsm(RCD, i + 3, RAB); \
 123        roundsm(RAB, i + 4, RCD); \
 124        roundsm(RCD, i + 5, RAB); \
 125        roundsm(RAB, i + 6, RCD); \
 126        roundsm(RCD, i + 7, RAB);
 127
 /*
  * enc_fls(i): fls step between two groups of rounds on the encrypt path,
  * with subkey i applied to RAB and subkey i + 1 applied to RCD.
  */
 128#define enc_fls(i) \
 129        fls(RAB, RCD, i + 0, i + 1);
 130
 /*
  * enc_inpack(): load one 16-byte block from RIO for encryption.
  *
  * Bytes 0-7 go to RAB0 and bytes 8-15 to RCD0. Each quadword is
  * byte-swapped and then rotated by 32 bits, which exchanges its two
  * 32-bit halves (rolq $32 and rorq $32 give the same result). Subkey 0 of
  * the key table is then XORed into RAB0 only.
  */
 131#define enc_inpack() \
 132        movq (RIO),                     RAB0; \
 133        bswapq                          RAB0; \
 134        rolq $32,                       RAB0; \
 135        movq 4*2(RIO),                  RCD0; \
 136        bswapq                          RCD0; \
 137        rorq $32,                       RCD0; \
 138        xorq key_table(CTX),            RAB0;
 139
 /*
  * enc_outunpack(op, max): write the encrypted block to RIO.
  *
  * op  - instruction stem that gets a q suffix pasted on. The callers pass
  *       mov (store the result) or xor (XOR the result into memory).
  * max - register holding the index of the final subkey, which is read
  *       from byte offset max * 8 of the key table and XORed into RCD0.
  *
  * The halves are undone in the reverse of enc_inpack (exchange the 32-bit
  * halves, then byte-swap) and are written in swapped positions: RCD0 goes
  * to bytes 0-7 and RAB0 to bytes 8-15.
  */
 140#define enc_outunpack(op, max) \
 141        xorq key_table(CTX, max, 8),    RCD0; \
 142        rorq $32,                       RCD0; \
 143        bswapq                          RCD0; \
 144        op ## q RCD0,                   (RIO); \
 145        rolq $32,                       RAB0; \
 146        bswapq                          RAB0; \
 147        op ## q RAB0,                   4*2(RIO);
 148
 /*
  * dec_rounds(i): six consecutive rounds using subkeys i + 7 down to i + 2,
  * i.e. the same subkeys as enc_rounds(i) applied in the opposite order.
  */
 149#define dec_rounds(i) \
 150        roundsm(RAB, i + 7, RCD); \
 151        roundsm(RCD, i + 6, RAB); \
 152        roundsm(RAB, i + 5, RCD); \
 153        roundsm(RCD, i + 4, RAB); \
 154        roundsm(RAB, i + 3, RCD); \
 155        roundsm(RCD, i + 2, RAB);
 156
 /*
  * dec_fls(i): fls step on the decrypt path. Compared with enc_fls(i) the
  * two subkeys trade places: subkey i + 1 is applied to RAB and subkey i
  * to RCD.
  */
 157#define dec_fls(i) \
 158        fls(RAB, RCD, i + 1, i + 0);
 159
 /*
  * dec_inpack(max): load one 16-byte block from RIO for decryption.
  *
  * Same load, byte-swap and half-exchange sequence as enc_inpack, except
  * that the subkey XORed into RAB0 is the one at index max (byte offset
  * max * 8 of the key table) rather than subkey 0. max is a register.
  */
 160#define dec_inpack(max) \
 161        movq (RIO),                     RAB0; \
 162        bswapq                          RAB0; \
 163        rolq $32,                       RAB0; \
 164        movq 4*2(RIO),                  RCD0; \
 165        bswapq                          RCD0; \
 166        rorq $32,                       RCD0; \
 167        xorq key_table(CTX, max, 8),    RAB0;
 168
 /*
  * dec_outunpack(): write the decrypted block to RIO.
  *
  * Subkey 0 is XORed into RCD0, then both halves get their 32-bit halves
  * exchanged and are byte-swapped. RCD0 is stored to bytes 0-7 and RAB0 to
  * bytes 8-15. Unlike enc_outunpack there is no xor variant - the result
  * is always stored with movq.
  */
 169#define dec_outunpack() \
 170        xorq key_table(CTX),            RCD0; \
 171        rorq $32,                       RCD0; \
 172        bswapq                          RCD0; \
 173        movq RCD0,                      (RIO); \
 174        rolq $32,                       RAB0; \
 175        bswapq                          RAB0; \
 176        movq RAB0,                      4*2(RIO);
 177
 /*
  * __camellia_enc_blk: encrypt a single 16-byte block.
  *
  * In:
  *   %rdi  ctx (used as CTX throughout)
  *   %rsi  dst
  *   %rdx  src
  *   %rcx  xor flag, only its low byte is tested
  *
  * Three groups of six rounds separated by fls steps are always run. When
  * the low byte of key_length is not 16, one more fls step and round group
  * follow. RT1 ends up holding the index of the final subkey (24 or 32).
  * If the xor flag is non-zero the result is XORed into dst, otherwise it
  * is stored there.
  *
  * %r12 is used as RT1 and is put back from RR12 before returning.
  */
SYM_FUNC_START(__camellia_enc_blk)
        /* dst has to be copied out of %rsi before %rsi becomes RIO */
        movq %rsi, RDST;
        movq %rdx, RIO;
        movq %rcx, RXOR;
        movq %r12, RR12;

        enc_inpack();

        enc_rounds(0);
        enc_fls(8);
        enc_rounds(8);
        enc_fls(16);
        enc_rounds(16);

        /* movl leaves the flags alone, so it may sit between cmpb and je */
        cmpb $16, key_length(CTX);
        movl $24, RT1d;
        je .L__enc_final;

        enc_fls(24);
        enc_rounds(24);
        movl $32, RT1d;

.L__enc_final:
        /* the rounds used %rsi as RT0, so point RIO at dst again */
        movq RDST, RIO;
        testb RXORbl, RXORbl;
        jnz .L__enc_final_xor;

        /* plain store of the result */
        enc_outunpack(mov, RT1);
        movq RR12, %r12;
        ret;

.L__enc_final_xor:
        /* XOR the result into the bytes already at dst */
        enc_outunpack(xor, RT1);
        movq RR12, %r12;
        ret;
SYM_FUNC_END(__camellia_enc_blk)
 224
 /*
  * camellia_dec_blk: decrypt a single 16-byte block.
  *
  * In:
  *   %rdi  ctx (used as CTX throughout)
  *   %rsi  dst
  *   %rdx  src
  *
  * RT2 is set to the index of the first subkey to apply: 24 when
  * key_length equals 16, otherwise 32. With 32 an extra round group and
  * fls step run first, after which both cases share the same three round
  * groups. RXOR is only a scratch register here.
  *
  * %r12 is used as RT1 and is put back from RR12 before returning.
  */
SYM_FUNC_START(camellia_dec_blk)
        /* RT2 = (key_length == 16) ? 24 : 32 */
        movl $24, RXORd;
        movl $32, RT2d;
        cmpl $16, key_length(CTX);
        cmovel RXORd, RT2d;

        /* dst has to be copied out of %rsi before %rsi becomes RIO */
        movq %rsi, RDST;
        movq %rdx, RIO;
        movq %r12, RR12;

        dec_inpack(RT2);

        /* RT2 is overwritten by the rounds, so branch on it now */
        cmpb $24, RT2bl;
        je .L__dec_tail;

        dec_rounds(24);
        dec_fls(24);

.L__dec_tail:
        dec_rounds(16);
        dec_fls(16);
        dec_rounds(8);
        dec_fls(8);
        dec_rounds(0);

        /* the rounds used %rsi as RT0, so point RIO at dst again */
        movq RDST, RIO;

        dec_outunpack();

        movq RR12, %r12;
        ret;
SYM_FUNC_END(camellia_dec_blk)
 262
 263/**********************************************************************
 264  2-way camellia
 265 **********************************************************************/
 /*
  * roundsm2(ab, subkey, cd): one round applied to two blocks at once.
  *
  * The subkey is loaded into RT2 and XORed into cd1 straight away. Block 0
  * is then processed exactly as in roundsm, accumulating alternately into
  * cd0 and RT2. For block 1 all four xor2ror16 steps accumulate directly
  * into cd1. The XOR of RT2 into cd0 that finishes block 0 is issued after
  * the first block-1 step. ab0 and ab1 both end up with their original
  * values (four 16-bit rotations each). Clobbers RT0, RT1 and RT2.
  */
 266#define roundsm2(ab, subkey, cd) \
 267        movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
 268        xorq RT2,                                       cd ## 1; \
 269        \
 270        xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 271        xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 272        xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 273        xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 274        \
 275                xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
 276                xorq RT2,                                       cd ## 0; \
 277                xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
 278                xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
 279                xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
 280
 /*
  * fls2(l, r, kl, kr): the fls step applied to two blocks at once.
  *
  * Performs the same four updates as fls on block 0 (l0 / r0) and on
  * block 1 (l1 / r1). With lo/hi the low/high 32 bits of a 64-bit value:
  *
  *   l.hi ^= rol32(l.lo & kl.lo, 1)
  *   r.lo ^= (r.hi | kr.hi)
  *   l.lo ^= (l.hi | kl.hi)
  *   r.hi ^= rol32(r.lo & kr.lo, 1)
  *
  * The work is emitted as four groups: first two updates for block 0,
  * first two for block 1 (indented), last two for block 0, last two for
  * block 1 (indented). Successive steps rotate through RT0, RT1 and RT2
  * as scratch, all three are clobbered.
  */
 281#define fls2(l, r, kl, kr) \
 282        movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 283        andl l ## 0d,                                   RT0d; \
 284        roll $1,                                        RT0d; \
 285        shlq $32,                                       RT0; \
 286        xorq RT0,                                       l ## 0; \
 287        movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 288        orq r ## 0,                                     RT1; \
 289        shrq $32,                                       RT1; \
 290        xorq RT1,                                       r ## 0; \
 291        \
 292                movl (key_table + ((kl) * 2) * 4)(CTX),         RT2d; \
 293                andl l ## 1d,                                   RT2d; \
 294                roll $1,                                        RT2d; \
 295                shlq $32,                                       RT2; \
 296                xorq RT2,                                       l ## 1; \
 297                movq (key_table + ((kr) * 2) * 4)(CTX),         RT0; \
 298                orq r ## 1,                                     RT0; \
 299                shrq $32,                                       RT0; \
 300                xorq RT0,                                       r ## 1; \
 301        \
 302        movq (key_table + ((kl) * 2) * 4)(CTX),         RT1; \
 303        orq l ## 0,                                     RT1; \
 304        shrq $32,                                       RT1; \
 305        xorq RT1,                                       l ## 0; \
 306        movl (key_table + ((kr) * 2) * 4)(CTX),         RT2d; \
 307        andl r ## 0d,                                   RT2d; \
 308        roll $1,                                        RT2d; \
 309        shlq $32,                                       RT2; \
 310        xorq RT2,                                       r ## 0; \
 311        \
 312                movq (key_table + ((kl) * 2) * 4)(CTX),         RT0; \
 313                orq l ## 1,                                     RT0; \
 314                shrq $32,                                       RT0; \
 315                xorq RT0,                                       l ## 1; \
 316                movl (key_table + ((kr) * 2) * 4)(CTX),         RT1d; \
 317                andl r ## 1d,                                   RT1d; \
 318                roll $1,                                        RT1d; \
 319                shlq $32,                                       RT1; \
 320                xorq RT1,                                       r ## 1;
 321
 /*
  * enc_rounds2(i): 2-way counterpart of enc_rounds - six roundsm2 rounds
  * with subkeys i + 2 .. i + 7 in ascending order, RAB and RCD swapping
  * roles on every round.
  */
 322#define enc_rounds2(i) \
 323        roundsm2(RAB, i + 2, RCD); \
 324        roundsm2(RCD, i + 3, RAB); \
 325        roundsm2(RAB, i + 4, RCD); \
 326        roundsm2(RCD, i + 5, RAB); \
 327        roundsm2(RAB, i + 6, RCD); \
 328        roundsm2(RCD, i + 7, RAB);
 329
 /*
  * enc_fls2(i): 2-way fls step on the encrypt path, subkey i applied to
  * RAB and subkey i + 1 applied to RCD.
  */
 330#define enc_fls2(i) \
 331        fls2(RAB, RCD, i + 0, i + 1);
 332
 /*
  * enc_inpack2(): load two consecutive 16-byte blocks from RIO.
  *
  * Block 0 comes from byte offsets 0 and 8 (RAB0 / RCD0), block 1 from
  * byte offsets 16 and 24 (RAB1 / RCD1). Every quadword is byte-swapped
  * and has its 32-bit halves exchanged (a rotate by 32 in either
  * direction). Subkey 0 is XORed into RAB0 and into RAB1.
  */
 333#define enc_inpack2() \
 334        movq (RIO),                     RAB0; \
 335        bswapq                          RAB0; \
 336        rorq $32,                       RAB0; \
 337        movq 4*2(RIO),                  RCD0; \
 338        bswapq                          RCD0; \
 339        rolq $32,                       RCD0; \
 340        xorq key_table(CTX),            RAB0; \
 341        \
 342                movq 8*2(RIO),                  RAB1; \
 343                bswapq                          RAB1; \
 344                rorq $32,                       RAB1; \
 345                movq 12*2(RIO),                 RCD1; \
 346                bswapq                          RCD1; \
 347                rolq $32,                       RCD1; \
 348                xorq key_table(CTX),            RAB1;
 349
 /*
  * enc_outunpack2(op, max): write two encrypted blocks to RIO.
  *
  * op  - instruction stem that gets a q suffix pasted on. The callers pass
  *       mov (store) or xor (XOR into memory).
  * max - register holding the index of the final subkey, read from byte
  *       offset max * 8 of the key table and XORed into RCD0 and RCD1.
  *
  * For each block the RCD half is written first and the RAB half second:
  * RCD0 -> offset 0, RAB0 -> offset 8, RCD1 -> offset 16, RAB1 -> offset
  * 24, each after exchanging its 32-bit halves and byte-swapping.
  */
 350#define enc_outunpack2(op, max) \
 351        xorq key_table(CTX, max, 8),    RCD0; \
 352        rolq $32,                       RCD0; \
 353        bswapq                          RCD0; \
 354        op ## q RCD0,                   (RIO); \
 355        rorq $32,                       RAB0; \
 356        bswapq                          RAB0; \
 357        op ## q RAB0,                   4*2(RIO); \
 358        \
 359                xorq key_table(CTX, max, 8),    RCD1; \
 360                rolq $32,                       RCD1; \
 361                bswapq                          RCD1; \
 362                op ## q RCD1,                   8*2(RIO); \
 363                rorq $32,                       RAB1; \
 364                bswapq                          RAB1; \
 365                op ## q RAB1,                   12*2(RIO);
 366
 /*
  * dec_rounds2(i): 2-way counterpart of dec_rounds - six roundsm2 rounds
  * with subkeys i + 7 down to i + 2.
  */
 367#define dec_rounds2(i) \
 368        roundsm2(RAB, i + 7, RCD); \
 369        roundsm2(RCD, i + 6, RAB); \
 370        roundsm2(RAB, i + 5, RCD); \
 371        roundsm2(RCD, i + 4, RAB); \
 372        roundsm2(RAB, i + 3, RCD); \
 373        roundsm2(RCD, i + 2, RAB);
 374
 /*
  * dec_fls2(i): 2-way fls step on the decrypt path, subkey i + 1 applied
  * to RAB and subkey i applied to RCD.
  */
 375#define dec_fls2(i) \
 376        fls2(RAB, RCD, i + 1, i + 0);
 377
 /*
  * dec_inpack2(max): load two consecutive 16-byte blocks from RIO for
  * decryption.
  *
  * Same sequence as enc_inpack2, except that the subkey XORed into RAB0
  * and RAB1 is the one at index max (byte offset max * 8 of the key
  * table) rather than subkey 0. max is a register.
  */
 378#define dec_inpack2(max) \
 379        movq (RIO),                     RAB0; \
 380        bswapq                          RAB0; \
 381        rorq $32,                       RAB0; \
 382        movq 4*2(RIO),                  RCD0; \
 383        bswapq                          RCD0; \
 384        rolq $32,                       RCD0; \
 385        xorq key_table(CTX, max, 8),    RAB0; \
 386        \
 387                movq 8*2(RIO),                  RAB1; \
 388                bswapq                          RAB1; \
 389                rorq $32,                       RAB1; \
 390                movq 12*2(RIO),                 RCD1; \
 391                bswapq                          RCD1; \
 392                rolq $32,                       RCD1; \
 393                xorq key_table(CTX, max, 8),    RAB1;
 394
 /*
  * dec_outunpack2(): write two decrypted blocks to RIO.
  *
  * Subkey 0 is XORed into RCD0 and RCD1. Each half then has its 32-bit
  * halves exchanged and is byte-swapped before being stored with movq:
  * RCD0 -> offset 0, RAB0 -> offset 8, RCD1 -> offset 16, RAB1 -> offset
  * 24. There is no xor variant on the decrypt side.
  */
 395#define dec_outunpack2() \
 396        xorq key_table(CTX),            RCD0; \
 397        rolq $32,                       RCD0; \
 398        bswapq                          RCD0; \
 399        movq RCD0,                      (RIO); \
 400        rorq $32,                       RAB0; \
 401        bswapq                          RAB0; \
 402        movq RAB0,                      4*2(RIO); \
 403        \
 404                xorq key_table(CTX),            RCD1; \
 405                rolq $32,                       RCD1; \
 406                bswapq                          RCD1; \
 407                movq RCD1,                      8*2(RIO); \
 408                rorq $32,                       RAB1; \
 409                bswapq                          RAB1; \
 410                movq RAB1,                      12*2(RIO);
 411
 /*
  * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
  *
  * In:
  *   %rdi  ctx (used as CTX throughout)
  *   %rsi  dst
  *   %rdx  src
  *   %rcx  xor flag, only its low byte is tested
  *
  * Same sequence as the 1-way encrypt function, using the 2-way macros.
  * RT2 ends up holding the index of the final subkey: 24 when the low
  * byte of key_length is 16, otherwise 32. If the xor flag is non-zero the
  * results are XORed into dst, otherwise they are stored there.
  *
  * %rbx carries half of the second block (RAB1), so it is pushed on entry
  * and popped on both exit paths. %r12 is used as RT1 and is put back from
  * RR12 before returning.
  */
SYM_FUNC_START(__camellia_enc_blk_2way)
        pushq %rbx;

        /* dst has to be copied out of %rsi before %rsi becomes RIO */
        movq %rsi, RDST;
        movq %rdx, RIO;
        movq %rcx, RXOR;
        movq %r12, RR12;

        enc_inpack2();

        enc_rounds2(0);
        enc_fls2(8);
        enc_rounds2(8);
        enc_fls2(16);
        enc_rounds2(16);

        /* movl leaves the flags alone, so it may sit between cmpb and je */
        cmpb $16, key_length(CTX);
        movl $24, RT2d;
        je .L__enc2_final;

        enc_fls2(24);
        enc_rounds2(24);
        movl $32, RT2d;

.L__enc2_final:
        /* the rounds used %rsi as RT0, so point RIO at dst again */
        movq RDST, RIO;
        testb RXORbl, RXORbl;
        jnz .L__enc2_final_xor;

        /* plain store of both results */
        enc_outunpack2(mov, RT2);
        movq RR12, %r12;
        popq %rbx;
        ret;

.L__enc2_final_xor:
        /* XOR both results into the bytes already at dst */
        enc_outunpack2(xor, RT2);
        movq RR12, %r12;
        popq %rbx;
        ret;
SYM_FUNC_END(__camellia_enc_blk_2way)
 460
 /*
  * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
  *
  * In:
  *   %rdi  ctx (used as CTX throughout)
  *   %rsi  dst
  *   %rdx  src
  *
  * RT2 is set to the index of the first subkey to apply: 24 when
  * key_length equals 16, otherwise 32. With 32 an extra round group and
  * fls step run first, after which both cases share the same three round
  * groups.
  *
  * There is no xor flag on this path, so once RXOR has served as the
  * constant for the cmov it is reused to hold the incoming %rbx (which
  * the 2-way macros overwrite as RAB1). %r12 is kept in RR12. Both are put
  * back before returning.
  */
SYM_FUNC_START(camellia_dec_blk_2way)
        /* RT2 = (key_length == 16) ? 24 : 32 */
        movl $24, RXORd;
        movl $32, RT2d;
        cmpl $16, key_length(CTX);
        cmovel RXORd, RT2d;

        /* RXOR is free from here on: park %rbx in it */
        movq %rbx, RXOR;
        /* dst has to be copied out of %rsi before %rsi becomes RIO */
        movq %rsi, RDST;
        movq %rdx, RIO;
        movq %r12, RR12;

        dec_inpack2(RT2);

        /* RT2 is overwritten by the rounds, so branch on it now */
        cmpb $24, RT2bl;
        je .L__dec2_tail;

        dec_rounds2(24);
        dec_fls2(24);

.L__dec2_tail:
        dec_rounds2(16);
        dec_fls2(16);
        dec_rounds2(8);
        dec_fls2(8);
        dec_rounds2(0);

        /* the rounds used %rsi as RT0, so point RIO at dst again */
        movq RDST, RIO;

        dec_outunpack2();

        movq RXOR, %rbx;
        movq RR12, %r12;
        ret;
SYM_FUNC_END(camellia_dec_blk_2way)
 500