linux/arch/x86/crypto/camellia-x86_64-asm_64.S
<<
>>
Prefs
   1/*
   2 * Camellia Cipher Algorithm (x86_64)
   3 *
   4 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  19 * USA
  20 *
  21 */
  22
  23#include <linux/linkage.h>
  24
  25.file "camellia-x86_64-asm_64.S"
  26.text
  27
  /*
   * Eight lookup tables defined outside this file.  xor2ror16 indexes each
   * of them with a zero-extended byte scaled by 8, i.e. it treats them as
   * arrays of 8-byte entries.
   */
  28.extern camellia_sp10011110;
  29.extern camellia_sp22000222;
  30.extern camellia_sp03303033;
  31.extern camellia_sp00444404;
  32.extern camellia_sp02220222;
  33.extern camellia_sp30333033;
  34.extern camellia_sp44044404;
  35.extern camellia_sp11101110;
  36
  /* Short aliases for the tables, used by the round macros below. */
  37#define sp10011110 camellia_sp10011110
  38#define sp22000222 camellia_sp22000222
  39#define sp03303033 camellia_sp03303033
  40#define sp00444404 camellia_sp00444404
  41#define sp02220222 camellia_sp02220222
  42#define sp30333033 camellia_sp30333033
  43#define sp44044404 camellia_sp44044404
  44#define sp11101110 camellia_sp11101110
  45
  /*
   * Size in bytes of the subkey table at the start of the context
   * (272 bytes = 34 eight-byte entries).
   */
  46#define CAMELLIA_TABLE_BYTE_LEN 272
  47
  48/* struct camellia_ctx: */
  /*
   * Byte offsets from CTX as used in this file: key_table is at offset 0
   * and key_length follows it directly.  The entry points compare
   * key_length with 16 to pick the short or the long round schedule.
   */
  49#define key_table 0
  50#define key_length CAMELLIA_TABLE_BYTE_LEN
  51
  52/* register macros */
  /*
   * CTX is the first argument register and is never overwritten.  RIO is
   * the pointer used by the *_inpack / *_outunpack macros.  It shares %rsi
   * with RT0 (defined below), so the round macros destroy it and every
   * entry point reloads it from RDST before storing the result.
   */
  53#define CTX %rdi
  54#define RIO %rsi
  55#define RIOd %esi
  56
  /*
   * Cipher state.  RAB0/RCD0 are the two 64-bit halves of block 0;
   * RAB1/RCD1 are the halves of block 1 and are only used by the 2-way
   * macros.  The d / bl / bh variants name the 32-bit, lowest-byte and
   * second-byte views; the macros build these names by token pasting, so
   * every state register needs all four forms.
   */
  57#define RAB0 %rax
  58#define RCD0 %rcx
  59#define RAB1 %rbx
  60#define RCD1 %rdx
  61
  62#define RAB0d %eax
  63#define RCD0d %ecx
  64#define RAB1d %ebx
  65#define RCD1d %edx
  66
  67#define RAB0bl %al
  68#define RCD0bl %cl
  69#define RAB1bl %bl
  70#define RCD1bl %dl
  71
  72#define RAB0bh %ah
  73#define RCD0bh %ch
  74#define RAB1bh %bh
  75#define RCD1bh %dh
  76
  /*
   * Scratch registers.  RT0 aliases RIO (%rsi).  RT1 is %rbp, so the
   * caller's %rbp has to be saved before and restored after any macro
   * that uses RT1 (the entry points park it in RRBP).
   * NOTE(review): while RT1 is live %rbp is not a valid frame pointer;
   * confirm this is acceptable for frame-pointer based unwinding.
   */
  77#define RT0 %rsi
  78#define RT1 %rbp
  79#define RT2 %r8
  80
  81#define RT0d %esi
  82#define RT1d %ebp
  83#define RT2d %r8d
  84
  85#define RT2bl %r8b
  86
  /*
   * RXOR holds the "xor" argument in the encrypt routines; the decrypt
   * routines reuse it as a temporary.  RRBP keeps the caller's %rbp and
   * RDST keeps the destination pointer while %rsi is used as RIO/RT0.
   */
  87#define RXOR %r9
  88#define RRBP %r10
  89#define RDST %r11
  90
  91#define RXORd %r9d
  92#define RXORbl %r9b
  93
  /*
   * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
   *
   * Consumes the two lowest bytes of `ab`:
   *      tmp2 = byte 0 of ab, tmp1 = byte 1 of ab (both zero-extended)
   *      ab   = ror64(ab, 16)
   *      dst ^= T0[tmp2] ^ T1[tmp1]      (8-byte table entries)
   *
   * `ab` must be a register with bl/bh name variants (RAB0, RCD0, RAB1,
   * RCD1).  tmp1 receives the high-byte register (%ah and friends), which
   * cannot be encoded together with a REX-only register, so tmp1 has to be
   * one of the legacy registers.  The tables are addressed with an
   * absolute displacement (no base register).
   * Clobbers tmp1, tmp2 and the flags.
   */
  94#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
  95        movzbl ab ## bl,                tmp2 ## d; \
  96        movzbl ab ## bh,                tmp1 ## d; \
  97        rorq $16,                       ab; \
  98        xorq T0(, tmp2, 8),             dst; \
  99        xorq T1(, tmp1, 8),             dst;
 100
 101/**********************************************************************
 102  1-way camellia
 103 **********************************************************************/
 /*
  * roundsm(ab, subkey, cd) - one round on block 0.
  *
  *      cd0 ^= key_table[subkey] ^ (XOR of eight table entries, one per
  *                                  byte of ab0)
  *
  * `subkey` is an index of 8-byte entries ((subkey) * 2 * 4 bytes).  The
  * subkey is loaded into RT2, which also accumulates half of the lookups
  * so that consecutive XORs alternate between two destinations; RT2 is
  * folded into cd0 at the end.  The four xor2ror16 steps rotate ab0 by
  * 4 * 16 = 64 bits in total, so ab0 is unchanged on exit.
  * Clobbers RT0, RT1, RT2 and the flags.
  */
 104#define roundsm(ab, subkey, cd) \
 105        movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
 106        \
 107        xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 108        xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 109        xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 110        xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 111        \
 112        xorq RT2,                                       cd ## 0;
 113
 /*
  * fls(l, r, kl, kr) - key-dependent mixing of both halves of block 0
  * (presumably Camellia's FL / FL^-1 layer - confirm against the spec).
  *
  * With KL = key_table[kl], KR = key_table[kr] (8-byte entries) and
  * lo()/hi() the low/high 32 bits, the four steps are, in order:
  *      hi(l0) ^= rol32(lo(l0) & lo(KL), 1)
  *      lo(r0) ^= hi(r0) | hi(KR)
  *      lo(l0) ^= hi(l0) | hi(KL)
  *      hi(r0) ^= rol32(lo(r0) & lo(KR), 1)
  * Each step sees the results of the earlier ones.  The 32-bit movl/andl/
  * roll leave the upper half of the temporary zero, so the following
  * shlq $32 yields a value that only touches the high half.
  * Clobbers RT0, RT1, RT2 and the flags.
  */
 114#define fls(l, r, kl, kr) \
 115        movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 116        andl l ## 0d,                                   RT0d; \
 117        roll $1,                                        RT0d; \
 118        shlq $32,                                       RT0; \
 119        xorq RT0,                                       l ## 0; \
 120        movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 121        orq r ## 0,                                     RT1; \
 122        shrq $32,                                       RT1; \
 123        xorq RT1,                                       r ## 0; \
 124        \
 125        movq (key_table + ((kl) * 2) * 4)(CTX),         RT2; \
 126        orq l ## 0,                                     RT2; \
 127        shrq $32,                                       RT2; \
 128        xorq RT2,                                       l ## 0; \
 129        movl (key_table + ((kr) * 2) * 4)(CTX),         RT0d; \
 130        andl r ## 0d,                                   RT0d; \
 131        roll $1,                                        RT0d; \
 132        shlq $32,                                       RT0; \
 133        xorq RT0,                                       r ## 0;
 134
 /*
  * enc_rounds(i) - six consecutive rounds on block 0 using subkeys
  * i + 2 .. i + 7 in ascending order.  The roles of RAB and RCD swap on
  * every round: the half that was just updated is the input of the next.
  */
 135#define enc_rounds(i) \
 136        roundsm(RAB, i + 2, RCD); \
 137        roundsm(RCD, i + 3, RAB); \
 138        roundsm(RAB, i + 4, RCD); \
 139        roundsm(RCD, i + 5, RAB); \
 140        roundsm(RAB, i + 6, RCD); \
 141        roundsm(RCD, i + 7, RAB);
 142
 /*
  * enc_fls(i) - fls step for encryption: subkey i + 0 is applied to RAB
  * and subkey i + 1 to RCD.
  */
 143#define enc_fls(i) \
 144        fls(RAB, RCD, i + 0, i + 1);
 145
 /*
  * enc_inpack() - load one 16-byte block from (RIO) for encryption.
  *
  * RAB0 is read from bytes 0-7 and RCD0 from bytes 8-15.  bswapq followed
  * by a 32-bit rotate byte-swaps each 32-bit word while keeping the two
  * words in their original halves (rotating a 64-bit register by 32 gives
  * the same result in either direction, so rolq/rorq are interchangeable
  * here).  RAB0 is then XORed with key_table[0]; RCD0 is not keyed.
  */
 146#define enc_inpack() \
 147        movq (RIO),                     RAB0; \
 148        bswapq                          RAB0; \
 149        rolq $32,                       RAB0; \
 150        movq 4*2(RIO),                  RCD0; \
 151        bswapq                          RCD0; \
 152        rorq $32,                       RCD0; \
 153        xorq key_table(CTX),            RAB0;
 154
 /*
  * enc_outunpack(op, max) - write the encrypted block to (RIO).
  *
  * `max` is a register holding the 8-byte entry index of the final
  * subkey; `op` is the mnemonic stem used for the store ("mov" to
  * overwrite, "xor" to XOR into the existing bytes at the destination).
  * RCD0 ^= key_table[max] goes to bytes 0-7 and RAB0, without further
  * keying, to bytes 8-15, i.e. the halves are written in the opposite
  * order to the one enc_inpack read them in.  The rotate + bswapq pair
  * undoes the byte order change made by enc_inpack.
  */
 155#define enc_outunpack(op, max) \
 156        xorq key_table(CTX, max, 8),    RCD0; \
 157        rorq $32,                       RCD0; \
 158        bswapq                          RCD0; \
 159        op ## q RCD0,                   (RIO); \
 160        rolq $32,                       RAB0; \
 161        bswapq                          RAB0; \
 162        op ## q RAB0,                   4*2(RIO);
 163
 /*
  * dec_rounds(i) - six consecutive rounds on block 0 using the same
  * subkeys as enc_rounds(i), but in descending order (i + 7 .. i + 2).
  */
 164#define dec_rounds(i) \
 165        roundsm(RAB, i + 7, RCD); \
 166        roundsm(RCD, i + 6, RAB); \
 167        roundsm(RAB, i + 5, RCD); \
 168        roundsm(RCD, i + 4, RAB); \
 169        roundsm(RAB, i + 3, RCD); \
 170        roundsm(RCD, i + 2, RAB);
 171
 /*
  * dec_fls(i) - fls step for decryption: the same two subkeys as
  * enc_fls(i) with their roles swapped (i + 1 for RAB, i + 0 for RCD).
  */
 172#define dec_fls(i) \
 173        fls(RAB, RCD, i + 1, i + 0);
 174
 /*
  * dec_inpack(max) - load one 16-byte block from (RIO) for decryption.
  *
  * Same load and byte reordering as enc_inpack, but RAB0 is XORed with
  * key_table[max], where `max` is a register holding an 8-byte entry
  * index.  RCD0 is not keyed here.
  */
 175#define dec_inpack(max) \
 176        movq (RIO),                     RAB0; \
 177        bswapq                          RAB0; \
 178        rolq $32,                       RAB0; \
 179        movq 4*2(RIO),                  RCD0; \
 180        bswapq                          RCD0; \
 181        rorq $32,                       RCD0; \
 182        xorq key_table(CTX, max, 8),    RAB0;
 183
 /*
  * dec_outunpack() - write the decrypted block to (RIO).
  *
  * RCD0 ^= key_table[0] is stored to bytes 0-7 and RAB0, without further
  * keying, to bytes 8-15.  The rotate + bswapq pair undoes the byte order
  * change made by dec_inpack.  Unlike enc_outunpack the store is always a
  * plain movq.
  */
 184#define dec_outunpack() \
 185        xorq key_table(CTX),            RCD0; \
 186        rorq $32,                       RCD0; \
 187        bswapq                          RCD0; \
 188        movq RCD0,                      (RIO); \
 189        rolq $32,                       RAB0; \
 190        bswapq                          RAB0; \
 191        movq RAB0,                      4*2(RIO);
 192
 193ENTRY(__camellia_enc_blk)
 194        /* input:
 195         *      %rdi: ctx, CTX
 196         *      %rsi: dst
 197         *      %rdx: src
 198         *      %rcx: bool xor
 199         */
            /*
             * Encrypts the 16-byte block at src.  If the low byte of the
             * xor argument is zero the result overwrites the 16 bytes at
             * dst, otherwise it is XORed into them.
             *
             * Runs 18 rounds when key_length is 16 and 24 rounds
             * otherwise; the index of the final subkey (24 or 32) is kept
             * in RT1.
             *
             * Clobbers %rax, %rcx, %rsi, %r8-%r11 and the flags.  %rbp is
             * used as scratch (RT1) and restored from RRBP before return.
             */
 200        movq %rbp, RRBP;
 201
            /*
             * %rcx doubles as RCD0 and %rsi as RIO/RT0, so both arguments
             * are moved out of the way before the block is loaded.
             */
 202        movq %rcx, RXOR;
 203        movq %rsi, RDST;
 204        movq %rdx, RIO;
 205
 206        enc_inpack();
 207
 208        enc_rounds(0);
 209        enc_fls(8);
 210        enc_rounds(8);
 211        enc_fls(16);
 212        enc_rounds(16);
 213        movl $24, RT1d; /* max */
 214
            /*
             * Read key_length as a 32-bit field, the same width the
             * decrypt entry points use for it.
             */
 215        cmpl $16, key_length(CTX);
 216        je .L__enc_done;
 217
 218        enc_fls(24);
 219        enc_rounds(24);
 220        movl $32, RT1d; /* max */
 221
 222.L__enc_done:
            /* movq leaves the flags from testb intact for the jnz below. */
 223        testb RXORbl, RXORbl;
 224        movq RDST, RIO;
 225
 226        jnz .L__enc_xor;
 227
 228        enc_outunpack(mov, RT1);
 229
 230        movq RRBP, %rbp;
 231        ret;
 232
 233.L__enc_xor:
 234        enc_outunpack(xor, RT1);
 235
 236        movq RRBP, %rbp;
 237        ret;
 238ENDPROC(__camellia_enc_blk)
 239
 240ENTRY(camellia_dec_blk)
 241        /* input:
 242         *      %rdi: ctx, CTX
 243         *      %rsi: dst
 244         *      %rdx: src
 245         */
            /*
             * Decrypts the 16-byte block at src and stores the result at
             * dst.
             *
             * RT2 = index of the last subkey: 24 when key_length is 16,
             * otherwise 32.  The two movl do not change the flags, so
             * cmovel still acts on the result of the cmpl.  RXOR is only a
             * temporary here.
             *
             * Clobbers %rax, %rcx, %rsi, %r8-%r11 and the flags.  %rbp is
             * used as scratch (RT1) and restored from RRBP before return.
             */
 246        cmpl $16, key_length(CTX);
 247        movl $32, RT2d;
 248        movl $24, RXORd;
 249        cmovel RXORd, RT2d; /* max */
 250
 251        movq %rbp, RRBP;
            /* RIO is %rsi, so dst is saved before src is moved into RIO. */
 252        movq %rsi, RDST;
 253        movq %rdx, RIO;
 254
 255        dec_inpack(RT2);
 256
            /*
             * RT2 must be tested here: the round macros below use it as
             * scratch.  With max == 24 the first group of rounds is
             * skipped.
             */
 257        cmpb $24, RT2bl;
 258        je .L__dec_rounds16;
 259
 260        dec_rounds(24);
 261        dec_fls(24);
 262
 263.L__dec_rounds16:
 264        dec_rounds(16);
 265        dec_fls(16);
 266        dec_rounds(8);
 267        dec_fls(8);
 268        dec_rounds(0);
 269
            /* The rounds clobbered RIO (= RT0); point it at dst again. */
 270        movq RDST, RIO;
 271
 272        dec_outunpack();
 273
 274        movq RRBP, %rbp;
 275        ret;
 276ENDPROC(camellia_dec_blk)
 277
 278/**********************************************************************
 279  2-way camellia
 280 **********************************************************************/
 /*
  * roundsm2(ab, subkey, cd) - one round on block 0 and block 1.
  *
  * For n = 0, 1:
  *      cdn ^= key_table[subkey] ^ (XOR of eight table entries, one per
  *                                  byte of abn)
  *
  * The subkey is loaded into RT2 and XORed into cd1 at once, so all of
  * block 1's lookups can accumulate directly in cd1.  Block 0 accumulates
  * half of its lookups in RT2, which is folded into cd0 after block 1's
  * first xor2ror16 has been issued; block 1's lookups never touch RT2, so
  * the delayed fold is safe.  ab0 and ab1 are each rotated by 4 * 16 = 64
  * bits in total and are therefore unchanged on exit.
  * Clobbers RT0, RT1, RT2 and the flags.
  */
 281#define roundsm2(ab, subkey, cd) \
 282        movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
 283        xorq RT2,                                       cd ## 1; \
 284        \
 285        xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 286        xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 287        xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 288        xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 289        \
 290                xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
 291                xorq RT2,                                       cd ## 0; \
 292                xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
 293                xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
 294                xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
 295
 /*
  * fls2(l, r, kl, kr) - the fls mixing step applied to block 0 and
  * block 1.
  *
  * With KL = key_table[kl], KR = key_table[kr] (8-byte entries) and
  * lo()/hi() the low/high 32 bits, each block n = 0, 1 goes through
  *      hi(ln) ^= rol32(lo(ln) & lo(KL), 1)
  *      lo(rn) ^= hi(rn) | hi(KR)
  *      lo(ln) ^= hi(ln) | hi(KL)
  *      hi(rn) ^= rol32(lo(rn) & lo(KR), 1)
  * in that order.  The code is issued as four groups: first half for
  * block 0, first half for block 1 (indented), second half for block 0,
  * second half for block 1 (indented).  The temporaries RT0/RT1/RT2 are
  * rotated between the groups.
  * Clobbers RT0, RT1, RT2 and the flags.
  */
 296#define fls2(l, r, kl, kr) \
 297        movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 298        andl l ## 0d,                                   RT0d; \
 299        roll $1,                                        RT0d; \
 300        shlq $32,                                       RT0; \
 301        xorq RT0,                                       l ## 0; \
 302        movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 303        orq r ## 0,                                     RT1; \
 304        shrq $32,                                       RT1; \
 305        xorq RT1,                                       r ## 0; \
 306        \
 307                movl (key_table + ((kl) * 2) * 4)(CTX),         RT2d; \
 308                andl l ## 1d,                                   RT2d; \
 309                roll $1,                                        RT2d; \
 310                shlq $32,                                       RT2; \
 311                xorq RT2,                                       l ## 1; \
 312                movq (key_table + ((kr) * 2) * 4)(CTX),         RT0; \
 313                orq r ## 1,                                     RT0; \
 314                shrq $32,                                       RT0; \
 315                xorq RT0,                                       r ## 1; \
 316        \
 317        movq (key_table + ((kl) * 2) * 4)(CTX),         RT1; \
 318        orq l ## 0,                                     RT1; \
 319        shrq $32,                                       RT1; \
 320        xorq RT1,                                       l ## 0; \
 321        movl (key_table + ((kr) * 2) * 4)(CTX),         RT2d; \
 322        andl r ## 0d,                                   RT2d; \
 323        roll $1,                                        RT2d; \
 324        shlq $32,                                       RT2; \
 325        xorq RT2,                                       r ## 0; \
 326        \
 327                movq (key_table + ((kl) * 2) * 4)(CTX),         RT0; \
 328                orq l ## 1,                                     RT0; \
 329                shrq $32,                                       RT0; \
 330                xorq RT0,                                       l ## 1; \
 331                movl (key_table + ((kr) * 2) * 4)(CTX),         RT1d; \
 332                andl r ## 1d,                                   RT1d; \
 333                roll $1,                                        RT1d; \
 334                shlq $32,                                       RT1; \
 335                xorq RT1,                                       r ## 1;
 336
 /*
  * enc_rounds2(i) - six consecutive two-block rounds using subkeys
  * i + 2 .. i + 7 in ascending order, with RAB and RCD swapping roles on
  * every round.
  */
 337#define enc_rounds2(i) \
 338        roundsm2(RAB, i + 2, RCD); \
 339        roundsm2(RCD, i + 3, RAB); \
 340        roundsm2(RAB, i + 4, RCD); \
 341        roundsm2(RCD, i + 5, RAB); \
 342        roundsm2(RAB, i + 6, RCD); \
 343        roundsm2(RCD, i + 7, RAB);
 344
 /*
  * enc_fls2(i) - two-block fls step for encryption: subkey i + 0 is
  * applied to RAB and subkey i + 1 to RCD.
  */
 345#define enc_fls2(i) \
 346        fls2(RAB, RCD, i + 0, i + 1);
 347
 /*
  * enc_inpack2() - load two consecutive 16-byte blocks from (RIO) for
  * encryption.
  *
  * Block 0 comes from bytes 0-15 (RAB0, RCD0) and block 1 from bytes
  * 16-31 (RAB1, RCD1).  bswapq followed by a 32-bit rotate byte-swaps
  * each 32-bit word while keeping the two words in their original halves.
  * RAB0 and RAB1 are XORed with key_table[0]; the RCD halves are not
  * keyed here.
  */
 348#define enc_inpack2() \
 349        movq (RIO),                     RAB0; \
 350        bswapq                          RAB0; \
 351        rorq $32,                       RAB0; \
 352        movq 4*2(RIO),                  RCD0; \
 353        bswapq                          RCD0; \
 354        rolq $32,                       RCD0; \
 355        xorq key_table(CTX),            RAB0; \
 356        \
 357                movq 8*2(RIO),                  RAB1; \
 358                bswapq                          RAB1; \
 359                rorq $32,                       RAB1; \
 360                movq 12*2(RIO),                 RCD1; \
 361                bswapq                          RCD1; \
 362                rolq $32,                       RCD1; \
 363                xorq key_table(CTX),            RAB1;
 364
 /*
  * enc_outunpack2(op, max) - write two encrypted blocks to (RIO).
  *
  * `max` is a register holding the 8-byte entry index of the final
  * subkey; `op` is the mnemonic stem used for the stores ("mov" to
  * overwrite, "xor" to XOR into the existing bytes at the destination).
  * For each block the RCD half is XORed with key_table[max] and stored
  * first, followed by the RAB half without further keying: block 0 goes
  * to bytes 0-15 and block 1 to bytes 16-31.
  */
 365#define enc_outunpack2(op, max) \
 366        xorq key_table(CTX, max, 8),    RCD0; \
 367        rolq $32,                       RCD0; \
 368        bswapq                          RCD0; \
 369        op ## q RCD0,                   (RIO); \
 370        rorq $32,                       RAB0; \
 371        bswapq                          RAB0; \
 372        op ## q RAB0,                   4*2(RIO); \
 373        \
 374                xorq key_table(CTX, max, 8),    RCD1; \
 375                rolq $32,                       RCD1; \
 376                bswapq                          RCD1; \
 377                op ## q RCD1,                   8*2(RIO); \
 378                rorq $32,                       RAB1; \
 379                bswapq                          RAB1; \
 380                op ## q RAB1,                   12*2(RIO);
 381
 /*
  * dec_rounds2(i) - six consecutive two-block rounds using the same
  * subkeys as enc_rounds2(i), but in descending order (i + 7 .. i + 2).
  */
 382#define dec_rounds2(i) \
 383        roundsm2(RAB, i + 7, RCD); \
 384        roundsm2(RCD, i + 6, RAB); \
 385        roundsm2(RAB, i + 5, RCD); \
 386        roundsm2(RCD, i + 4, RAB); \
 387        roundsm2(RAB, i + 3, RCD); \
 388        roundsm2(RCD, i + 2, RAB);
 389
 /*
  * dec_fls2(i) - two-block fls step for decryption: the same two subkeys
  * as enc_fls2(i) with their roles swapped (i + 1 for RAB, i + 0 for RCD).
  */
 390#define dec_fls2(i) \
 391        fls2(RAB, RCD, i + 1, i + 0);
 392
 /*
  * dec_inpack2(max) - load two consecutive 16-byte blocks from (RIO) for
  * decryption.
  *
  * Same loads and byte reordering as enc_inpack2, but RAB0 and RAB1 are
  * XORed with key_table[max], where `max` is a register holding an 8-byte
  * entry index.  The RCD halves are not keyed here.
  */
 393#define dec_inpack2(max) \
 394        movq (RIO),                     RAB0; \
 395        bswapq                          RAB0; \
 396        rorq $32,                       RAB0; \
 397        movq 4*2(RIO),                  RCD0; \
 398        bswapq                          RCD0; \
 399        rolq $32,                       RCD0; \
 400        xorq key_table(CTX, max, 8),    RAB0; \
 401        \
 402                movq 8*2(RIO),                  RAB1; \
 403                bswapq                          RAB1; \
 404                rorq $32,                       RAB1; \
 405                movq 12*2(RIO),                 RCD1; \
 406                bswapq                          RCD1; \
 407                rolq $32,                       RCD1; \
 408                xorq key_table(CTX, max, 8),    RAB1;
 409
 /*
  * dec_outunpack2() - write two decrypted blocks to (RIO).
  *
  * For each block the RCD half is XORed with key_table[0] and stored
  * first, followed by the RAB half without further keying: block 0 goes
  * to bytes 0-15 and block 1 to bytes 16-31.  The stores are always plain
  * movq.
  */
 410#define dec_outunpack2() \
 411        xorq key_table(CTX),            RCD0; \
 412        rolq $32,                       RCD0; \
 413        bswapq                          RCD0; \
 414        movq RCD0,                      (RIO); \
 415        rorq $32,                       RAB0; \
 416        bswapq                          RAB0; \
 417        movq RAB0,                      4*2(RIO); \
 418        \
 419                xorq key_table(CTX),            RCD1; \
 420                rolq $32,                       RCD1; \
 421                bswapq                          RCD1; \
 422                movq RCD1,                      8*2(RIO); \
 423                rorq $32,                       RAB1; \
 424                bswapq                          RAB1; \
 425                movq RAB1,                      12*2(RIO);
 426
 427ENTRY(__camellia_enc_blk_2way)
 428        /* input:
 429         *      %rdi: ctx, CTX
 430         *      %rsi: dst
 431         *      %rdx: src
 432         *      %rcx: bool xor
 433         */
            /*
             * Encrypts the two consecutive 16-byte blocks at src.  If the
             * low byte of the xor argument is zero the result overwrites
             * the 32 bytes at dst, otherwise it is XORed into them.
             *
             * Runs 18 rounds when key_length is 16 and 24 rounds
             * otherwise; the index of the final subkey (24 or 32) is kept
             * in RT2.
             *
             * %rbx holds part of block 1 (RAB1) and is saved on the stack;
             * %rbp is used as scratch (RT1) and restored from RRBP.
             * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
             */
 434        pushq %rbx;
 435
 436        movq %rbp, RRBP;
            /*
             * %rcx doubles as RCD0, %rdx as RCD1 and %rsi as RIO/RT0, so
             * the arguments are moved out of the way before the blocks
             * are loaded.
             */
 437        movq %rcx, RXOR;
 438        movq %rsi, RDST;
 439        movq %rdx, RIO;
 440
 441        enc_inpack2();
 442
 443        enc_rounds2(0);
 444        enc_fls2(8);
 445        enc_rounds2(8);
 446        enc_fls2(16);
 447        enc_rounds2(16);
 448        movl $24, RT2d; /* max */
 449
            /*
             * Read key_length as a 32-bit field, the same width the
             * decrypt entry points use for it.
             */
 450        cmpl $16, key_length(CTX);
 451        je .L__enc2_done;
 452
 453        enc_fls2(24);
 454        enc_rounds2(24);
 455        movl $32, RT2d; /* max */
 456
 457.L__enc2_done:
            /*
             * Explicit byte-sized test, as in the one-block routine; movq
             * leaves the flags intact for the jnz below.
             */
 458        testb RXORbl, RXORbl;
 459        movq RDST, RIO;
 460        jnz .L__enc2_xor;
 461
 462        enc_outunpack2(mov, RT2);
 463
 464        movq RRBP, %rbp;
 465        popq %rbx;
 466        ret;
 467
 468.L__enc2_xor:
 469        enc_outunpack2(xor, RT2);
 470
 471        movq RRBP, %rbp;
 472        popq %rbx;
 473        ret;
 474ENDPROC(__camellia_enc_blk_2way)
 475
 476ENTRY(camellia_dec_blk_2way)
 477        /* input:
 478         *      %rdi: ctx, CTX
 479         *      %rsi: dst
 480         *      %rdx: src
 481         */
            /*
             * Decrypts the two consecutive 16-byte blocks at src and
             * stores the result at dst.
             *
             * RT2 = index of the last subkey: 24 when key_length is 16,
             * otherwise 32.  The two movl do not change the flags, so
             * cmovel still acts on the result of the cmpl.
             *
             * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
             */
 482        cmpl $16, key_length(CTX);
 483        movl $32, RT2d;
 484        movl $24, RXORd;
 485        cmovel RXORd, RT2d; /* max */
 486
            /*
             * RXOR is free again after the cmovel, so it keeps the
             * caller's %rbx (used as RAB1) instead of the stack; %rbp
             * (used as RT1) is kept in RRBP.
             */
 487        movq %rbx, RXOR;
 488        movq %rbp, RRBP;
            /* RIO is %rsi, so dst is saved before src is moved into RIO. */
 489        movq %rsi, RDST;
 490        movq %rdx, RIO;
 491
 492        dec_inpack2(RT2);
 493
            /*
             * RT2 must be tested here: the round macros below use it as
             * scratch.  With max == 24 the first group of rounds is
             * skipped.
             */
 494        cmpb $24, RT2bl;
 495        je .L__dec2_rounds16;
 496
 497        dec_rounds2(24);
 498        dec_fls2(24);
 499
 500.L__dec2_rounds16:
 501        dec_rounds2(16);
 502        dec_fls2(16);
 503        dec_rounds2(8);
 504        dec_fls2(8);
 505        dec_rounds2(0);
 506
            /* The rounds clobbered RIO (= RT0); point it at dst again. */
 507        movq RDST, RIO;
 508
 509        dec_outunpack2();
 510
 511        movq RRBP, %rbp;
 512        movq RXOR, %rbx;
 513        ret;
 514ENDPROC(camellia_dec_blk_2way)
 515