linux/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
   1/*
   2 * x86_64/AVX2/AES-NI assembler implementation of Camellia
   3 *
   4 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/linkage.h>
  14
  15#define CAMELLIA_TABLE_BYTE_LEN 272
  16
  17/* struct camellia_ctx: */
  18#define key_table 0
  19#define key_length CAMELLIA_TABLE_BYTE_LEN
  20
  21/* register macros */
  22#define CTX %rdi
  23#define RIO %r8
  24
  25/**********************************************************************
  26  helper macros
  27 **********************************************************************/
  28#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
  29        vpand x, mask4bit, tmp0; \
  30        vpandn x, mask4bit, x; \
  31        vpsrld $4, x, x; \
  32        \
  33        vpshufb tmp0, lo_t, tmp0; \
  34        vpshufb x, hi_t, x; \
  35        vpxor tmp0, x, x;
  36
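/*
 * filter_8bit() evaluates an 8-bit affine (over GF(2)) transform as two
 * 4-bit vpshufb lookups; because the transform is affine, the result for
 * a byte is the XOR of the results for its nibbles (any constant term is
 * folded into one of the tables).  Scalar sketch of one byte lane, with
 * lo_t/hi_t the 16-entry tables and mask4bit = 0x0f in every byte:
 *
 *   x = lo_t[x & 0x0f] ^ hi_t[(x >> 4) & 0x0f];
 *
 * tmp0 is clobbered.
 */
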
  37#define ymm0_x xmm0
  38#define ymm1_x xmm1
  39#define ymm2_x xmm2
  40#define ymm3_x xmm3
  41#define ymm4_x xmm4
  42#define ymm5_x xmm5
  43#define ymm6_x xmm6
  44#define ymm7_x xmm7
  45#define ymm8_x xmm8
  46#define ymm9_x xmm9
  47#define ymm10_x xmm10
  48#define ymm11_x xmm11
  49#define ymm12_x xmm12
  50#define ymm13_x xmm13
  51#define ymm14_x xmm14
  52#define ymm15_x xmm15
  53
  54/**********************************************************************
  55  32-way camellia
  56 **********************************************************************/
  57
  58/*
  59 * IN:
  60 *   x0..x7: byte-sliced AB state
  61 *   mem_cd: register pointer storing CD state
  62 *   key: index for key material
  63 * OUT:
  64 *   x0..x7: new byte-sliced CD state
  65 */
  66#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
  67                  t7, mem_cd, key) \
  68        /* \
  69         * S-function with AES subbytes \
  70         */ \
  71        vbroadcasti128 .Linv_shift_row, t4; \
  72        vpbroadcastd .L0f0f0f0f, t7; \
  73        vbroadcasti128 .Lpre_tf_lo_s1, t5; \
  74        vbroadcasti128 .Lpre_tf_hi_s1, t6; \
  75        vbroadcasti128 .Lpre_tf_lo_s4, t2; \
  76        vbroadcasti128 .Lpre_tf_hi_s4, t3; \
  77        \
  78        /* AES inverse shift rows */ \
  79        vpshufb t4, x0, x0; \
  80        vpshufb t4, x7, x7; \
  81        vpshufb t4, x3, x3; \
  82        vpshufb t4, x6, x6; \
  83        vpshufb t4, x2, x2; \
  84        vpshufb t4, x5, x5; \
  85        vpshufb t4, x1, x1; \
  86        vpshufb t4, x4, x4; \
  87        \
  88        /* prefilter sboxes 1, 2 and 3 */ \
  89        /* prefilter sbox 4 */ \
  90        filter_8bit(x0, t5, t6, t7, t4); \
  91        filter_8bit(x7, t5, t6, t7, t4); \
  92        vextracti128 $1, x0, t0##_x; \
  93        vextracti128 $1, x7, t1##_x; \
  94        filter_8bit(x3, t2, t3, t7, t4); \
  95        filter_8bit(x6, t2, t3, t7, t4); \
  96        vextracti128 $1, x3, t3##_x; \
  97        vextracti128 $1, x6, t2##_x; \
  98        filter_8bit(x2, t5, t6, t7, t4); \
  99        filter_8bit(x5, t5, t6, t7, t4); \
 100        filter_8bit(x1, t5, t6, t7, t4); \
 101        filter_8bit(x4, t5, t6, t7, t4); \
 102        \
 103        vpxor t4##_x, t4##_x, t4##_x; \
 104        \
 105        /* AES subbytes + AES shift rows */ \
 106        vextracti128 $1, x2, t6##_x; \
 107        vextracti128 $1, x5, t5##_x; \
 108        vaesenclast t4##_x, x0##_x, x0##_x; \
 109        vaesenclast t4##_x, t0##_x, t0##_x; \
 110        vinserti128 $1, t0##_x, x0, x0; \
 111        vaesenclast t4##_x, x7##_x, x7##_x; \
 112        vaesenclast t4##_x, t1##_x, t1##_x; \
 113        vinserti128 $1, t1##_x, x7, x7; \
 114        vaesenclast t4##_x, x3##_x, x3##_x; \
 115        vaesenclast t4##_x, t3##_x, t3##_x; \
 116        vinserti128 $1, t3##_x, x3, x3; \
 117        vaesenclast t4##_x, x6##_x, x6##_x; \
 118        vaesenclast t4##_x, t2##_x, t2##_x; \
 119        vinserti128 $1, t2##_x, x6, x6; \
 120        vextracti128 $1, x1, t3##_x; \
 121        vextracti128 $1, x4, t2##_x; \
 122        vbroadcasti128 .Lpost_tf_lo_s1, t0; \
 123        vbroadcasti128 .Lpost_tf_hi_s1, t1; \
 124        vaesenclast t4##_x, x2##_x, x2##_x; \
 125        vaesenclast t4##_x, t6##_x, t6##_x; \
 126        vinserti128 $1, t6##_x, x2, x2; \
 127        vaesenclast t4##_x, x5##_x, x5##_x; \
 128        vaesenclast t4##_x, t5##_x, t5##_x; \
 129        vinserti128 $1, t5##_x, x5, x5; \
 130        vaesenclast t4##_x, x1##_x, x1##_x; \
 131        vaesenclast t4##_x, t3##_x, t3##_x; \
 132        vinserti128 $1, t3##_x, x1, x1; \
 133        vaesenclast t4##_x, x4##_x, x4##_x; \
 134        vaesenclast t4##_x, t2##_x, t2##_x; \
 135        vinserti128 $1, t2##_x, x4, x4; \
 136        \
 137        /* postfilter sboxes 1 and 4 */ \
 138        vbroadcasti128 .Lpost_tf_lo_s3, t2; \
 139        vbroadcasti128 .Lpost_tf_hi_s3, t3; \
 140        filter_8bit(x0, t0, t1, t7, t6); \
 141        filter_8bit(x7, t0, t1, t7, t6); \
 142        filter_8bit(x3, t0, t1, t7, t6); \
 143        filter_8bit(x6, t0, t1, t7, t6); \
 144        \
 145        /* postfilter sbox 3 */ \
 146        vbroadcasti128 .Lpost_tf_lo_s2, t4; \
 147        vbroadcasti128 .Lpost_tf_hi_s2, t5; \
 148        filter_8bit(x2, t2, t3, t7, t6); \
 149        filter_8bit(x5, t2, t3, t7, t6); \
 150        \
 151        vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
 152        \
 153        /* postfilter sbox 2 */ \
 154        filter_8bit(x1, t4, t5, t7, t2); \
 155        filter_8bit(x4, t4, t5, t7, t2); \
 156        vpxor t7, t7, t7; \
 157        \
 158        vpsrldq $1, t0, t1; \
 159        vpsrldq $2, t0, t2; \
 160        vpshufb t7, t1, t1; \
 161        vpsrldq $3, t0, t3; \
 162        \
 163        /* P-function */ \
 164        vpxor x5, x0, x0; \
 165        vpxor x6, x1, x1; \
 166        vpxor x7, x2, x2; \
 167        vpxor x4, x3, x3; \
 168        \
 169        vpshufb t7, t2, t2; \
 170        vpsrldq $4, t0, t4; \
 171        vpshufb t7, t3, t3; \
 172        vpsrldq $5, t0, t5; \
 173        vpshufb t7, t4, t4; \
 174        \
 175        vpxor x2, x4, x4; \
 176        vpxor x3, x5, x5; \
 177        vpxor x0, x6, x6; \
 178        vpxor x1, x7, x7; \
 179        \
 180        vpsrldq $6, t0, t6; \
 181        vpshufb t7, t5, t5; \
 182        vpshufb t7, t6, t6; \
 183        \
 184        vpxor x7, x0, x0; \
 185        vpxor x4, x1, x1; \
 186        vpxor x5, x2, x2; \
 187        vpxor x6, x3, x3; \
 188        \
 189        vpxor x3, x4, x4; \
 190        vpxor x0, x5, x5; \
 191        vpxor x1, x6, x6; \
 192        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
 193        \
 194        /* Add key material and result to CD (x becomes new CD) */ \
 195        \
 196        vpxor t6, x1, x1; \
 197        vpxor 5 * 32(mem_cd), x1, x1; \
 198        \
 199        vpsrldq $7, t0, t6; \
 200        vpshufb t7, t0, t0; \
 201        vpshufb t7, t6, t7; \
 202        \
 203        vpxor t7, x0, x0; \
 204        vpxor 4 * 32(mem_cd), x0, x0; \
 205        \
 206        vpxor t5, x2, x2; \
 207        vpxor 6 * 32(mem_cd), x2, x2; \
 208        \
 209        vpxor t4, x3, x3; \
 210        vpxor 7 * 32(mem_cd), x3, x3; \
 211        \
 212        vpxor t3, x4, x4; \
 213        vpxor 0 * 32(mem_cd), x4, x4; \
 214        \
 215        vpxor t2, x5, x5; \
 216        vpxor 1 * 32(mem_cd), x5, x5; \
 217        \
 218        vpxor t1, x6, x6; \
 219        vpxor 2 * 32(mem_cd), x6, x6; \
 220        \
 221        vpxor t0, x7, x7; \
 222        vpxor 3 * 32(mem_cd), x7, x7;
 223
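/*
 * Per-byte sketch of the S-function above: Camellia s1 is affine
 * equivalent to the AES S-box, so one vaesenclast with an all-zero round
 * key provides the non-linear part of all four Camellia s-boxes at once.
 * With pre_sN()/post_sN() denoting the filter_8bit() affine lookups:
 *
 *   s1(x) = post_s1(aes_sbox(pre_s1(x)))
 *   s2(x) = rol8(s1(x), 1)        (rotation folded into .Lpost_tf_*_s2)
 *   s3(x) = ror8(s1(x), 1)        (rotation folded into .Lpost_tf_*_s3)
 *   s4(x) = s1(rol8(x, 1))        (rotation folded into .Lpre_tf_*_s4)
 *
 * The .Linv_shift_row shuffle applied beforehand cancels the ShiftRows
 * step performed by vaesenclast, leaving pure SubBytes.  The P-function
 * is Camellia's byte-XOR mixing, and the vpsrldq/vpshufb chain on t0
 * splays the 64-bit round subkey into one broadcast byte per register
 * for the key addition.
 */
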
 224/*
 225 * Size optimization... with inlined roundsm32 binary would be over 5 times
 226 * larger and would only marginally faster.
 227 */
 228.align 8
 229roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
 230        roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 231                  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
 232                  %rcx, (%r9));
 233        ret;
 234ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
 235
 236.align 8
 237roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
 238        roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
 239                  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
 240                  %rax, (%r9));
 241        ret;
 242ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 243
 244/*
 245 * IN/OUT:
 246 *  x0..x7: byte-sliced AB state preloaded
 247 *  mem_ab: byte-sliced AB state in memory
  248 *  mem_cd: byte-sliced CD state in memory
 249 */
 250#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 251                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
 252        leaq (key_table + (i) * 8)(CTX), %r9; \
 253        call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
 254        \
 255        vmovdqu x0, 4 * 32(mem_cd); \
 256        vmovdqu x1, 5 * 32(mem_cd); \
 257        vmovdqu x2, 6 * 32(mem_cd); \
 258        vmovdqu x3, 7 * 32(mem_cd); \
 259        vmovdqu x4, 0 * 32(mem_cd); \
 260        vmovdqu x5, 1 * 32(mem_cd); \
 261        vmovdqu x6, 2 * 32(mem_cd); \
 262        vmovdqu x7, 3 * 32(mem_cd); \
 263        \
 264        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
 265        call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
 266        \
 267        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
 268
 269#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
 270
 271#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
 272        /* Store new AB state */ \
 273        vmovdqu x4, 4 * 32(mem_ab); \
 274        vmovdqu x5, 5 * 32(mem_ab); \
 275        vmovdqu x6, 6 * 32(mem_ab); \
 276        vmovdqu x7, 7 * 32(mem_ab); \
 277        vmovdqu x0, 0 * 32(mem_ab); \
 278        vmovdqu x1, 1 * 32(mem_ab); \
 279        vmovdqu x2, 2 * 32(mem_ab); \
 280        vmovdqu x3, 3 * 32(mem_ab);
 281
 282#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 283                      y6, y7, mem_ab, mem_cd, i) \
 284        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 285                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
 286        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 287                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
 288        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 289                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
 290
 291#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 292                      y6, y7, mem_ab, mem_cd, i) \
 293        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 294                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
 295        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 296                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
 297        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 298                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
 299
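/*
 * Subkey indexing used by the macros above: each round consumes one
 * 64-bit subkey at key_table + round * 8, and dir selects the direction
 * of the walk (+1 for encryption, -1 for decryption).  Feistel sketch of
 * one two_roundsm32 step, with k[] the subkey table:
 *
 *   cd ^= F(ab, k[i]);            (roundsm32_..._cd)
 *   ab ^= F(cd, k[i + dir]);      (roundsm32_..._ab)
 */
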
 300/*
 301 * IN:
 302 *  v0..3: byte-sliced 32-bit integers
 303 * OUT:
 304 *  v0..3: (IN <<< 1)
 305 */
 306#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
 307        vpcmpgtb v0, zero, t0; \
 308        vpaddb v0, v0, v0; \
 309        vpabsb t0, t0; \
 310        \
 311        vpcmpgtb v1, zero, t1; \
 312        vpaddb v1, v1, v1; \
 313        vpabsb t1, t1; \
 314        \
 315        vpcmpgtb v2, zero, t2; \
 316        vpaddb v2, v2, v2; \
 317        vpabsb t2, t2; \
 318        \
 319        vpor t0, v1, v1; \
 320        \
 321        vpcmpgtb v3, zero, t0; \
 322        vpaddb v3, v3, v3; \
 323        vpabsb t0, t0; \
 324        \
 325        vpor t1, v2, v2; \
 326        vpor t2, v3, v3; \
 327        vpor t0, v0, v0;
 328
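/*
 * rol32_1_32() rotates byte-sliced 32-bit words left by one bit: every
 * byte is doubled (vpaddb) and the carried-out MSB, extracted by
 * vpcmpgtb against zero plus vpabsb, is OR'ed into the next slice along
 * the chain v0 -> v1 -> v2 -> v3 -> v0.  Scalar sketch for one lane,
 * with v3 as the most significant slice (as implied by the wrap-around):
 *
 *   carry = v3 >> 7;
 *   v3 = (v3 << 1) | (v2 >> 7);
 *   v2 = (v2 << 1) | (v1 >> 7);
 *   v1 = (v1 << 1) | (v0 >> 7);
 *   v0 = (v0 << 1) | carry;
 */
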
 329/*
 330 * IN:
 331 *   r: byte-sliced AB state in memory
 332 *   l: byte-sliced CD state in memory
 333 * OUT:
 334 *   x0..x7: new byte-sliced CD state
 335 */
 336#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
 337              tt1, tt2, tt3, kll, klr, krl, krr) \
 338        /* \
 339         * t0 = kll; \
 340         * t0 &= ll; \
 341         * lr ^= rol32(t0, 1); \
 342         */ \
 343        vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
 344        vpxor tt0, tt0, tt0; \
 345        vpshufb tt0, t0, t3; \
 346        vpsrldq $1, t0, t0; \
 347        vpshufb tt0, t0, t2; \
 348        vpsrldq $1, t0, t0; \
 349        vpshufb tt0, t0, t1; \
 350        vpsrldq $1, t0, t0; \
 351        vpshufb tt0, t0, t0; \
 352        \
 353        vpand l0, t0, t0; \
 354        vpand l1, t1, t1; \
 355        vpand l2, t2, t2; \
 356        vpand l3, t3, t3; \
 357        \
 358        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 359        \
 360        vpxor l4, t0, l4; \
 361        vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
 362        vmovdqu l4, 4 * 32(l); \
 363        vpxor l5, t1, l5; \
 364        vmovdqu l5, 5 * 32(l); \
 365        vpxor l6, t2, l6; \
 366        vmovdqu l6, 6 * 32(l); \
 367        vpxor l7, t3, l7; \
 368        vmovdqu l7, 7 * 32(l); \
 369        \
 370        /* \
 371         * t2 = krr; \
 372         * t2 |= rr; \
 373         * rl ^= t2; \
 374         */ \
 375        \
 376        vpshufb tt0, t0, t3; \
 377        vpsrldq $1, t0, t0; \
 378        vpshufb tt0, t0, t2; \
 379        vpsrldq $1, t0, t0; \
 380        vpshufb tt0, t0, t1; \
 381        vpsrldq $1, t0, t0; \
 382        vpshufb tt0, t0, t0; \
 383        \
 384        vpor 4 * 32(r), t0, t0; \
 385        vpor 5 * 32(r), t1, t1; \
 386        vpor 6 * 32(r), t2, t2; \
 387        vpor 7 * 32(r), t3, t3; \
 388        \
 389        vpxor 0 * 32(r), t0, t0; \
 390        vpxor 1 * 32(r), t1, t1; \
 391        vpxor 2 * 32(r), t2, t2; \
 392        vpxor 3 * 32(r), t3, t3; \
 393        vmovdqu t0, 0 * 32(r); \
 394        vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
 395        vmovdqu t1, 1 * 32(r); \
 396        vmovdqu t2, 2 * 32(r); \
 397        vmovdqu t3, 3 * 32(r); \
 398        \
 399        /* \
 400         * t2 = krl; \
 401         * t2 &= rl; \
 402         * rr ^= rol32(t2, 1); \
 403         */ \
 404        vpshufb tt0, t0, t3; \
 405        vpsrldq $1, t0, t0; \
 406        vpshufb tt0, t0, t2; \
 407        vpsrldq $1, t0, t0; \
 408        vpshufb tt0, t0, t1; \
 409        vpsrldq $1, t0, t0; \
 410        vpshufb tt0, t0, t0; \
 411        \
 412        vpand 0 * 32(r), t0, t0; \
 413        vpand 1 * 32(r), t1, t1; \
 414        vpand 2 * 32(r), t2, t2; \
 415        vpand 3 * 32(r), t3, t3; \
 416        \
 417        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 418        \
 419        vpxor 4 * 32(r), t0, t0; \
 420        vpxor 5 * 32(r), t1, t1; \
 421        vpxor 6 * 32(r), t2, t2; \
 422        vpxor 7 * 32(r), t3, t3; \
 423        vmovdqu t0, 4 * 32(r); \
 424        vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
 425        vmovdqu t1, 5 * 32(r); \
 426        vmovdqu t2, 6 * 32(r); \
 427        vmovdqu t3, 7 * 32(r); \
 428        \
 429        /* \
 430         * t0 = klr; \
 431         * t0 |= lr; \
 432         * ll ^= t0; \
 433         */ \
 434        \
 435        vpshufb tt0, t0, t3; \
 436        vpsrldq $1, t0, t0; \
 437        vpshufb tt0, t0, t2; \
 438        vpsrldq $1, t0, t0; \
 439        vpshufb tt0, t0, t1; \
 440        vpsrldq $1, t0, t0; \
 441        vpshufb tt0, t0, t0; \
 442        \
 443        vpor l4, t0, t0; \
 444        vpor l5, t1, t1; \
 445        vpor l6, t2, t2; \
 446        vpor l7, t3, t3; \
 447        \
 448        vpxor l0, t0, l0; \
 449        vmovdqu l0, 0 * 32(l); \
 450        vpxor l1, t1, l1; \
 451        vmovdqu l1, 1 * 32(l); \
 452        vpxor l2, t2, l2; \
 453        vmovdqu l2, 2 * 32(l); \
 454        vpxor l3, t3, l3; \
 455        vmovdqu l3, 3 * 32(l);
 456
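/*
 * fls32() is the Camellia FL/FL⁻¹ layer inserted between every six
 * rounds.  Scalar sketch per block, using the macro's own names (ll/lr
 * and rl/rr are the 32-bit quarters, kll/klr/krl/krr the subkey words),
 * in the same order as the code above:
 *
 *   lr ^= rol32(ll & kll, 1);     (FL)
 *   rl ^= (rr | krr);             (FL⁻¹)
 *   rr ^= rol32(rl & krl, 1);
 *   ll ^= (lr | klr);
 */
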
 457#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
 458        vpunpckhdq x1, x0, t2; \
 459        vpunpckldq x1, x0, x0; \
 460        \
 461        vpunpckldq x3, x2, t1; \
 462        vpunpckhdq x3, x2, x2; \
 463        \
 464        vpunpckhqdq t1, x0, x1; \
 465        vpunpcklqdq t1, x0, x0; \
 466        \
 467        vpunpckhqdq x2, t2, x3; \
 468        vpunpcklqdq x2, t2, x2;
 469
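/*
 * transpose_4x4() is a 4x4 transpose of 32-bit elements, done separately
 * in each 128-bit lane with unpack low/high pairs.  Sketch, per lane:
 *
 *   | x0: a0 a1 a2 a3 |       | x0: a0 b0 c0 d0 |
 *   | x1: b0 b1 b2 b3 |  -->  | x1: a1 b1 c1 d1 |
 *   | x2: c0 c1 c2 c3 |       | x2: a2 b2 c2 d2 |
 *   | x3: d0 d1 d2 d3 |       | x3: a3 b3 c3 d3 |
 *
 * t1 and t2 are clobbered.
 */
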
 470#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
 471                              a3, b3, c3, d3, st0, st1) \
 472        vmovdqu d2, st0; \
 473        vmovdqu d3, st1; \
 474        transpose_4x4(a0, a1, a2, a3, d2, d3); \
 475        transpose_4x4(b0, b1, b2, b3, d2, d3); \
 476        vmovdqu st0, d2; \
 477        vmovdqu st1, d3; \
 478        \
 479        vmovdqu a0, st0; \
 480        vmovdqu a1, st1; \
 481        transpose_4x4(c0, c1, c2, c3, a0, a1); \
 482        transpose_4x4(d0, d1, d2, d3, a0, a1); \
 483        \
 484        vbroadcasti128 .Lshufb_16x16b, a0; \
 485        vmovdqu st1, a1; \
 486        vpshufb a0, a2, a2; \
 487        vpshufb a0, a3, a3; \
 488        vpshufb a0, b0, b0; \
 489        vpshufb a0, b1, b1; \
 490        vpshufb a0, b2, b2; \
 491        vpshufb a0, b3, b3; \
 492        vpshufb a0, a1, a1; \
 493        vpshufb a0, c0, c0; \
 494        vpshufb a0, c1, c1; \
 495        vpshufb a0, c2, c2; \
 496        vpshufb a0, c3, c3; \
 497        vpshufb a0, d0, d0; \
 498        vpshufb a0, d1, d1; \
 499        vpshufb a0, d2, d2; \
 500        vpshufb a0, d3, d3; \
 501        vmovdqu d3, st1; \
 502        vmovdqu st0, d3; \
 503        vpshufb a0, d3, a0; \
 504        vmovdqu d2, st0; \
 505        \
 506        transpose_4x4(a0, b0, c0, d0, d2, d3); \
 507        transpose_4x4(a1, b1, c1, d1, d2, d3); \
 508        vmovdqu st0, d2; \
 509        vmovdqu st1, d3; \
 510        \
 511        vmovdqu b0, st0; \
 512        vmovdqu b1, st1; \
 513        transpose_4x4(a2, b2, c2, d2, b0, b1); \
 514        transpose_4x4(a3, b3, c3, d3, b0, b1); \
 515        vmovdqu st0, b0; \
 516        vmovdqu st1, b1; \
 517        /* does not adjust output bytes inside vectors */
 518
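/*
 * After byteslice_16x16b_fast() the state is byte-sliced: each ymm
 * register holds one byte position from all 32 blocks (16 bytes per
 * 128-bit lane), which is what lets a single vpshufb lookup or
 * vaesenclast evaluate one s-box for 32 blocks in parallel.
 */
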
 519/* load blocks to registers and apply pre-whitening */
 520#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 521                     y6, y7, rio, key) \
 522        vpbroadcastq key, x0; \
 523        vpshufb .Lpack_bswap, x0, x0; \
 524        \
 525        vpxor 0 * 32(rio), x0, y7; \
 526        vpxor 1 * 32(rio), x0, y6; \
 527        vpxor 2 * 32(rio), x0, y5; \
 528        vpxor 3 * 32(rio), x0, y4; \
 529        vpxor 4 * 32(rio), x0, y3; \
 530        vpxor 5 * 32(rio), x0, y2; \
 531        vpxor 6 * 32(rio), x0, y1; \
 532        vpxor 7 * 32(rio), x0, y0; \
 533        vpxor 8 * 32(rio), x0, x7; \
 534        vpxor 9 * 32(rio), x0, x6; \
 535        vpxor 10 * 32(rio), x0, x5; \
 536        vpxor 11 * 32(rio), x0, x4; \
 537        vpxor 12 * 32(rio), x0, x3; \
 538        vpxor 13 * 32(rio), x0, x2; \
 539        vpxor 14 * 32(rio), x0, x1; \
 540        vpxor 15 * 32(rio), x0, x0;
 541
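/*
 * inpack32_pre() loads the 32 blocks and applies the input whitening:
 * the 64-bit whitening subkey is broadcast, rearranged by .Lpack_bswap
 * (which also zeroes the 64-bit half whose whitening is folded into the
 * other subkeys by the key schedule), and XOR'ed into every block.
 * Per-block sketch:
 *
 *   block ^= pack_bswap(kw);
 */
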
 542/* byteslice pre-whitened blocks and store to temporary memory */
 543#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 544                      y6, y7, mem_ab, mem_cd) \
 545        byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
 546                              y4, y5, y6, y7, (mem_ab), (mem_cd)); \
 547        \
 548        vmovdqu x0, 0 * 32(mem_ab); \
 549        vmovdqu x1, 1 * 32(mem_ab); \
 550        vmovdqu x2, 2 * 32(mem_ab); \
 551        vmovdqu x3, 3 * 32(mem_ab); \
 552        vmovdqu x4, 4 * 32(mem_ab); \
 553        vmovdqu x5, 5 * 32(mem_ab); \
 554        vmovdqu x6, 6 * 32(mem_ab); \
 555        vmovdqu x7, 7 * 32(mem_ab); \
 556        vmovdqu y0, 0 * 32(mem_cd); \
 557        vmovdqu y1, 1 * 32(mem_cd); \
 558        vmovdqu y2, 2 * 32(mem_cd); \
 559        vmovdqu y3, 3 * 32(mem_cd); \
 560        vmovdqu y4, 4 * 32(mem_cd); \
 561        vmovdqu y5, 5 * 32(mem_cd); \
 562        vmovdqu y6, 6 * 32(mem_cd); \
 563        vmovdqu y7, 7 * 32(mem_cd);
 564
 565/* de-byteslice, apply post-whitening and store blocks */
 566#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
 567                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
 568        byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
 569                              y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
 570        \
 571        vmovdqu x0, stack_tmp0; \
 572        \
 573        vpbroadcastq key, x0; \
 574        vpshufb .Lpack_bswap, x0, x0; \
 575        \
 576        vpxor x0, y7, y7; \
 577        vpxor x0, y6, y6; \
 578        vpxor x0, y5, y5; \
 579        vpxor x0, y4, y4; \
 580        vpxor x0, y3, y3; \
 581        vpxor x0, y2, y2; \
 582        vpxor x0, y1, y1; \
 583        vpxor x0, y0, y0; \
 584        vpxor x0, x7, x7; \
 585        vpxor x0, x6, x6; \
 586        vpxor x0, x5, x5; \
 587        vpxor x0, x4, x4; \
 588        vpxor x0, x3, x3; \
 589        vpxor x0, x2, x2; \
 590        vpxor x0, x1, x1; \
 591        vpxor stack_tmp0, x0, x0;
 592
 593#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 594                     y6, y7, rio) \
 595        vmovdqu x0, 0 * 32(rio); \
 596        vmovdqu x1, 1 * 32(rio); \
 597        vmovdqu x2, 2 * 32(rio); \
 598        vmovdqu x3, 3 * 32(rio); \
 599        vmovdqu x4, 4 * 32(rio); \
 600        vmovdqu x5, 5 * 32(rio); \
 601        vmovdqu x6, 6 * 32(rio); \
 602        vmovdqu x7, 7 * 32(rio); \
 603        vmovdqu y0, 8 * 32(rio); \
 604        vmovdqu y1, 9 * 32(rio); \
 605        vmovdqu y2, 10 * 32(rio); \
 606        vmovdqu y3, 11 * 32(rio); \
 607        vmovdqu y4, 12 * 32(rio); \
 608        vmovdqu y5, 13 * 32(rio); \
 609        vmovdqu y6, 14 * 32(rio); \
 610        vmovdqu y7, 15 * 32(rio);
 611
 612.data
 613.align 32
 614
 615#define SHUFB_BYTES(idx) \
 616        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 617
 618.Lshufb_16x16b:
 619        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 620        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 621
 622.Lpack_bswap:
 623        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 624        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 625
 626/* For CTR-mode IV byteswap */
 627.Lbswap128_mask:
 628        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 629
 630/* For XTS mode */
 631.Lxts_gf128mul_and_shl1_mask_0:
 632        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 633.Lxts_gf128mul_and_shl1_mask_1:
 634        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
 635
 636/*
 637 * pre-SubByte transform
 638 *
 639 * pre-lookup for sbox1, sbox2, sbox3:
 640 *   swap_bitendianness(
 641 *       isom_map_camellia_to_aes(
 642 *           camellia_f(
 643 *               swap_bitendianess(in)
 644 *           )
 645 *       )
 646 *   )
 647 *
 648 * (note: '⊕ 0xc5' inside camellia_f())
 649 */
 650.Lpre_tf_lo_s1:
 651        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
 652        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
 653.Lpre_tf_hi_s1:
 654        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
 655        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
 656
 657/*
 658 * pre-SubByte transform
 659 *
 660 * pre-lookup for sbox4:
 661 *   swap_bitendianness(
 662 *       isom_map_camellia_to_aes(
 663 *           camellia_f(
  664 *               swap_bitendianness(in <<< 1)
 665 *           )
 666 *       )
 667 *   )
 668 *
 669 * (note: '⊕ 0xc5' inside camellia_f())
 670 */
 671.Lpre_tf_lo_s4:
 672        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
 673        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
 674.Lpre_tf_hi_s4:
 675        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
 676        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
 677
 678/*
 679 * post-SubByte transform
 680 *
 681 * post-lookup for sbox1, sbox4:
 682 *  swap_bitendianness(
 683 *      camellia_h(
 684 *          isom_map_aes_to_camellia(
 685 *              swap_bitendianness(
 686 *                  aes_inverse_affine_transform(in)
 687 *              )
 688 *          )
 689 *      )
 690 *  )
 691 *
 692 * (note: '⊕ 0x6e' inside camellia_h())
 693 */
 694.Lpost_tf_lo_s1:
 695        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
 696        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
 697.Lpost_tf_hi_s1:
 698        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
 699        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
 700
 701/*
 702 * post-SubByte transform
 703 *
 704 * post-lookup for sbox2:
 705 *  swap_bitendianness(
 706 *      camellia_h(
 707 *          isom_map_aes_to_camellia(
 708 *              swap_bitendianness(
 709 *                  aes_inverse_affine_transform(in)
 710 *              )
 711 *          )
 712 *      )
 713 *  ) <<< 1
 714 *
 715 * (note: '⊕ 0x6e' inside camellia_h())
 716 */
 717.Lpost_tf_lo_s2:
 718        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
 719        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
 720.Lpost_tf_hi_s2:
 721        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
 722        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
 723
 724/*
 725 * post-SubByte transform
 726 *
 727 * post-lookup for sbox3:
 728 *  swap_bitendianness(
 729 *      camellia_h(
 730 *          isom_map_aes_to_camellia(
 731 *              swap_bitendianness(
 732 *                  aes_inverse_affine_transform(in)
 733 *              )
 734 *          )
 735 *      )
 736 *  ) >>> 1
 737 *
 738 * (note: '⊕ 0x6e' inside camellia_h())
 739 */
 740.Lpost_tf_lo_s3:
 741        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
 742        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
 743.Lpost_tf_hi_s3:
 744        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
 745        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
 746
 747/* For isolating SubBytes from AESENCLAST, inverse shift row */
 748.Linv_shift_row:
 749        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 750        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 751
 752.align 4
 753/* 4-bit mask */
 754.L0f0f0f0f:
 755        .long 0x0f0f0f0f
 756
 757.text
 758
 759.align 8
 760__camellia_enc_blk32:
 761        /* input:
 762         *      %rdi: ctx, CTX
 763         *      %rax: temporary storage, 512 bytes
 764         *      %ymm0..%ymm15: 32 plaintext blocks
 765         * output:
 766         *      %ymm0..%ymm15: 32 encrypted blocks, order swapped:
  767         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 768         */
 769
 770        leaq 8 * 32(%rax), %rcx;
 771
 772        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 773                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 774                      %ymm15, %rax, %rcx);
 775
 776        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 777                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 778                     %ymm15, %rax, %rcx, 0);
 779
 780        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 781              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 782              %ymm15,
 783              ((key_table + (8) * 8) + 0)(CTX),
 784              ((key_table + (8) * 8) + 4)(CTX),
 785              ((key_table + (8) * 8) + 8)(CTX),
 786              ((key_table + (8) * 8) + 12)(CTX));
 787
 788        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 789                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 790                     %ymm15, %rax, %rcx, 8);
 791
 792        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 793              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 794              %ymm15,
 795              ((key_table + (16) * 8) + 0)(CTX),
 796              ((key_table + (16) * 8) + 4)(CTX),
 797              ((key_table + (16) * 8) + 8)(CTX),
 798              ((key_table + (16) * 8) + 12)(CTX));
 799
 800        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 801                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 802                     %ymm15, %rax, %rcx, 16);
 803
 804        movl $24, %r8d;
 805        cmpl $16, key_length(CTX);
 806        jne .Lenc_max32;
 807
 808.Lenc_done:
 809        /* load CD for output */
 810        vmovdqu 0 * 32(%rcx), %ymm8;
 811        vmovdqu 1 * 32(%rcx), %ymm9;
 812        vmovdqu 2 * 32(%rcx), %ymm10;
 813        vmovdqu 3 * 32(%rcx), %ymm11;
 814        vmovdqu 4 * 32(%rcx), %ymm12;
 815        vmovdqu 5 * 32(%rcx), %ymm13;
 816        vmovdqu 6 * 32(%rcx), %ymm14;
 817        vmovdqu 7 * 32(%rcx), %ymm15;
 818
 819        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 820                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 821                    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
 822
 823        ret;
 824
 825.align 8
 826.Lenc_max32:
 827        movl $32, %r8d;
 828
 829        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 830              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 831              %ymm15,
 832              ((key_table + (24) * 8) + 0)(CTX),
 833              ((key_table + (24) * 8) + 4)(CTX),
 834              ((key_table + (24) * 8) + 8)(CTX),
 835              ((key_table + (24) * 8) + 12)(CTX));
 836
 837        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 838                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 839                     %ymm15, %rax, %rcx, 24);
 840
 841        jmp .Lenc_done;
 842ENDPROC(__camellia_enc_blk32)
 843
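/*
 * Round structure of __camellia_enc_blk32 above (mirrored backwards by
 * __camellia_dec_blk32 below), a sketch in Camellia terms: 6 rounds,
 * FL/FL⁻¹, 6 rounds, FL/FL⁻¹, 6 rounds for 128-bit keys; 192/256-bit
 * keys (key_length > 16) add one more FL/FL⁻¹ layer and 6 rounds.  For
 * encryption %r8d ends up as 24 or 32, the subkey index of the output
 * whitening applied by outunpack32(); decryption walks the same subkeys
 * in reverse and whitens with key_table[0].
 */
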
 844.align 8
 845__camellia_dec_blk32:
 846        /* input:
 847         *      %rdi: ctx, CTX
 848         *      %rax: temporary storage, 512 bytes
  849         *      %r8d: 24 for 16-byte key, 32 for larger
  850         *      %ymm0..%ymm15: 32 encrypted blocks
  851         * output:
  852         *      %ymm0..%ymm15: 32 plaintext blocks, order swapped:
  853         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 854         */
 855
 856        leaq 8 * 32(%rax), %rcx;
 857
 858        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 859                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 860                      %ymm15, %rax, %rcx);
 861
 862        cmpl $32, %r8d;
 863        je .Ldec_max32;
 864
 865.Ldec_max24:
 866        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 867                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 868                     %ymm15, %rax, %rcx, 16);
 869
 870        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 871              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 872              %ymm15,
 873              ((key_table + (16) * 8) + 8)(CTX),
 874              ((key_table + (16) * 8) + 12)(CTX),
 875              ((key_table + (16) * 8) + 0)(CTX),
 876              ((key_table + (16) * 8) + 4)(CTX));
 877
 878        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 879                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 880                     %ymm15, %rax, %rcx, 8);
 881
 882        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 883              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 884              %ymm15,
 885              ((key_table + (8) * 8) + 8)(CTX),
 886              ((key_table + (8) * 8) + 12)(CTX),
 887              ((key_table + (8) * 8) + 0)(CTX),
 888              ((key_table + (8) * 8) + 4)(CTX));
 889
 890        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 891                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 892                     %ymm15, %rax, %rcx, 0);
 893
 894        /* load CD for output */
 895        vmovdqu 0 * 32(%rcx), %ymm8;
 896        vmovdqu 1 * 32(%rcx), %ymm9;
 897        vmovdqu 2 * 32(%rcx), %ymm10;
 898        vmovdqu 3 * 32(%rcx), %ymm11;
 899        vmovdqu 4 * 32(%rcx), %ymm12;
 900        vmovdqu 5 * 32(%rcx), %ymm13;
 901        vmovdqu 6 * 32(%rcx), %ymm14;
 902        vmovdqu 7 * 32(%rcx), %ymm15;
 903
 904        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 905                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 906                    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
 907
 908        ret;
 909
 910.align 8
 911.Ldec_max32:
 912        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 913                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 914                     %ymm15, %rax, %rcx, 24);
 915
 916        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 917              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 918              %ymm15,
 919              ((key_table + (24) * 8) + 8)(CTX),
 920              ((key_table + (24) * 8) + 12)(CTX),
 921              ((key_table + (24) * 8) + 0)(CTX),
 922              ((key_table + (24) * 8) + 4)(CTX));
 923
 924        jmp .Ldec_max24;
 925ENDPROC(__camellia_dec_blk32)
 926
 927ENTRY(camellia_ecb_enc_32way)
 928        /* input:
 929         *      %rdi: ctx, CTX
 930         *      %rsi: dst (32 blocks)
 931         *      %rdx: src (32 blocks)
 932         */
 933
 934        vzeroupper;
 935
 936        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 937                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 938                     %ymm15, %rdx, (key_table)(CTX));
 939
 940        /* now dst can be used as temporary buffer (even in src == dst case) */
 941        movq    %rsi, %rax;
 942
 943        call __camellia_enc_blk32;
 944
 945        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
 946                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
 947                     %ymm8, %rsi);
 948
 949        vzeroupper;
 950
 951        ret;
 952ENDPROC(camellia_ecb_enc_32way)
 953
 954ENTRY(camellia_ecb_dec_32way)
 955        /* input:
 956         *      %rdi: ctx, CTX
 957         *      %rsi: dst (32 blocks)
 958         *      %rdx: src (32 blocks)
 959         */
 960
 961        vzeroupper;
 962
 963        cmpl $16, key_length(CTX);
 964        movl $32, %r8d;
 965        movl $24, %eax;
 966        cmovel %eax, %r8d; /* max */
 967
 968        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 969                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
 970                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
 971
 972        /* now dst can be used as temporary buffer (even in src == dst case) */
 973        movq    %rsi, %rax;
 974
 975        call __camellia_dec_blk32;
 976
 977        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
 978                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
 979                     %ymm8, %rsi);
 980
 981        vzeroupper;
 982
 983        ret;
 984ENDPROC(camellia_ecb_dec_32way)
 985
 986ENTRY(camellia_cbc_dec_32way)
 987        /* input:
 988         *      %rdi: ctx, CTX
 989         *      %rsi: dst (32 blocks)
 990         *      %rdx: src (32 blocks)
 991         */
 992
 993        vzeroupper;
 994
 995        cmpl $16, key_length(CTX);
 996        movl $32, %r8d;
 997        movl $24, %eax;
 998        cmovel %eax, %r8d; /* max */
 999
1000        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1001                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1002                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
1003
1004        movq %rsp, %r10;
1005        cmpq %rsi, %rdx;
1006        je .Lcbc_dec_use_stack;
1007
1008        /* dst can be used as temporary storage, src is not overwritten. */
1009        movq %rsi, %rax;
1010        jmp .Lcbc_dec_continue;
1011
1012.Lcbc_dec_use_stack:
1013        /*
1014         * dst still in-use (because dst == src), so use stack for temporary
1015         * storage.
1016         */
1017        subq $(16 * 32), %rsp;
1018        movq %rsp, %rax;
1019
1020.Lcbc_dec_continue:
1021        call __camellia_dec_blk32;
1022
1023        vmovdqu %ymm7, (%rax);
1024        vpxor %ymm7, %ymm7, %ymm7;
1025        vinserti128 $1, (%rdx), %ymm7, %ymm7;
1026        vpxor (%rax), %ymm7, %ymm7;
1027        movq %r10, %rsp;
1028        vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1029        vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1030        vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1031        vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1032        vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1033        vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1034        vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1035        vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1036        vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1037        vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1038        vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1039        vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1040        vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1041        vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1042        vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1043        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1044                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1045                     %ymm8, %rsi);
1046
1047        vzeroupper;
1048
1049        ret;
1050ENDPROC(camellia_cbc_dec_32way)
1051
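/*
 * CBC decryption chaining done above (sketch): with C[-1] = IV,
 *
 *   P[i] = D(C[i]) ^ C[i - 1]
 *
 * so each decrypted register is XOR'ed with the ciphertext shifted by
 * one 16-byte block (the "+ 16" offsets).  The very first block is
 * written back as plain D(C[0]); its IV XOR is left to the caller.
 */
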
1052#define inc_le128(x, minus_one, tmp) \
1053        vpcmpeqq minus_one, x, tmp; \
1054        vpsubq minus_one, x, x; \
1055        vpslldq $8, tmp, tmp; \
1056        vpsubq tmp, x, x;
1057
1058#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
1059        vpcmpeqq minus_one, x, tmp1; \
1060        vpcmpeqq minus_two, x, tmp2; \
1061        vpsubq minus_two, x, x; \
1062        vpor tmp2, tmp1, tmp1; \
1063        vpslldq $8, tmp1, tmp1; \
1064        vpsubq tmp1, x, x;
1065
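/*
 * 128-bit little-endian increment without a 128-bit adder.  minus_one
 * holds { lo = ~0, hi = 0 } in each 128-bit lane; scalar sketch:
 *
 *   carry = (lo == ~0ull);        (vpcmpeqq)
 *   lo += 1;                      (vpsubq minus_one)
 *   hi += carry;                  (carry mask moved up by vpslldq $8)
 *
 * add2_le128 adds 2 the same way; the carry-out happens when lo was
 * ~0ull or ~0ull - 1, hence the two compares OR'ed together.
 */
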
1066ENTRY(camellia_ctr_32way)
1067        /* input:
1068         *      %rdi: ctx, CTX
1069         *      %rsi: dst (32 blocks)
1070         *      %rdx: src (32 blocks)
1071         *      %rcx: iv (little endian, 128bit)
1072         */
1073
1074        vzeroupper;
1075
1076        movq %rsp, %r10;
1077        cmpq %rsi, %rdx;
1078        je .Lctr_use_stack;
1079
1080        /* dst can be used as temporary storage, src is not overwritten. */
1081        movq %rsi, %rax;
1082        jmp .Lctr_continue;
1083
1084.Lctr_use_stack:
1085        subq $(16 * 32), %rsp;
1086        movq %rsp, %rax;
1087
1088.Lctr_continue:
1089        vpcmpeqd %ymm15, %ymm15, %ymm15;
1090        vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1091        vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1092
1093        /* load IV and byteswap */
1094        vmovdqu (%rcx), %xmm0;
1095        vmovdqa %xmm0, %xmm1;
1096        inc_le128(%xmm0, %xmm15, %xmm14);
1097        vbroadcasti128 .Lbswap128_mask, %ymm14;
1098        vinserti128 $1, %xmm0, %ymm1, %ymm0;
1099        vpshufb %ymm14, %ymm0, %ymm13;
1100        vmovdqu %ymm13, 15 * 32(%rax);
1101
1102        /* construct IVs */
1103        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1104        vpshufb %ymm14, %ymm0, %ymm13;
1105        vmovdqu %ymm13, 14 * 32(%rax);
1106        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1107        vpshufb %ymm14, %ymm0, %ymm13;
1108        vmovdqu %ymm13, 13 * 32(%rax);
1109        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1110        vpshufb %ymm14, %ymm0, %ymm13;
1111        vmovdqu %ymm13, 12 * 32(%rax);
1112        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1113        vpshufb %ymm14, %ymm0, %ymm13;
1114        vmovdqu %ymm13, 11 * 32(%rax);
1115        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1116        vpshufb %ymm14, %ymm0, %ymm10;
1117        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1118        vpshufb %ymm14, %ymm0, %ymm9;
1119        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1120        vpshufb %ymm14, %ymm0, %ymm8;
1121        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1122        vpshufb %ymm14, %ymm0, %ymm7;
1123        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1124        vpshufb %ymm14, %ymm0, %ymm6;
1125        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1126        vpshufb %ymm14, %ymm0, %ymm5;
1127        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1128        vpshufb %ymm14, %ymm0, %ymm4;
1129        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130        vpshufb %ymm14, %ymm0, %ymm3;
1131        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132        vpshufb %ymm14, %ymm0, %ymm2;
1133        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134        vpshufb %ymm14, %ymm0, %ymm1;
1135        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1136        vextracti128 $1, %ymm0, %xmm13;
1137        vpshufb %ymm14, %ymm0, %ymm0;
1138        inc_le128(%xmm13, %xmm15, %xmm14);
1139        vmovdqu %xmm13, (%rcx);
1140
1141        /* inpack32_pre: */
1142        vpbroadcastq (key_table)(CTX), %ymm15;
1143        vpshufb .Lpack_bswap, %ymm15, %ymm15;
1144        vpxor %ymm0, %ymm15, %ymm0;
1145        vpxor %ymm1, %ymm15, %ymm1;
1146        vpxor %ymm2, %ymm15, %ymm2;
1147        vpxor %ymm3, %ymm15, %ymm3;
1148        vpxor %ymm4, %ymm15, %ymm4;
1149        vpxor %ymm5, %ymm15, %ymm5;
1150        vpxor %ymm6, %ymm15, %ymm6;
1151        vpxor %ymm7, %ymm15, %ymm7;
1152        vpxor %ymm8, %ymm15, %ymm8;
1153        vpxor %ymm9, %ymm15, %ymm9;
1154        vpxor %ymm10, %ymm15, %ymm10;
1155        vpxor 11 * 32(%rax), %ymm15, %ymm11;
1156        vpxor 12 * 32(%rax), %ymm15, %ymm12;
1157        vpxor 13 * 32(%rax), %ymm15, %ymm13;
1158        vpxor 14 * 32(%rax), %ymm15, %ymm14;
1159        vpxor 15 * 32(%rax), %ymm15, %ymm15;
1160
1161        call __camellia_enc_blk32;
1162
1163        movq %r10, %rsp;
1164
1165        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1166        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1167        vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1168        vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1169        vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1170        vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1171        vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1172        vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1173        vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1174        vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1175        vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1176        vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1177        vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1178        vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1179        vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1180        vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1181        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1182                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1183                     %ymm8, %rsi);
1184
1185        vzeroupper;
1186
1187        ret;
1188ENDPROC(camellia_ctr_32way)
1189
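/*
 * CTR mode above keeps the counter little-endian so inc_le128 /
 * add2_le128 can increment it cheaply, and only byte-swaps each counter
 * value into its big-endian block form via .Lbswap128_mask right before
 * encryption.  Sketch: C[i] = P[i] ^ E(bswap(ctr + i)); the incremented
 * counter is stored back to (%rcx) for the next call.
 */
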
1190#define gf128mul_x_ble(iv, mask, tmp) \
1191        vpsrad $31, iv, tmp; \
1192        vpaddq iv, iv, iv; \
1193        vpshufd $0x13, tmp, tmp; \
1194        vpand mask, tmp, tmp; \
1195        vpxor tmp, iv, iv;
1196
1197#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
1198        vpsrad $31, iv, tmp0; \
1199        vpaddq iv, iv, tmp1; \
1200        vpsllq $2, iv, iv; \
1201        vpshufd $0x13, tmp0, tmp0; \
1202        vpsrad $31, tmp1, tmp1; \
1203        vpand mask2, tmp0, tmp0; \
1204        vpshufd $0x13, tmp1, tmp1; \
1205        vpxor tmp0, iv, iv; \
1206        vpand mask1, tmp1, tmp1; \
1207        vpxor tmp1, iv, iv;
1208
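/*
 * gf128mul_x_ble() multiplies a 128-bit XTS tweak by α (doubling in
 * GF(2¹²⁸)) in little-endian block representation.  Scalar sketch with
 * the tweak held as two 64-bit halves lo/hi:
 *
 *   carry = hi >> 63;
 *   hi = (hi << 1) | (lo >> 63);
 *   lo = (lo << 1) ^ (carry ? 0x87 : 0);
 *
 * The vpsrad $31 / vpshufd $0x13 / vpand mask sequence builds the same
 * conditional reduction branchlessly; gf128mul_x2_ble() applies the
 * doubling twice in one go (with correspondingly shifted masks) so a
 * 256-bit register of tweak pairs can advance by α² per step.
 */
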
1209.align 8
1210camellia_xts_crypt_32way:
1211        /* input:
1212         *      %rdi: ctx, CTX
1213         *      %rsi: dst (32 blocks)
1214         *      %rdx: src (32 blocks)
1215         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1216         *      %r8: index for input whitening key
 1217         *      %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
1218         */
1219
1220        vzeroupper;
1221
1222        subq $(16 * 32), %rsp;
1223        movq %rsp, %rax;
1224
1225        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
1226
1227        /* load IV and construct second IV */
1228        vmovdqu (%rcx), %xmm0;
1229        vmovdqa %xmm0, %xmm15;
1230        gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
1231        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
1232        vinserti128 $1, %xmm0, %ymm15, %ymm0;
1233        vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1234        vmovdqu %ymm15, 15 * 32(%rax);
1235        vmovdqu %ymm0, 0 * 32(%rsi);
1236
1237        /* construct IVs */
1238        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1239        vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1240        vmovdqu %ymm15, 14 * 32(%rax);
1241        vmovdqu %ymm0, 1 * 32(%rsi);
1242
1243        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1244        vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1245        vmovdqu %ymm15, 13 * 32(%rax);
1246        vmovdqu %ymm0, 2 * 32(%rsi);
1247
1248        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1249        vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1250        vmovdqu %ymm15, 12 * 32(%rax);
1251        vmovdqu %ymm0, 3 * 32(%rsi);
1252
1253        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1254        vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1255        vmovdqu %ymm0, 4 * 32(%rsi);
1256
1257        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1258        vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1259        vmovdqu %ymm0, 5 * 32(%rsi);
1260
1261        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1262        vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1263        vmovdqu %ymm0, 6 * 32(%rsi);
1264
1265        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1266        vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1267        vmovdqu %ymm0, 7 * 32(%rsi);
1268
1269        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1270        vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1271        vmovdqu %ymm0, 8 * 32(%rsi);
1272
1273        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1274        vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1275        vmovdqu %ymm0, 9 * 32(%rsi);
1276
1277        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1278        vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1279        vmovdqu %ymm0, 10 * 32(%rsi);
1280
1281        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1282        vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1283        vmovdqu %ymm0, 11 * 32(%rsi);
1284
1285        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1286        vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1287        vmovdqu %ymm0, 12 * 32(%rsi);
1288
1289        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1290        vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1291        vmovdqu %ymm0, 13 * 32(%rsi);
1292
1293        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1294        vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1295        vmovdqu %ymm0, 14 * 32(%rsi);
1296
1297        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
1298        vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1299        vmovdqu %ymm15, 0 * 32(%rax);
1300        vmovdqu %ymm0, 15 * 32(%rsi);
1301
1302        vextracti128 $1, %ymm0, %xmm0;
1303        gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
1304        vmovdqu %xmm0, (%rcx);
1305
1306        /* inpack32_pre: */
1307        vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
1308        vpshufb .Lpack_bswap, %ymm15, %ymm15;
1309        vpxor 0 * 32(%rax), %ymm15, %ymm0;
1310        vpxor %ymm1, %ymm15, %ymm1;
1311        vpxor %ymm2, %ymm15, %ymm2;
1312        vpxor %ymm3, %ymm15, %ymm3;
1313        vpxor %ymm4, %ymm15, %ymm4;
1314        vpxor %ymm5, %ymm15, %ymm5;
1315        vpxor %ymm6, %ymm15, %ymm6;
1316        vpxor %ymm7, %ymm15, %ymm7;
1317        vpxor %ymm8, %ymm15, %ymm8;
1318        vpxor %ymm9, %ymm15, %ymm9;
1319        vpxor %ymm10, %ymm15, %ymm10;
1320        vpxor %ymm11, %ymm15, %ymm11;
1321        vpxor 12 * 32(%rax), %ymm15, %ymm12;
1322        vpxor 13 * 32(%rax), %ymm15, %ymm13;
1323        vpxor 14 * 32(%rax), %ymm15, %ymm14;
1324        vpxor 15 * 32(%rax), %ymm15, %ymm15;
1325
1326        call *%r9;
1327
1328        addq $(16 * 32), %rsp;
1329
1330        vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1331        vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1332        vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1333        vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1334        vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1335        vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1336        vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1337        vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1338        vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1339        vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1340        vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1341        vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1342        vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1343        vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1344        vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1345        vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1346        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
1347                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
1348                     %ymm8, %rsi);
1349
1350        vzeroupper;
1351
1352        ret;
1353ENDPROC(camellia_xts_crypt_32way)
1354
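/*
 * XTS/XEX flow of camellia_xts_crypt_32way above (sketch): with T[0]
 * taken from the iv in %rcx and T[i+1] = T[i] · α,
 *
 *   C[i] = E(P[i] ^ T[i]) ^ T[i]
 *
 * The tweaks are parked in dst (%rsi) while the masked blocks are
 * encrypted (or decrypted, via the function pointer in %r9), and the
 * final run of vpxor against (%rsi) applies the second tweak XOR before
 * write_output() overwrites dst.
 */
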
1355ENTRY(camellia_xts_enc_32way)
1356        /* input:
1357         *      %rdi: ctx, CTX
1358         *      %rsi: dst (32 blocks)
1359         *      %rdx: src (32 blocks)
1360         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1361         */
1362
1363        xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1364
1365        leaq __camellia_enc_blk32, %r9;
1366
1367        jmp camellia_xts_crypt_32way;
1368ENDPROC(camellia_xts_enc_32way)
1369
1370ENTRY(camellia_xts_dec_32way)
1371        /* input:
1372         *      %rdi: ctx, CTX
1373         *      %rsi: dst (32 blocks)
1374         *      %rdx: src (32 blocks)
1375         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1376         */
1377
1378        cmpl $16, key_length(CTX);
1379        movl $32, %r8d;
1380        movl $24, %eax;
1381        cmovel %eax, %r8d;  /* input whitening key, last for dec */
1382
1383        leaq __camellia_dec_blk32, %r9;
1384
1385        jmp camellia_xts_crypt_32way;
1386ENDPROC(camellia_xts_dec_32way)
1387