linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S
<<
>>
Prefs
   1/*
   2 * x86_64/AVX/AES-NI assembler implementation of Camellia
   3 *
   4 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 */
  12
  13/*
  14 * Version licensed under 2-clause BSD License is available at:
  15 *      http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
  16 */
  17
  18#include <linux/linkage.h>
  19#include <asm/frame.h>
  20#include <asm/nospec-branch.h>
  21
  22#define CAMELLIA_TABLE_BYTE_LEN 272
  23
  24/* struct camellia_ctx: */
    /*
     * Byte offsets into the context addressed through CTX: the subkey table
     * starts at offset 0 and is read in 8-byte entries; key_length sits right
     * behind it at offset CAMELLIA_TABLE_BYTE_LEN and is compared as a 32-bit
     * value against 16.
     */
  25#define key_table 0
  26#define key_length CAMELLIA_TABLE_BYTE_LEN
  27
  28/* register macros */
    /* CTX: context pointer; the routines below document %rdi as "ctx, CTX" */
  29#define CTX %rdi
  30
  31/**********************************************************************
  32  16-way camellia
  33 **********************************************************************/
    /*
     * filter_8bit: byte-wise lookup through two 16-entry nibble tables.
     *
     * Every byte of x is split into its low and high nibble; the low nibble
     * indexes lo_t, the high nibble indexes hi_t (both via vpshufb), and the
     * two results are XORed together into x.
     *
     *   x:        vector to transform, overwritten with the result
     *   lo_t:     16-byte table indexed by the low nibble
     *   hi_t:     16-byte table indexed by the high nibble
     *   mask4bit: nibble mask; the callers in this file pass a register
     *             filled from .L0f0f0f0f
     *   tmp0:     scratch register, clobbered
     *
     * vpsrld shifts whole dwords, which is safe here because vpandn has
     * already cleared the low nibble of every byte, so only zero bits cross
     * byte boundaries.
     */
  34#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
  35        vpand x, mask4bit, tmp0; \
  36        vpandn x, mask4bit, x; \
  37        vpsrld $4, x, x; \
  38        \
  39        vpshufb tmp0, lo_t, tmp0; \
  40        vpshufb x, hi_t, x; \
  41        vpxor tmp0, x, x;
  42
  43/*
     * roundsm16: one round (S-function, P-function, subkey and CD mixing) over
     * 16 byte-sliced blocks.  Constant tables are addressed RIP-relative so the
     * expansion carries no absolute relocations.
     *
  44 * IN:
  45 *   x0..x7: byte-sliced AB state
  46 *   mem_cd: register pointer storing CD state
  47 *   key: memory operand of the 64-bit round subkey (loaded with vmovq)
     *   t0..t7: scratch registers, clobbered
  48 * OUT:
  49 *   x0..x7: new byte-sliced CD state
  50 */
  51#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
  52                  t7, mem_cd, key) \
  53        /* \
  54         * S-function with AES subbytes \
  55         */ \
  56        vmovdqa .Linv_shift_row(%rip), t4; \
  57        vbroadcastss .L0f0f0f0f(%rip), t7; \
  58        vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
  59        vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
  60        \
  61        /* AES inverse shift rows */ \
  62        vpshufb t4, x0, x0; \
  63        vpshufb t4, x7, x7; \
  64        vpshufb t4, x1, x1; \
  65        vpshufb t4, x4, x4; \
  66        vpshufb t4, x2, x2; \
  67        vpshufb t4, x5, x5; \
  68        vpshufb t4, x3, x3; \
  69        vpshufb t4, x6, x6; \
  70        \
  71        /* prefilter sboxes 1, 2 and 3 */ \
  72        vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
  73        vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
  74        filter_8bit(x0, t0, t1, t7, t6); \
  75        filter_8bit(x7, t0, t1, t7, t6); \
  76        filter_8bit(x1, t0, t1, t7, t6); \
  77        filter_8bit(x4, t0, t1, t7, t6); \
  78        filter_8bit(x2, t0, t1, t7, t6); \
  79        filter_8bit(x5, t0, t1, t7, t6); \
  80        \
  81        /* prefilter sbox 4 */ \
            /* t4 is zeroed here; it is the second source of vaesenclast below */ \
  82        vpxor t4, t4, t4; \
  83        filter_8bit(x3, t2, t3, t7, t6); \
  84        filter_8bit(x6, t2, t3, t7, t6); \
  85        \
  86        /* AES subbytes + AES shift rows */ \
  87        vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
  88        vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
  89        vaesenclast t4, x0, x0; \
  90        vaesenclast t4, x7, x7; \
  91        vaesenclast t4, x1, x1; \
  92        vaesenclast t4, x4, x4; \
  93        vaesenclast t4, x2, x2; \
  94        vaesenclast t4, x5, x5; \
  95        vaesenclast t4, x3, x3; \
  96        vaesenclast t4, x6, x6; \
  97        \
  98        /* postfilter sboxes 1 and 4 */ \
  99        vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
 100        vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
 101        filter_8bit(x0, t0, t1, t7, t6); \
 102        filter_8bit(x7, t0, t1, t7, t6); \
 103        filter_8bit(x3, t0, t1, t7, t6); \
 104        filter_8bit(x6, t0, t1, t7, t6); \
 105        \
 106        /* postfilter sbox 3 */ \
 107        vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
 108        vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
 109        filter_8bit(x2, t2, t3, t7, t6); \
 110        filter_8bit(x5, t2, t3, t7, t6); \
 111        \
            /* t6 = 0: as a vpshufb mask it replicates byte 0 of the source */ \
 112        vpxor t6, t6, t6; \
 113        vmovq key, t0; \
 114        \
 115        /* postfilter sbox 2 */ \
 116        filter_8bit(x1, t4, t5, t7, t2); \
 117        filter_8bit(x4, t4, t5, t7, t2); \
 118        \
            /* \
             * Replicate subkey bytes 0..4 into t0..t4 and byte 7 into t7; \
             * t5 keeps the subkey shifted right by five bytes for later. \
             */ \
 119        vpsrldq $5, t0, t5; \
 120        vpsrldq $1, t0, t1; \
 121        vpsrldq $2, t0, t2; \
 122        vpsrldq $3, t0, t3; \
 123        vpsrldq $4, t0, t4; \
 124        vpshufb t6, t0, t0; \
 125        vpshufb t6, t1, t1; \
 126        vpshufb t6, t2, t2; \
 127        vpshufb t6, t3, t3; \
 128        vpshufb t6, t4, t4; \
 129        vpsrldq $2, t5, t7; \
 130        vpshufb t6, t7, t7; \
 131        \
 132        /* \
 133         * P-function \
 134         */ \
 135        vpxor x5, x0, x0; \
 136        vpxor x6, x1, x1; \
 137        vpxor x7, x2, x2; \
 138        vpxor x4, x3, x3; \
 139        \
 140        vpxor x2, x4, x4; \
 141        vpxor x3, x5, x5; \
 142        vpxor x0, x6, x6; \
 143        vpxor x1, x7, x7; \
 144        \
 145        vpxor x7, x0, x0; \
 146        vpxor x4, x1, x1; \
 147        vpxor x5, x2, x2; \
 148        vpxor x6, x3, x3; \
 149        \
 150        vpxor x3, x4, x4; \
 151        vpxor x0, x5, x5; \
 152        vpxor x1, x6, x6; \
 153        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
 154        \
 155        /* \
 156         * Add key material and result to CD (x becomes new CD) \
 157         */ \
 158        \
 159        vpxor t3, x4, x4; \
 160        vpxor 0 * 16(mem_cd), x4, x4; \
 161        \
 162        vpxor t2, x5, x5; \
 163        vpxor 1 * 16(mem_cd), x5, x5; \
 164        \
            /* t3 is free again: derive subkey byte 6 (into t6) and byte 5 (t5) */ \
 165        vpsrldq $1, t5, t3; \
 166        vpshufb t6, t5, t5; \
 167        vpshufb t6, t3, t6; \
 168        \
 169        vpxor t1, x6, x6; \
 170        vpxor 2 * 16(mem_cd), x6, x6; \
 171        \
 172        vpxor t0, x7, x7; \
 173        vpxor 3 * 16(mem_cd), x7, x7; \
 174        \
 175        vpxor t7, x0, x0; \
 176        vpxor 4 * 16(mem_cd), x0, x0; \
 177        \
 178        vpxor t6, x1, x1; \
 179        vpxor 5 * 16(mem_cd), x1, x1; \
 180        \
 181        vpxor t5, x2, x2; \
 182        vpxor 6 * 16(mem_cd), x2, x2; \
 183        \
 184        vpxor t4, x3, x3; \
 185        vpxor 7 * 16(mem_cd), x3, x3;
 186
 187/*
 188 * Size optimization... with inlined roundsm16, binary would be over 5 times
 189 * larger and would only be 0.5% faster (on sandy-bridge).
     *
     * Out-of-line roundsm16 instance with a fixed register assignment:
     *   %xmm0..%xmm7:  x0..x7 (state in, new state out)
     *   %xmm8..%xmm15: t0..t7, clobbered
     *   %rcx:          mem_cd, only read here
     *   %r9:           address of the 64-bit round subkey
 190 */
 191.align 8
 192roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
 193        roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 194                  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
 195                  %rcx, (%r9));
 196        ret;
 197ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
 198
 199.align 8
    /*
     * Out-of-line roundsm16 instance with the register halves exchanged:
     *   %xmm4..%xmm7, %xmm0..%xmm3:    x0..x7 (state in, new state out)
     *   %xmm12..%xmm15, %xmm8..%xmm11: t0..t7, clobbered
     *   %rax:                          passed as mem_cd, only read here
     *   %r9:                           address of the 64-bit round subkey
     */
 200roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
 201        roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
 202                  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
 203                  %rax, (%r9));
 204        ret;
 205ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 206
 207/*
     * two_roundsm16: two consecutive rounds through the out-of-line roundsm16
     * helpers.
     *
 208 * IN/OUT:
 209 *  x0..x7: byte-sliced AB state preloaded
 210 *  mem_ab: byte-sliced AB state in memory
 211 *  mem_cd: byte-sliced CD state in memory
     *  i: key_table index (8-byte entries) of the first round's subkey
     *  dir: index step to the second round's subkey (1 or -1 at the call
     *       sites in this file)
     *  store_ab: macro invoked last with (x0..x7, mem_ab)
     *
     * %r9 is overwritten with the subkey address before each call, and the
     * new CD state is written to mem_cd between the two rounds.  The helpers
     * are hard-wired to %xmm0..%xmm15, %rcx and %rax, so the register
     * arguments have to name exactly those registers; y0..y7 are not
     * referenced by the expansion itself.
 212 */
 213#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 214                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
 215        leaq (key_table + (i) * 8)(CTX), %r9; \
 216        call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
 217        \
 218        vmovdqu x4, 0 * 16(mem_cd); \
 219        vmovdqu x5, 1 * 16(mem_cd); \
 220        vmovdqu x6, 2 * 16(mem_cd); \
 221        vmovdqu x7, 3 * 16(mem_cd); \
 222        vmovdqu x0, 4 * 16(mem_cd); \
 223        vmovdqu x1, 5 * 16(mem_cd); \
 224        vmovdqu x2, 6 * 16(mem_cd); \
 225        vmovdqu x3, 7 * 16(mem_cd); \
 226        \
 227        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
 228        call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
 229        \
 230        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
 231
 232#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
 233
    /*
     * The two macros here are the store_ab argument of two_roundsm16:
     * dummy_store expands to nothing, so the AB state stays in registers only;
     * store_ab_state additionally writes x0..x7 to the eight consecutive
     * 16-byte slots at mem_ab.
     */
 234#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
 235        /* Store new AB state */ \
 236        vmovdqu x0, 0 * 16(mem_ab); \
 237        vmovdqu x1, 1 * 16(mem_ab); \
 238        vmovdqu x2, 2 * 16(mem_ab); \
 239        vmovdqu x3, 3 * 16(mem_ab); \
 240        vmovdqu x4, 4 * 16(mem_ab); \
 241        vmovdqu x5, 5 * 16(mem_ab); \
 242        vmovdqu x6, 6 * 16(mem_ab); \
 243        vmovdqu x7, 7 * 16(mem_ab);
 244
 245#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 246                      y6, y7, mem_ab, mem_cd, i) \
            /* \
             * Six rounds using key_table entries (i) + 2 .. (i) + 7 in \
             * ascending order.  The first two round pairs write the AB state \
             * back to mem_ab; the last pair leaves it in x0..x7 only. \
             */ \
 247        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 248                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
 249        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 250                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
 251        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 252                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
 253
 254#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 255                      y6, y7, mem_ab, mem_cd, i) \
            /* \
             * Six rounds using key_table entries (i) + 7 .. (i) + 2 in \
             * descending order (dir = -1).  The first two round pairs write \
             * the AB state back to mem_ab; the last pair leaves it in \
             * x0..x7 only. \
             */ \
 256        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 257                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
 258        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 259                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
 260        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 261                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
 262
 263/*
     * rol32_1_16: rotate byte-sliced 32-bit words left by one bit.
     *
 264 * IN:
 265 *  v0..3: byte-sliced 32-bit integers
     *         (v0 is the least significant byte plane: its carry-out feeds v1,
     *          and the carry-out of v3 wraps around into v0)
     *  zero:  register holding all-zero bytes
     *  t0..t2: scratch registers, clobbered
 266 * OUT:
 267 *  v0..3: (IN <<< 1)
 268 */
 269#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
            /* \
             * Per plane: vpcmpgtb against zero yields 0xff where the top bit \
             * is set, vpabsb turns that into 1 (the carry), and vpaddb \
             * doubles the plane (shift left by one). \
             */ \
 270        vpcmpgtb v0, zero, t0; \
 271        vpaddb v0, v0, v0; \
 272        vpabsb t0, t0; \
 273        \
 274        vpcmpgtb v1, zero, t1; \
 275        vpaddb v1, v1, v1; \
 276        vpabsb t1, t1; \
 277        \
 278        vpcmpgtb v2, zero, t2; \
 279        vpaddb v2, v2, v2; \
 280        vpabsb t2, t2; \
 281        \
 282        vpor t0, v1, v1; \
 283        \
 284        vpcmpgtb v3, zero, t0; \
 285        vpaddb v3, v3, v3; \
 286        vpabsb t0, t0; \
 287        \
 288        vpor t1, v2, v2; \
 289        vpor t2, v3, v3; \
 290        vpor t0, v0, v0;
 291
 292/*
     * fls16: key-dependent mixing of the two state halves "l" and "r" with the
     * four 32-bit subkey words kll, klr, krl and krr.  In the pseudo-code
     * comments below, ll / lr are l0..l3 / l4..l7 and rl / rr are vectors
     * 0..3 / 4..7 of r.
     *
 293 * IN:
 294 *   r: byte-sliced right half in memory (call sites here pass the CD buffer)
 295 *   l: byte-sliced left half in memory (call sites here pass the AB buffer)
     *   l0..l7: left half preloaded in registers
     *   kll, klr, krl, krr: 32-bit memory operands (loaded with vmovd)
     *   t0..t3, tt0..tt3: scratch registers, clobbered
 296 * OUT:
 297 *   l0..l7: updated left half, also stored to l
     *   r: updated right half (in memory only)
 298 */
 299#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
 300              tt1, tt2, tt3, kll, klr, krl, krr) \
 301        /* \
 302         * t0 = kll; \
 303         * t0 &= ll; \
 304         * lr ^= rol32(t0, 1); \
 305         */ \
            /* tt0 = 0: byte-0 broadcast mask for vpshufb and zero for rol32_1_16 */ \
 306        vpxor tt0, tt0, tt0; \
 307        vmovd kll, t0; \
 308        vpshufb tt0, t0, t3; \
 309        vpsrldq $1, t0, t0; \
 310        vpshufb tt0, t0, t2; \
 311        vpsrldq $1, t0, t0; \
 312        vpshufb tt0, t0, t1; \
 313        vpsrldq $1, t0, t0; \
 314        vpshufb tt0, t0, t0; \
 315        \
 316        vpand l0, t0, t0; \
 317        vpand l1, t1, t1; \
 318        vpand l2, t2, t2; \
 319        vpand l3, t3, t3; \
 320        \
 321        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 322        \
 323        vpxor l4, t0, l4; \
 324        vmovdqu l4, 4 * 16(l); \
 325        vpxor l5, t1, l5; \
 326        vmovdqu l5, 5 * 16(l); \
 327        vpxor l6, t2, l6; \
 328        vmovdqu l6, 6 * 16(l); \
 329        vpxor l7, t3, l7; \
 330        vmovdqu l7, 7 * 16(l); \
 331        \
 332        /* \
 333         * t2 = krr; \
 334         * t2 |= rr; \
 335         * rl ^= t2; \
 336         */ \
 337        \
 338        vmovd krr, t0; \
 339        vpshufb tt0, t0, t3; \
 340        vpsrldq $1, t0, t0; \
 341        vpshufb tt0, t0, t2; \
 342        vpsrldq $1, t0, t0; \
 343        vpshufb tt0, t0, t1; \
 344        vpsrldq $1, t0, t0; \
 345        vpshufb tt0, t0, t0; \
 346        \
 347        vpor 4 * 16(r), t0, t0; \
 348        vpor 5 * 16(r), t1, t1; \
 349        vpor 6 * 16(r), t2, t2; \
 350        vpor 7 * 16(r), t3, t3; \
 351        \
 352        vpxor 0 * 16(r), t0, t0; \
 353        vpxor 1 * 16(r), t1, t1; \
 354        vpxor 2 * 16(r), t2, t2; \
 355        vpxor 3 * 16(r), t3, t3; \
 356        vmovdqu t0, 0 * 16(r); \
 357        vmovdqu t1, 1 * 16(r); \
 358        vmovdqu t2, 2 * 16(r); \
 359        vmovdqu t3, 3 * 16(r); \
 360        \
 361        /* \
 362         * t2 = krl; \
 363         * t2 &= rl; \
 364         * rr ^= rol32(t2, 1); \
 365         */ \
 366        vmovd krl, t0; \
 367        vpshufb tt0, t0, t3; \
 368        vpsrldq $1, t0, t0; \
 369        vpshufb tt0, t0, t2; \
 370        vpsrldq $1, t0, t0; \
 371        vpshufb tt0, t0, t1; \
 372        vpsrldq $1, t0, t0; \
 373        vpshufb tt0, t0, t0; \
 374        \
 375        vpand 0 * 16(r), t0, t0; \
 376        vpand 1 * 16(r), t1, t1; \
 377        vpand 2 * 16(r), t2, t2; \
 378        vpand 3 * 16(r), t3, t3; \
 379        \
 380        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 381        \
 382        vpxor 4 * 16(r), t0, t0; \
 383        vpxor 5 * 16(r), t1, t1; \
 384        vpxor 6 * 16(r), t2, t2; \
 385        vpxor 7 * 16(r), t3, t3; \
 386        vmovdqu t0, 4 * 16(r); \
 387        vmovdqu t1, 5 * 16(r); \
 388        vmovdqu t2, 6 * 16(r); \
 389        vmovdqu t3, 7 * 16(r); \
 390        \
 391        /* \
 392         * t0 = klr; \
 393         * t0 |= lr; \
 394         * ll ^= t0; \
 395         */ \
 396        \
 397        vmovd klr, t0; \
 398        vpshufb tt0, t0, t3; \
 399        vpsrldq $1, t0, t0; \
 400        vpshufb tt0, t0, t2; \
 401        vpsrldq $1, t0, t0; \
 402        vpshufb tt0, t0, t1; \
 403        vpsrldq $1, t0, t0; \
 404        vpshufb tt0, t0, t0; \
 405        \
 406        vpor l4, t0, t0; \
 407        vpor l5, t1, t1; \
 408        vpor l6, t2, t2; \
 409        vpor l7, t3, t3; \
 410        \
 411        vpxor l0, t0, l0; \
 412        vmovdqu l0, 0 * 16(l); \
 413        vpxor l1, t1, l1; \
 414        vmovdqu l1, 1 * 16(l); \
 415        vpxor l2, t2, l2; \
 416        vmovdqu l2, 2 * 16(l); \
 417        vpxor l3, t3, l3; \
 418        vmovdqu l3, 3 * 16(l);
 419
 420#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
            /* \
             * Transposes the 4x4 matrix of 32-bit elements whose rows are \
             * x0..x3: dword interleave first, then qword interleave. \
             * t1 and t2 are scratch registers and are clobbered. \
             */ \
 421        vpunpckhdq x1, x0, t2; \
 422        vpunpckldq x1, x0, x0; \
 423        \
 424        vpunpckldq x3, x2, t1; \
 425        vpunpckhdq x3, x2, x2; \
 426        \
 427        vpunpckhqdq t1, x0, x1; \
 428        vpunpcklqdq t1, x0, x0; \
 429        \
 430        vpunpckhqdq x2, t2, x3; \
 431        vpunpcklqdq x2, t2, x2;
 432
 433#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
 434                         b3, c3, d3, st0, st1) \
            /* \
             * Rearranges sixteen 16-byte vectors using 4x4 dword transposes \
             * and the .Lshufb_16x16b byte shuffle (loaded RIP-relative). \
             * All sixteen register arguments are live, so st0 and st1 \
             * (16-byte memory slots) hold two spilled vectors while their \
             * registers serve as transpose temporaries or as the shuffle mask. \
             */ \
 435        vmovdqu d2, st0; \
 436        vmovdqu d3, st1; \
 437        transpose_4x4(a0, a1, a2, a3, d2, d3); \
 438        transpose_4x4(b0, b1, b2, b3, d2, d3); \
 439        vmovdqu st0, d2; \
 440        vmovdqu st1, d3; \
 441        \
 442        vmovdqu a0, st0; \
 443        vmovdqu a1, st1; \
 444        transpose_4x4(c0, c1, c2, c3, a0, a1); \
 445        transpose_4x4(d0, d1, d2, d3, a0, a1); \
 446        \
            /* a0 holds the shuffle mask from here on; the real a0 sits in st0 */ \
 447        vmovdqu .Lshufb_16x16b(%rip), a0; \
 448        vmovdqu st1, a1; \
 449        vpshufb a0, a2, a2; \
 450        vpshufb a0, a3, a3; \
 451        vpshufb a0, b0, b0; \
 452        vpshufb a0, b1, b1; \
 453        vpshufb a0, b2, b2; \
 454        vpshufb a0, b3, b3; \
 455        vpshufb a0, a1, a1; \
 456        vpshufb a0, c0, c0; \
 457        vpshufb a0, c1, c1; \
 458        vpshufb a0, c2, c2; \
 459        vpshufb a0, c3, c3; \
 460        vpshufb a0, d0, d0; \
 461        vpshufb a0, d1, d1; \
 462        vpshufb a0, d2, d2; \
 463        vpshufb a0, d3, d3; \
            /* park d3, fetch the saved a0 through d3 and shuffle it into place */ \
 464        vmovdqu d3, st1; \
 465        vmovdqu st0, d3; \
 466        vpshufb a0, d3, a0; \
 467        vmovdqu d2, st0; \
 468        \
 469        transpose_4x4(a0, b0, c0, d0, d2, d3); \
 470        transpose_4x4(a1, b1, c1, d1, d2, d3); \
 471        vmovdqu st0, d2; \
 472        vmovdqu st1, d3; \
 473        \
 474        vmovdqu b0, st0; \
 475        vmovdqu b1, st1; \
 476        transpose_4x4(a2, b2, c2, d2, b0, b1); \
 477        transpose_4x4(a3, b3, c3, d3, b0, b1); \
 478        vmovdqu st0, b0; \
 479        vmovdqu st1, b1; \
 480        /* does not adjust output bytes inside vectors */
 481
 482/* load blocks to registers and apply pre-whitening */
    /*
     *   rio: pointer to 16 consecutive 16-byte blocks
     *   key: memory operand of the 64-bit whitening subkey
     *
     * The subkey is passed through the .Lpack_bswap shuffle (addressed
     * RIP-relative) and XORed into every block.  Blocks are loaded in reverse
     * register order: block 0 lands in y7, block 7 in y0, block 8 in x7 and
     * block 15 in x0.  x0 carries the shuffled key until the final XOR.
     */
 483#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 484                     y6, y7, rio, key) \
 485        vmovq key, x0; \
 486        vpshufb .Lpack_bswap(%rip), x0, x0; \
 487        \
 488        vpxor 0 * 16(rio), x0, y7; \
 489        vpxor 1 * 16(rio), x0, y6; \
 490        vpxor 2 * 16(rio), x0, y5; \
 491        vpxor 3 * 16(rio), x0, y4; \
 492        vpxor 4 * 16(rio), x0, y3; \
 493        vpxor 5 * 16(rio), x0, y2; \
 494        vpxor 6 * 16(rio), x0, y1; \
 495        vpxor 7 * 16(rio), x0, y0; \
 496        vpxor 8 * 16(rio), x0, x7; \
 497        vpxor 9 * 16(rio), x0, x6; \
 498        vpxor 10 * 16(rio), x0, x5; \
 499        vpxor 11 * 16(rio), x0, x4; \
 500        vpxor 12 * 16(rio), x0, x3; \
 501        vpxor 13 * 16(rio), x0, x2; \
 502        vpxor 14 * 16(rio), x0, x1; \
 503        vpxor 15 * 16(rio), x0, x0;
 504
 505/* byteslice pre-whitened blocks and store to temporary memory */
    /*
     *   mem_ab, mem_cd: registers pointing at two 8 * 16-byte buffers
     *
     * The first 16 bytes of each buffer double as the st0 / st1 spill slots
     * of byteslice_16x16b; afterwards x0..x7 are written to mem_ab and y0..y7
     * to mem_cd.  The registers keep their byte-sliced contents.
     */
 506#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 507                      y6, y7, mem_ab, mem_cd) \
 508        byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
 509                         y5, y6, y7, (mem_ab), (mem_cd)); \
 510        \
 511        vmovdqu x0, 0 * 16(mem_ab); \
 512        vmovdqu x1, 1 * 16(mem_ab); \
 513        vmovdqu x2, 2 * 16(mem_ab); \
 514        vmovdqu x3, 3 * 16(mem_ab); \
 515        vmovdqu x4, 4 * 16(mem_ab); \
 516        vmovdqu x5, 5 * 16(mem_ab); \
 517        vmovdqu x6, 6 * 16(mem_ab); \
 518        vmovdqu x7, 7 * 16(mem_ab); \
 519        vmovdqu y0, 0 * 16(mem_cd); \
 520        vmovdqu y1, 1 * 16(mem_cd); \
 521        vmovdqu y2, 2 * 16(mem_cd); \
 522        vmovdqu y3, 3 * 16(mem_cd); \
 523        vmovdqu y4, 4 * 16(mem_cd); \
 524        vmovdqu y5, 5 * 16(mem_cd); \
 525        vmovdqu y6, 6 * 16(mem_cd); \
 526        vmovdqu y7, 7 * 16(mem_cd);
 527
 528/* de-byteslice, apply post-whitening and store blocks */
    /*
     *   key: memory operand of the 64-bit whitening subkey
     *   stack_tmp0, stack_tmp1: two 16-byte memory scratch slots
     *
     * After the byteslice step x0 is parked in stack_tmp0 so that the register
     * can carry the subkey (shuffled through .Lpack_bswap, addressed
     * RIP-relative); the last vpxor folds the parked x0 back in.  Results are
     * left in x0..x7 / y0..y7; writing them out is up to the caller.
     */
 529#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
 530                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
 531        byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
 532                         y7, x3, x7, stack_tmp0, stack_tmp1); \
 533        \
 534        vmovdqu x0, stack_tmp0; \
 535        \
 536        vmovq key, x0; \
 537        vpshufb .Lpack_bswap(%rip), x0, x0; \
 538        \
 539        vpxor x0, y7, y7; \
 540        vpxor x0, y6, y6; \
 541        vpxor x0, y5, y5; \
 542        vpxor x0, y4, y4; \
 543        vpxor x0, y3, y3; \
 544        vpxor x0, y2, y2; \
 545        vpxor x0, y1, y1; \
 546        vpxor x0, y0, y0; \
 547        vpxor x0, x7, x7; \
 548        vpxor x0, x6, x6; \
 549        vpxor x0, x5, x5; \
 550        vpxor x0, x4, x4; \
 551        vpxor x0, x3, x3; \
 552        vpxor x0, x2, x2; \
 553        vpxor x0, x1, x1; \
 554        vpxor stack_tmp0, x0, x0;
 555
 556#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 557                     y6, y7, rio) \
            /* \
             * Stores sixteen vectors to consecutive 16-byte slots at rio: \
             * x0..x7 go to slots 0..7, y0..y7 to slots 8..15. \
             */ \
 558        vmovdqu x0, 0 * 16(rio); \
 559        vmovdqu x1, 1 * 16(rio); \
 560        vmovdqu x2, 2 * 16(rio); \
 561        vmovdqu x3, 3 * 16(rio); \
 562        vmovdqu x4, 4 * 16(rio); \
 563        vmovdqu x5, 5 * 16(rio); \
 564        vmovdqu x6, 6 * 16(rio); \
 565        vmovdqu x7, 7 * 16(rio); \
 566        vmovdqu y0, 8 * 16(rio); \
 567        vmovdqu y1, 9 * 16(rio); \
 568        vmovdqu y2, 10 * 16(rio); \
 569        vmovdqu y3, 11 * 16(rio); \
 570        vmovdqu y4, 12 * 16(rio); \
 571        vmovdqu y5, 13 * 16(rio); \
 572        vmovdqu y6, 14 * 16(rio); \
 573        vmovdqu y7, 15 * 16(rio);
 574
 575
 576/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
 577.section        .rodata.cst16, "aM", @progbits, 16
 578.align 16
 579
    /* SHUFB_BYTES(idx): the four byte indices idx, idx + 4, idx + 8, idx + 12 */
 580#define SHUFB_BYTES(idx) \
 581        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 582
    /*
     * vpshufb mask used by byteslice_16x16b: selects bytes 0,4,8,12, then
     * 1,5,9,13, 2,6,10,14 and 3,7,11,15, i.e. a 4x4 byte transpose inside one
     * vector.
     */
 583.Lshufb_16x16b:
 584        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
 585
    /*
     * vpshufb mask applied to the 64-bit whitening subkey: reverses the byte
     * order inside each of its two 32-bit words; the 0x80 entries make the
     * upper eight result bytes zero.
     */
 586.Lpack_bswap:
 587        .long 0x00010203
 588        .long 0x04050607
 589        .long 0x80808080
 590        .long 0x80808080
 591
 592/* For CTR-mode IV byteswap */
 593.Lbswap128_mask:
 594        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 595
 596/* For XTS mode IV generation */
 597.Lxts_gf128mul_and_shl1_mask:
 598        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 599
 600/*
 601 * pre-SubByte transform
 602 *
 603 * pre-lookup for sbox1, sbox2, sbox3:
 604 *   swap_bitendianness(
 605 *       isom_map_camellia_to_aes(
 606 *           camellia_f(
 607 *               swap_bitendianness(in)
 608 *           )
 609 *       )
 610 *   )
 611 *
 612 * (note: '⊕ 0xc5' inside camellia_f())
 613 */
 614.Lpre_tf_lo_s1:
 615        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
 616        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
 617.Lpre_tf_hi_s1:
 618        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
 619        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
 620
 621/*
 622 * pre-SubByte transform
 623 *
 624 * pre-lookup for sbox4:
 625 *   swap_bitendianness(
 626 *       isom_map_camellia_to_aes(
 627 *           camellia_f(
 628 *               swap_bitendianness(in <<< 1)
 629 *           )
 630 *       )
 631 *   )
 632 *
 633 * (note: '⊕ 0xc5' inside camellia_f())
 634 */
 635.Lpre_tf_lo_s4:
 636        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
 637        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
 638.Lpre_tf_hi_s4:
 639        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
 640        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
 641
 642/*
 643 * post-SubByte transform
 644 *
 645 * post-lookup for sbox1, sbox4:
 646 *  swap_bitendianness(
 647 *      camellia_h(
 648 *          isom_map_aes_to_camellia(
 649 *              swap_bitendianness(
 650 *                  aes_inverse_affine_transform(in)
 651 *              )
 652 *          )
 653 *      )
 654 *  )
 655 *
 656 * (note: '⊕ 0x6e' inside camellia_h())
 657 */
 658.Lpost_tf_lo_s1:
 659        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
 660        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
 661.Lpost_tf_hi_s1:
 662        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
 663        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
 664
 665/*
 666 * post-SubByte transform
 667 *
 668 * post-lookup for sbox2:
 669 *  swap_bitendianness(
 670 *      camellia_h(
 671 *          isom_map_aes_to_camellia(
 672 *              swap_bitendianness(
 673 *                  aes_inverse_affine_transform(in)
 674 *              )
 675 *          )
 676 *      )
 677 *  ) <<< 1
 678 *
 679 * (note: '⊕ 0x6e' inside camellia_h())
 680 */
 681.Lpost_tf_lo_s2:
 682        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
 683        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
 684.Lpost_tf_hi_s2:
 685        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
 686        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
 687
 688/*
 689 * post-SubByte transform
 690 *
 691 * post-lookup for sbox3:
 692 *  swap_bitendianness(
 693 *      camellia_h(
 694 *          isom_map_aes_to_camellia(
 695 *              swap_bitendianness(
 696 *                  aes_inverse_affine_transform(in)
 697 *              )
 698 *          )
 699 *      )
 700 *  ) >>> 1
 701 *
 702 * (note: '⊕ 0x6e' inside camellia_h())
 703 */
 704.Lpost_tf_lo_s3:
 705        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
 706        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
 707.Lpost_tf_hi_s3:
 708        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
 709        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
 710
 711/* For isolating SubBytes from AESENCLAST, inverse shift row */
 712.Linv_shift_row:
 713        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 714        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 715
 716/* 4-bit mask */
 717.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 718.align 4
 719.L0f0f0f0f:
 720        .long 0x0f0f0f0f
 721
 722.text
 723
 724.align 8
 725__camellia_enc_blk16:
 726        /* input:
 727         *      %rdi: ctx, CTX
 728         *      %rax: temporary storage, 256 bytes
            *            (first 128 bytes: AB state, second 128 bytes: CD state)
 729         *      %xmm0..%xmm15: 16 plaintext blocks
 730         * output:
 731         *      %xmm0..%xmm15: 16 encrypted blocks, order swapped:
 732         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
            * modified:
            *      %rcx (set to %rax + 128), %r8 (24 or 32), %r9 (subkey pointer)
 733         */
 734        FRAME_BEGIN
 735
 736        leaq 8 * 16(%rax), %rcx;
 737
 738        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 739                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 740                      %xmm15, %rax, %rcx);
 741
 742        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 743                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 744                     %xmm15, %rax, %rcx, 0);
 745
 746        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 747              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 748              %xmm15,
 749              ((key_table + (8) * 8) + 0)(CTX),
 750              ((key_table + (8) * 8) + 4)(CTX),
 751              ((key_table + (8) * 8) + 8)(CTX),
 752              ((key_table + (8) * 8) + 12)(CTX));
 753
 754        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 755                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 756                     %xmm15, %rax, %rcx, 8);
 757
 758        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 759              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 760              %xmm15,
 761              ((key_table + (16) * 8) + 0)(CTX),
 762              ((key_table + (16) * 8) + 4)(CTX),
 763              ((key_table + (16) * 8) + 8)(CTX),
 764              ((key_table + (16) * 8) + 12)(CTX));
 765
 766        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 767                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 768                     %xmm15, %rax, %rcx, 16);
 769
            /*
             * %r8 is the key_table index of the output whitening subkey:
             * 24 when key_length is 16, otherwise 32 (set in .Lenc_max32
             * after one more fls16 / enc_rounds16 stage).
             */
 770        movl $24, %r8d;
 771        cmpl $16, key_length(CTX);
 772        jne .Lenc_max32;
 773
 774.Lenc_done:
 775        /* load CD for output */
 776        vmovdqu 0 * 16(%rcx), %xmm8;
 777        vmovdqu 1 * 16(%rcx), %xmm9;
 778        vmovdqu 2 * 16(%rcx), %xmm10;
 779        vmovdqu 3 * 16(%rcx), %xmm11;
 780        vmovdqu 4 * 16(%rcx), %xmm12;
 781        vmovdqu 5 * 16(%rcx), %xmm13;
 782        vmovdqu 6 * 16(%rcx), %xmm14;
 783        vmovdqu 7 * 16(%rcx), %xmm15;
 784
            /* the first 32 bytes of the temporary buffer serve as scratch slots */
 785        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 786                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 787                    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
 788
 789        FRAME_END
 790        ret;
 791
 792.align 8
 793.Lenc_max32:
 794        movl $32, %r8d;
 795
 796        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 797              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 798              %xmm15,
 799              ((key_table + (24) * 8) + 0)(CTX),
 800              ((key_table + (24) * 8) + 4)(CTX),
 801              ((key_table + (24) * 8) + 8)(CTX),
 802              ((key_table + (24) * 8) + 12)(CTX));
 803
 804        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 805                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 806                     %xmm15, %rax, %rcx, 24);
 807
 808        jmp .Lenc_done;
 809ENDPROC(__camellia_enc_blk16)
 810
 811.align 8
 812__camellia_dec_blk16:
 813        /* input:
 814         *      %rdi: ctx, CTX
 815         *      %rax: temporary storage, 256 bytes
            *            (first 128 bytes: AB state, second 128 bytes: CD state)
 816         *      %r8d: 24 for 16 byte key, 32 for larger
 817         *      %xmm0..%xmm15: 16 encrypted blocks
 818         * output:
 819         *      %xmm0..%xmm15: 16 plaintext blocks, order swapped:
 820         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
            * modified:
            *      %rcx (set to %rax + 128), %r9 (subkey pointer)
 821         */
 822        FRAME_BEGIN
 823
 824        leaq 8 * 16(%rax), %rcx;
 825
 826        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 827                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 828                      %xmm15, %rax, %rcx);
 829
            /* %r8d == 32: run one extra dec_rounds16 / fls16 stage first */
 830        cmpl $32, %r8d;
 831        je .Ldec_max32;
 832
 833.Ldec_max24:
 834        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 835                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 836                     %xmm15, %rax, %rcx, 16);
 837
            /*
             * Compared with the encryption path, the fls16 calls here pass the
             * two subkey word pairs exchanged (offsets 8, 12, 0, 4).
             */
 838        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 839              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 840              %xmm15,
 841              ((key_table + (16) * 8) + 8)(CTX),
 842              ((key_table + (16) * 8) + 12)(CTX),
 843              ((key_table + (16) * 8) + 0)(CTX),
 844              ((key_table + (16) * 8) + 4)(CTX));
 845
 846        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 847                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 848                     %xmm15, %rax, %rcx, 8);
 849
 850        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 851              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 852              %xmm15,
 853              ((key_table + (8) * 8) + 8)(CTX),
 854              ((key_table + (8) * 8) + 12)(CTX),
 855              ((key_table + (8) * 8) + 0)(CTX),
 856              ((key_table + (8) * 8) + 4)(CTX));
 857
 858        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 859                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 860                     %xmm15, %rax, %rcx, 0);
 861
 862        /* load CD for output */
 863        vmovdqu 0 * 16(%rcx), %xmm8;
 864        vmovdqu 1 * 16(%rcx), %xmm9;
 865        vmovdqu 2 * 16(%rcx), %xmm10;
 866        vmovdqu 3 * 16(%rcx), %xmm11;
 867        vmovdqu 4 * 16(%rcx), %xmm12;
 868        vmovdqu 5 * 16(%rcx), %xmm13;
 869        vmovdqu 6 * 16(%rcx), %xmm14;
 870        vmovdqu 7 * 16(%rcx), %xmm15;
 871
            /* output whitening uses key_table entry 0 */
 872        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 873                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 874                    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
 875
 876        FRAME_END
 877        ret;
 878
 879.align 8
 880.Ldec_max32:
 881        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 882                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 883                     %xmm15, %rax, %rcx, 24);
 884
 885        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 886              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 887              %xmm15,
 888              ((key_table + (24) * 8) + 8)(CTX),
 889              ((key_table + (24) * 8) + 12)(CTX),
 890              ((key_table + (24) * 8) + 0)(CTX),
 891              ((key_table + (24) * 8) + 4)(CTX));
 892
 893        jmp .Ldec_max24;
 894ENDPROC(__camellia_dec_blk16)
 895
 896ENTRY(camellia_ecb_enc_16way)
 897        /* input:
 898         *      %rdi: ctx, CTX
 899         *      %rsi: dst (16 blocks)
 900         *      %rdx: src (16 blocks)
 901         */
            /*
             * ECB-encrypts 16 blocks in one pass:
             *   1. inpack16_pre is given src (%rdx) and the key word at
             *      key_table offset 0 in the context.
             *   2. __camellia_enc_blk16 is called with %rax = dst, which it
             *      may use as scratch memory.
             *   3. write_output is given the sixteen xmm registers and dst.
             * inpack16_pre and write_output are defined outside this part of
             * the file; presumably they load+whiten the input and store the
             * result respectively -- confirm against their definitions.
             */
 902         FRAME_BEGIN
 903
 904        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 905                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 906                     %xmm15, %rdx, (key_table)(CTX));
 907
 908        /* now dst can be used as temporary buffer (even in src == dst case) */
 909        movq    %rsi, %rax;
 910
 911        call __camellia_enc_blk16;
 912
            /*
             * Registers are passed in the order xmm7..xmm0, xmm15..xmm8,
             * not xmm0..xmm15.
             */
 913        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
 914                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
 915                     %xmm8, %rsi);
 916
 917        FRAME_END
 918        ret;
 919ENDPROC(camellia_ecb_enc_16way)
 920
 921ENTRY(camellia_ecb_dec_16way)
 922        /* input:
 923         *      %rdi: ctx, CTX
 924         *      %rsi: dst (16 blocks)
 925         *      %rdx: src (16 blocks)
 926         */
            /*
             * ECB-decrypts 16 blocks in one pass. Same shape as the encrypt
             * entry point, except that the key word handed to inpack16_pre
             * is taken from the end of the key table and
             * __camellia_dec_blk16 is called.
             */
 927         FRAME_BEGIN
 928
            /*
             * %r8 = 24 when key_length == 16, otherwise 32. It is used
             * below as a qword index (scale 8) into key_table. %eax is only
             * a scratch source for the cmov and is overwritten before the
             * call.
             */
 929        cmpl $16, key_length(CTX);
 930        movl $32, %r8d;
 931        movl $24, %eax;
 932        cmovel %eax, %r8d; /* max */
 933
 934        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 935                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 936                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
 937
 938        /* now dst can be used as temporary buffer (even in src == dst case) */
 939        movq    %rsi, %rax;
 940
 941        call __camellia_dec_blk16;
 942
            /*
             * Registers are passed in the order xmm7..xmm0, xmm15..xmm8,
             * not xmm0..xmm15.
             */
 943        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
 944                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
 945                     %xmm8, %rsi);
 946
 947        FRAME_END
 948        ret;
 949ENDPROC(camellia_ecb_dec_16way)
 950
 951ENTRY(camellia_cbc_dec_16way)
 952        /* input:
 953         *      %rdi: ctx, CTX
 954         *      %rsi: dst (16 blocks)
 955         *      %rdx: src (16 blocks)
 956         */
            /*
             * CBC-decrypts 16 blocks: runs the block decryption, then XORs
             * the results for blocks 1..15 with ciphertext blocks 0..14
             * read back from src. The register passed first to write_output
             * (%xmm7) is not XORed with anything here.
             * NOTE(review): presumably the caller applies the IV to the
             * first block -- confirm against the C glue code.
             */
 957        FRAME_BEGIN
 958
            /*
             * %r8 = 24 when key_length == 16, otherwise 32. It is used
             * below as a qword index (scale 8) into key_table.
             */
 959        cmpl $16, key_length(CTX);
 960        movl $32, %r8d;
 961        movl $24, %eax;
 962        cmovel %eax, %r8d; /* max */
 963
 964        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 965                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
 966                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
 967
 968        /*
 969         * dst might still be in-use (in case dst == src), so use stack for
 970         * temporary storage.
 971         */
 972        subq $(16 * 16), %rsp;
 973        movq %rsp, %rax;
 974
 975        call __camellia_dec_blk16;
 976
            /* The 256-byte scratch area is not referenced after this point. */
 977        addq $(16 * 16), %rsp;
 978
            /*
             * CBC chaining: src is still intact here because nothing has
             * been stored to dst yet. Register order follows the
             * write_output argument order (xmm7, xmm6, ..., xmm0, xmm15,
             * ..., xmm8), so the second output register %xmm6 pairs with
             * src block 0 and the last one, %xmm8, with src block 14.
             */
 979        vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
 980        vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
 981        vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
 982        vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
 983        vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
 984        vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
 985        vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
 986        vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
 987        vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
 988        vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
 989        vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
 990        vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
 991        vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
 992        vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
 993        vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
 994        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
 995                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
 996                     %xmm8, %rsi);
 997
 998        FRAME_END
 999        ret;
1000ENDPROC(camellia_cbc_dec_16way)
1001
/*
 * inc_le128(x, minus_one, tmp): add 1 to the 128-bit value in x, treated as
 * two 64-bit lanes with the low lane least significant.
 *
 *   x:         value to increment (updated in place)
 *   minus_one: low qword all-ones, high qword zero (not modified)
 *   tmp:       scratch register (clobbered)
 *
 * Steps:
 *   vpcmpeqq - tmp.low = all-ones iff x.low == all-ones (about to wrap)
 *   vpsubq   - x.low -= -1, i.e. x.low += 1; x.high -= 0
 *   vpslldq  - move the wrap mask from the low qword into the high qword
 *   vpsubq   - x.high -= mask, i.e. x.high += 1 only when x.low wrapped
 */
1002#define inc_le128(x, minus_one, tmp) \
1003        vpcmpeqq minus_one, x, tmp; \
1004        vpsubq minus_one, x, x; \
1005        vpslldq $8, tmp, tmp; \
1006        vpsubq tmp, x, x;
1007
1008ENTRY(camellia_ctr_16way)
1009        /* input:
1010         *      %rdi: ctx, CTX
1011         *      %rsi: dst (16 blocks)
1012         *      %rdx: src (16 blocks)
1013         *      %rcx: iv (little endian, 128bit)
1014         */
            /*
             * CTR mode over 16 blocks: builds the counter values iv+0 ..
             * iv+15, encrypts them with __camellia_enc_blk16, XORs the
             * result with src and hands it to write_output with dst. The
             * value iv+16 is written back through %rcx for the next call.
             */
1015        FRAME_BEGIN
1016
            /*
             * 256 bytes of stack scratch, addressed through %rax. Slots
             * 13..15 hold the first three counter blocks until all
             * registers are free; %rax is also what __camellia_enc_blk16
             * receives in the other entry points as its temporary buffer.
             */
1017        subq $(16 * 16), %rsp;
1018        movq %rsp, %rax;
1019
1020        vmovdqa .Lbswap128_mask, %xmm14;
1021
1022        /* load IV and byteswap */
1023        vmovdqu (%rcx), %xmm0;
1024        vpshufb %xmm14, %xmm0, %xmm15;
1025        vmovdqu %xmm15, 15 * 16(%rax);
1026
            /* Build the minus_one operand that inc_le128 expects. */
1027        vpcmpeqd %xmm15, %xmm15, %xmm15;
1028        vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1029
            /*
             * %xmm0 keeps the running counter in its loaded (unswapped)
             * form; each step increments it and stores a byteswapped copy.
             * iv+1 and iv+2 go to scratch slots 14 and 13 because %xmm13
             * is needed as the inc_le128 temporary; iv+3 .. iv+14 land in
             * %xmm12 .. %xmm1.
             */
1030        /* construct IVs */
1031        inc_le128(%xmm0, %xmm15, %xmm13);
1032        vpshufb %xmm14, %xmm0, %xmm13;
1033        vmovdqu %xmm13, 14 * 16(%rax);
1034        inc_le128(%xmm0, %xmm15, %xmm13);
1035        vpshufb %xmm14, %xmm0, %xmm13;
1036        vmovdqu %xmm13, 13 * 16(%rax);
1037        inc_le128(%xmm0, %xmm15, %xmm13);
1038        vpshufb %xmm14, %xmm0, %xmm12;
1039        inc_le128(%xmm0, %xmm15, %xmm13);
1040        vpshufb %xmm14, %xmm0, %xmm11;
1041        inc_le128(%xmm0, %xmm15, %xmm13);
1042        vpshufb %xmm14, %xmm0, %xmm10;
1043        inc_le128(%xmm0, %xmm15, %xmm13);
1044        vpshufb %xmm14, %xmm0, %xmm9;
1045        inc_le128(%xmm0, %xmm15, %xmm13);
1046        vpshufb %xmm14, %xmm0, %xmm8;
1047        inc_le128(%xmm0, %xmm15, %xmm13);
1048        vpshufb %xmm14, %xmm0, %xmm7;
1049        inc_le128(%xmm0, %xmm15, %xmm13);
1050        vpshufb %xmm14, %xmm0, %xmm6;
1051        inc_le128(%xmm0, %xmm15, %xmm13);
1052        vpshufb %xmm14, %xmm0, %xmm5;
1053        inc_le128(%xmm0, %xmm15, %xmm13);
1054        vpshufb %xmm14, %xmm0, %xmm4;
1055        inc_le128(%xmm0, %xmm15, %xmm13);
1056        vpshufb %xmm14, %xmm0, %xmm3;
1057        inc_le128(%xmm0, %xmm15, %xmm13);
1058        vpshufb %xmm14, %xmm0, %xmm2;
1059        inc_le128(%xmm0, %xmm15, %xmm13);
1060        vpshufb %xmm14, %xmm0, %xmm1;
            /*
             * Last step: %xmm0 becomes the byteswapped iv+15. An unswapped
             * copy in %xmm13 is incremented once more (iv+16) and written
             * back as the next IV. That final inc_le128 uses %xmm14 as its
             * temporary, destroying the byteswap mask, which is no longer
             * needed.
             */
1061        inc_le128(%xmm0, %xmm15, %xmm13);
1062        vmovdqa %xmm0, %xmm13;
1063        vpshufb %xmm14, %xmm0, %xmm0;
1064        inc_le128(%xmm13, %xmm15, %xmm14);
1065        vmovdqu %xmm13, (%rcx);
1066
            /*
             * Open-coded whitening: load the 64-bit key word at key_table
             * offset 0, shuffle it with .Lpack_bswap and XOR it into all 16
             * counter blocks. The three blocks parked in scratch slots
             * 13..15 are loaded here. %xmm15 holds the key, so it is
             * overwritten last.
             */
1067        /* inpack16_pre: */
1068        vmovq (key_table)(CTX), %xmm15;
1069        vpshufb .Lpack_bswap, %xmm15, %xmm15;
1070        vpxor %xmm0, %xmm15, %xmm0;
1071        vpxor %xmm1, %xmm15, %xmm1;
1072        vpxor %xmm2, %xmm15, %xmm2;
1073        vpxor %xmm3, %xmm15, %xmm3;
1074        vpxor %xmm4, %xmm15, %xmm4;
1075        vpxor %xmm5, %xmm15, %xmm5;
1076        vpxor %xmm6, %xmm15, %xmm6;
1077        vpxor %xmm7, %xmm15, %xmm7;
1078        vpxor %xmm8, %xmm15, %xmm8;
1079        vpxor %xmm9, %xmm15, %xmm9;
1080        vpxor %xmm10, %xmm15, %xmm10;
1081        vpxor %xmm11, %xmm15, %xmm11;
1082        vpxor %xmm12, %xmm15, %xmm12;
1083        vpxor 13 * 16(%rax), %xmm15, %xmm13;
1084        vpxor 14 * 16(%rax), %xmm15, %xmm14;
1085        vpxor 15 * 16(%rax), %xmm15, %xmm15;
1086
1087        call __camellia_enc_blk16;
1088
            /* The scratch area is not referenced after this point. */
1089        addq $(16 * 16), %rsp;
1090
            /*
             * XOR the encrypted counters with the input. Result registers
             * pair with src blocks in the order xmm7..xmm0 (blocks 0..7),
             * then xmm15..xmm8 (blocks 8..15), matching the write_output
             * argument order below.
             */
1091        vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1092        vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1093        vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1094        vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1095        vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1096        vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1097        vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1098        vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1099        vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1100        vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1101        vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1102        vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1103        vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1104        vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1105        vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1106        vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1107        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1108                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1109                     %xmm8, %rsi);
1110
1111        FRAME_END
1112        ret;
1113ENDPROC(camellia_ctr_16way)
1114
/*
 * gf128mul_x_ble(iv, mask, tmp): shift the 128-bit value in iv left by one
 * bit, with the bit shifted out of the top folded back in through mask.
 *
 *   iv:   value to update in place
 *   mask: constant ANDed with the carry masks (not modified)
 *   tmp:  scratch register (clobbered)
 *
 * Steps:
 *   vpsrad $31   - each dword of tmp becomes all-ones iff that dword's top
 *                  bit in iv is set
 *   vpaddq       - doubles each 64-bit lane of iv (per-lane shift left by 1)
 *   vpshufd 0x13 - tmp dword0 <- sign of dword3 (bit 127 of iv),
 *                  tmp dword2 <- sign of dword1 (bit 63 of iv);
 *                  dwords 1 and 3 receive the sign of dword0 and rely on
 *                  mask to clear them
 *   vpand/vpxor  - keep the masked bits and fold them into iv
 *
 * NOTE(review): the mask constant is defined outside this part of the file;
 * it presumably holds the GF(2^128) reduction constant in the low qword and
 * 1 in the high qword (carry from bit 63 into bit 64) -- confirm.
 */
1115#define gf128mul_x_ble(iv, mask, tmp) \
1116        vpsrad $31, iv, tmp; \
1117        vpaddq iv, iv, iv; \
1118        vpshufd $0x13, tmp, tmp; \
1119        vpand mask, tmp, tmp; \
1120        vpxor tmp, iv, iv;
1121
1122.align 8
1123camellia_xts_crypt_16way:
1124        /* input:
1125         *      %rdi: ctx, CTX
1126         *      %rsi: dst (16 blocks)
1127         *      %rdx: src (16 blocks)
1128         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1129         *      %r8: index for input whitening key
1130         *      %r9: pointer to  __camellia_enc_blk16 or __camellia_dec_blk16
1131         */
            /*
             * Shared XTS body, reached by jmp from the enc/dec entry points
             * below. For each of the 16 blocks it:
             *   - XORs the current tweak into the source block,
             *   - saves that tweak in the matching dst slot,
             *   - advances the tweak with gf128mul_x_ble.
             * It then runs the block routine selected by %r9, XORs the
             * saved tweaks back into the results and hands them to
             * write_output. The tweak following the 16th block is written
             * back through %rcx.
             *
             * Each src block is read before the dst slot with the same
             * index is written, so block i never sees its own slot
             * overwritten when dst == src.
             */
1132        FRAME_BEGIN
1133
            /* 256 bytes of stack scratch, addressed through %rax. */
1134        subq $(16 * 16), %rsp;
1135        movq %rsp, %rax;
1136
1137        vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
1138
            /*
             * Block 0: the tweaked input goes to scratch slot 15 (it
             * becomes %xmm15 later) and the tweak itself to dst[0].
             */
1139        /* load IV */
1140        vmovdqu (%rcx), %xmm0;
1141        vpxor 0 * 16(%rdx), %xmm0, %xmm15;
1142        vmovdqu %xmm15, 15 * 16(%rax);
1143        vmovdqu %xmm0, 0 * 16(%rsi);
1144
            /*
             * %xmm0 holds the running tweak; %xmm15 is the gf128mul_x_ble
             * temporary and %xmm14 the mask. Blocks 1 and 15 are therefore
             * parked in scratch slots 14 and 0; blocks 2..14 go straight to
             * %xmm13 .. %xmm1.
             */
1145        /* construct IVs */
1146        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1147        vpxor 1 * 16(%rdx), %xmm0, %xmm15;
1148        vmovdqu %xmm15, 14 * 16(%rax);
1149        vmovdqu %xmm0, 1 * 16(%rsi);
1150
1151        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1152        vpxor 2 * 16(%rdx), %xmm0, %xmm13;
1153        vmovdqu %xmm0, 2 * 16(%rsi);
1154
1155        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1156        vpxor 3 * 16(%rdx), %xmm0, %xmm12;
1157        vmovdqu %xmm0, 3 * 16(%rsi);
1158
1159        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1160        vpxor 4 * 16(%rdx), %xmm0, %xmm11;
1161        vmovdqu %xmm0, 4 * 16(%rsi);
1162
1163        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1164        vpxor 5 * 16(%rdx), %xmm0, %xmm10;
1165        vmovdqu %xmm0, 5 * 16(%rsi);
1166
1167        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1168        vpxor 6 * 16(%rdx), %xmm0, %xmm9;
1169        vmovdqu %xmm0, 6 * 16(%rsi);
1170
1171        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1172        vpxor 7 * 16(%rdx), %xmm0, %xmm8;
1173        vmovdqu %xmm0, 7 * 16(%rsi);
1174
1175        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1176        vpxor 8 * 16(%rdx), %xmm0, %xmm7;
1177        vmovdqu %xmm0, 8 * 16(%rsi);
1178
1179        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1180        vpxor 9 * 16(%rdx), %xmm0, %xmm6;
1181        vmovdqu %xmm0, 9 * 16(%rsi);
1182
1183        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1184        vpxor 10 * 16(%rdx), %xmm0, %xmm5;
1185        vmovdqu %xmm0, 10 * 16(%rsi);
1186
1187        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1188        vpxor 11 * 16(%rdx), %xmm0, %xmm4;
1189        vmovdqu %xmm0, 11 * 16(%rsi);
1190
1191        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1192        vpxor 12 * 16(%rdx), %xmm0, %xmm3;
1193        vmovdqu %xmm0, 12 * 16(%rsi);
1194
1195        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1196        vpxor 13 * 16(%rdx), %xmm0, %xmm2;
1197        vmovdqu %xmm0, 13 * 16(%rsi);
1198
1199        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1200        vpxor 14 * 16(%rdx), %xmm0, %xmm1;
1201        vmovdqu %xmm0, 14 * 16(%rsi);
1202
1203        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1204        vpxor 15 * 16(%rdx), %xmm0, %xmm15;
1205        vmovdqu %xmm15, 0 * 16(%rax);
1206        vmovdqu %xmm0, 15 * 16(%rsi);
1207
            /* One more step gives the tweak for the block after these 16. */
1208        gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1209        vmovdqu %xmm0, (%rcx);
1210
            /*
             * Open-coded whitening: load the 64-bit key word at qword index
             * %r8 of key_table, shuffle it with .Lpack_bswap and XOR it
             * into all 16 blocks. Scratch slots 0, 14 and 15 are loaded
             * into %xmm0, %xmm14 and %xmm15. %xmm15 holds the key, so it is
             * overwritten last.
             */
1211        /* inpack16_pre: */
1212        vmovq (key_table)(CTX, %r8, 8), %xmm15;
1213        vpshufb .Lpack_bswap, %xmm15, %xmm15;
1214        vpxor 0 * 16(%rax), %xmm15, %xmm0;
1215        vpxor %xmm1, %xmm15, %xmm1;
1216        vpxor %xmm2, %xmm15, %xmm2;
1217        vpxor %xmm3, %xmm15, %xmm3;
1218        vpxor %xmm4, %xmm15, %xmm4;
1219        vpxor %xmm5, %xmm15, %xmm5;
1220        vpxor %xmm6, %xmm15, %xmm6;
1221        vpxor %xmm7, %xmm15, %xmm7;
1222        vpxor %xmm8, %xmm15, %xmm8;
1223        vpxor %xmm9, %xmm15, %xmm9;
1224        vpxor %xmm10, %xmm15, %xmm10;
1225        vpxor %xmm11, %xmm15, %xmm11;
1226        vpxor %xmm12, %xmm15, %xmm12;
1227        vpxor %xmm13, %xmm15, %xmm13;
1228        vpxor 14 * 16(%rax), %xmm15, %xmm14;
1229        vpxor 15 * 16(%rax), %xmm15, %xmm15;
1230
            /*
             * Indirect call to the routine chosen by the entry point, made
             * through the CALL_NOSPEC macro rather than a plain indirect
             * call.
             */
1231        CALL_NOSPEC %r9;
1232
            /* The scratch area is not referenced after this point. */
1233        addq $(16 * 16), %rsp;
1234
            /*
             * XOR the tweaks saved in dst back into the results. Result
             * registers pair with blocks in the order xmm7..xmm0 (blocks
             * 0..7), then xmm15..xmm8 (blocks 8..15), matching the
             * write_output argument order below.
             */
1235        vpxor 0 * 16(%rsi), %xmm7, %xmm7;
1236        vpxor 1 * 16(%rsi), %xmm6, %xmm6;
1237        vpxor 2 * 16(%rsi), %xmm5, %xmm5;
1238        vpxor 3 * 16(%rsi), %xmm4, %xmm4;
1239        vpxor 4 * 16(%rsi), %xmm3, %xmm3;
1240        vpxor 5 * 16(%rsi), %xmm2, %xmm2;
1241        vpxor 6 * 16(%rsi), %xmm1, %xmm1;
1242        vpxor 7 * 16(%rsi), %xmm0, %xmm0;
1243        vpxor 8 * 16(%rsi), %xmm15, %xmm15;
1244        vpxor 9 * 16(%rsi), %xmm14, %xmm14;
1245        vpxor 10 * 16(%rsi), %xmm13, %xmm13;
1246        vpxor 11 * 16(%rsi), %xmm12, %xmm12;
1247        vpxor 12 * 16(%rsi), %xmm11, %xmm11;
1248        vpxor 13 * 16(%rsi), %xmm10, %xmm10;
1249        vpxor 14 * 16(%rsi), %xmm9, %xmm9;
1250        vpxor 15 * 16(%rsi), %xmm8, %xmm8;
1251        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1252                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1253                     %xmm8, %rsi);
1254
1255        FRAME_END
1256        ret;
1257ENDPROC(camellia_xts_crypt_16way)
1258
1259ENTRY(camellia_xts_enc_16way)
1260        /* input:
1261         *      %rdi: ctx, CTX
1262         *      %rsi: dst (16 blocks)
1263         *      %rdx: src (16 blocks)
1264         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1265         */
            /*
             * XTS encryption entry point: sets up the two extra arguments
             * of camellia_xts_crypt_16way (%r8 = whitening key index,
             * %r9 = block routine) and jumps to it. Because this is a jmp
             * and not a call, the shared body's ret returns straight to
             * this function's caller.
             */
1266        xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1267
1268        leaq __camellia_enc_blk16, %r9;
1269
1270        jmp camellia_xts_crypt_16way;
1271ENDPROC(camellia_xts_enc_16way)
1272
1273ENTRY(camellia_xts_dec_16way)
1274        /* input:
1275         *      %rdi: ctx, CTX
1276         *      %rsi: dst (16 blocks)
1277         *      %rdx: src (16 blocks)
1278         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1279         */
            /*
             * XTS decryption entry point: sets up the two extra arguments
             * of camellia_xts_crypt_16way (%r8 = whitening key index,
             * %r9 = block routine) and jumps to it. Because this is a jmp
             * and not a call, the shared body's ret returns straight to
             * this function's caller.
             */
1280
            /*
             * %r8 = 24 when key_length == 16, otherwise 32. %eax is only a
             * scratch source for the cmov.
             */
1281        cmpl $16, key_length(CTX);
1282        movl $32, %r8d;
1283        movl $24, %eax;
1284        cmovel %eax, %r8d;  /* input whitening key, last for dec */
1285
1286        leaq __camellia_dec_blk16, %r9;
1287
1288        jmp camellia_xts_crypt_16way;
1289ENDPROC(camellia_xts_dec_16way)
1290