linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S
/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *      http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;

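/*
 * filter_8bit applies an 8-bit to 8-bit affine transform using two 16-entry
 * nibble tables and vpshufb.  Roughly, per byte (a sketch of the intent):
 *
 *   lo = lo_t[x & 0x0f];
 *   hi = hi_t[x >> 4];
 *   x  = lo ^ hi;
 *
 * The split is valid because the pre/post filters used below are affine,
 * so the low- and high-nibble contributions can be looked up separately
 * and combined with XOR.
 */
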
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register holding pointer to the byte-sliced CD state in memory
 *   key: memory operand with the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
                  t7, mem_cd, key) \
        /* \
         * S-function with AES subbytes \
         */ \
        vmovdqa .Linv_shift_row, t4; \
        vbroadcastss .L0f0f0f0f, t7; \
        vmovdqa .Lpre_tf_lo_s1, t0; \
        vmovdqa .Lpre_tf_hi_s1, t1; \
        \
        /* AES inverse shift rows */ \
        vpshufb t4, x0, x0; \
        vpshufb t4, x7, x7; \
        vpshufb t4, x1, x1; \
        vpshufb t4, x4, x4; \
        vpshufb t4, x2, x2; \
        vpshufb t4, x5, x5; \
        vpshufb t4, x3, x3; \
        vpshufb t4, x6, x6; \
        \
        /* prefilter sboxes 1, 2 and 3 */ \
        vmovdqa .Lpre_tf_lo_s4, t2; \
        vmovdqa .Lpre_tf_hi_s4, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x1, t0, t1, t7, t6); \
        filter_8bit(x4, t0, t1, t7, t6); \
        filter_8bit(x2, t0, t1, t7, t6); \
        filter_8bit(x5, t0, t1, t7, t6); \
        \
        /* prefilter sbox 4 */ \
        vpxor t4, t4, t4; \
        filter_8bit(x3, t2, t3, t7, t6); \
        filter_8bit(x6, t2, t3, t7, t6); \
        \
        /* AES subbytes + AES shift rows */ \
        vmovdqa .Lpost_tf_lo_s1, t0; \
        vmovdqa .Lpost_tf_hi_s1, t1; \
        vaesenclast t4, x0, x0; \
        vaesenclast t4, x7, x7; \
        vaesenclast t4, x1, x1; \
        vaesenclast t4, x4, x4; \
        vaesenclast t4, x2, x2; \
        vaesenclast t4, x5, x5; \
        vaesenclast t4, x3, x3; \
        vaesenclast t4, x6, x6; \
        \
        /* postfilter sboxes 1 and 4 */ \
        vmovdqa .Lpost_tf_lo_s3, t2; \
        vmovdqa .Lpost_tf_hi_s3, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x3, t0, t1, t7, t6); \
        filter_8bit(x6, t0, t1, t7, t6); \
        \
        /* postfilter sbox 3 */ \
        vmovdqa .Lpost_tf_lo_s2, t4; \
        vmovdqa .Lpost_tf_hi_s2, t5; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
        vpxor t6, t6, t6; \
        vmovq key, t0; \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        \
        vpsrldq $5, t0, t5; \
        vpsrldq $1, t0, t1; \
        vpsrldq $2, t0, t2; \
        vpsrldq $3, t0, t3; \
        vpsrldq $4, t0, t4; \
        vpshufb t6, t0, t0; \
        vpshufb t6, t1, t1; \
        vpshufb t6, t2, t2; \
        vpshufb t6, t3, t3; \
        vpshufb t6, t4, t4; \
        vpsrldq $2, t5, t7; \
        vpshufb t6, t7, t7; \
        \
        /* \
         * P-function \
         */ \
        vpxor x5, x0, x0; \
        vpxor x6, x1, x1; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
        \
        /* \
         * Add key material and result to CD (x becomes new CD) \
         */ \
        \
        vpxor t3, x4, x4; \
        vpxor 0 * 16(mem_cd), x4, x4; \
        \
        vpxor t2, x5, x5; \
        vpxor 1 * 16(mem_cd), x5, x5; \
        \
        vpsrldq $1, t5, t3; \
        vpshufb t6, t5, t5; \
        vpshufb t6, t3, t6; \
        \
        vpxor t1, x6, x6; \
        vpxor 2 * 16(mem_cd), x6, x6; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 16(mem_cd), x7, x7; \
        \
        vpxor t7, x0, x0; \
        vpxor 4 * 16(mem_cd), x0, x0; \
        \
        vpxor t6, x1, x1; \
        vpxor 5 * 16(mem_cd), x1, x1; \
        \
        vpxor t5, x2, x2; \
        vpxor 6 * 16(mem_cd), x2, x2; \
        \
        vpxor t4, x3, x3; \
        vpxor 7 * 16(mem_cd), x3, x3;

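/*
 * Rough per-block reference for one roundsm16 call, with ab/cd the two
 * 64-bit halves of a block (byte-sliced across the vectors here):
 *
 *   cd ^= P(S(ab)) ^ subkey;
 *
 * S is the Camellia sbox layer (sboxes 1-4, built from the AES sbox with
 * the pre/post filters and aesenclast) and P is Camellia's byte-wise linear
 * diffusion.  The specification writes the Feistel step as
 * cd ^= P(S(ab ^ subkey)); folding the subkey XOR into the output instead
 * works because each round's output is the next round's S input, with the
 * boundary rounds covered by the pre-/post-whitening and the key-table
 * layout produced by the C key-setup code.
 */
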
/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
        roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
                  %rcx, (%r9));
        ret;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
        roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
                  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
                  %rax, (%r9));
        ret;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
        leaq (key_table + (i) * 8)(CTX), %r9; \
        call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
        \
        vmovdqu x4, 0 * 16(mem_cd); \
        vmovdqu x5, 1 * 16(mem_cd); \
        vmovdqu x6, 2 * 16(mem_cd); \
        vmovdqu x7, 3 * 16(mem_cd); \
        vmovdqu x0, 4 * 16(mem_cd); \
        vmovdqu x1, 5 * 16(mem_cd); \
        vmovdqu x2, 6 * 16(mem_cd); \
        vmovdqu x3, 7 * 16(mem_cd); \
        \
        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
        call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
        \
        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
        /* Store new AB state */ \
        vmovdqu x0, 0 * 16(mem_ab); \
        vmovdqu x1, 1 * 16(mem_ab); \
        vmovdqu x2, 2 * 16(mem_ab); \
        vmovdqu x3, 3 * 16(mem_ab); \
        vmovdqu x4, 4 * 16(mem_ab); \
        vmovdqu x5, 5 * 16(mem_ab); \
        vmovdqu x6, 6 * 16(mem_ab); \
        vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i) \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i) \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
        two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

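/*
 * enc_rounds16/dec_rounds16 run three Feistel pairs each.  The index `i'
 * and the +1/-1 direction passed to two_roundsm16 select consecutive
 * 64-bit subkeys at key_table + index * 8, walking the key table forwards
 * for encryption and backwards for decryption.
 */
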
/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpabsb t0, t0; \
        \
        vpcmpgtb v1, zero, t1; \
        vpaddb v1, v1, v1; \
        vpabsb t1, t1; \
        \
        vpcmpgtb v2, zero, t2; \
        vpaddb v2, v2, v2; \
        vpabsb t2, t2; \
        \
        vpor t0, v1, v1; \
        \
        vpcmpgtb v3, zero, t0; \
        vpaddb v3, v3, v3; \
        vpabsb t0, t0; \
        \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;

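/*
 * rol32_1_16 rotates byte-sliced 32-bit words left by one bit: v0 holds the
 * least significant byte of every word and v3 the most significant one.
 * Per byte slice the trick is roughly
 *
 *   carry = v >> 7;     (vpcmpgtb against zero turns the sign bit into 0xff,
 *                        vpabsb turns that into 0x01)
 *   v     = v + v;      (shift each byte left by one)
 *   next |= carry;      (carry moves into the next slice; v3 wraps into v0)
 */
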
/*
 * IN:
 *   l0..l7: byte-sliced AB state preloaded
 *   l: byte-sliced AB state in memory
 *   r: byte-sliced CD state in memory
 * OUT:
 *   l0..l7, l: new byte-sliced AB state (FL applied)
 *   r: new byte-sliced CD state (FL⁻¹ applied)
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
              tt1, tt2, tt3, kll, klr, krl, krr) \
        /* \
         * t0 = kll; \
         * t0 &= ll; \
         * lr ^= rol32(t0, 1); \
         */ \
        vpxor tt0, tt0, tt0; \
        vmovd kll, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        vpand l2, t2, t2; \
        vpand l3, t3, t3; \
        \
        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
        vmovdqu l4, 4 * 16(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 16(l); \
        vpxor l6, t2, l6; \
        vmovdqu l6, 6 * 16(l); \
        vpxor l7, t3, l7; \
        vmovdqu l7, 7 * 16(l); \
        \
        /* \
         * t2 = krr; \
         * t2 |= rr; \
         * rl ^= t2; \
         */ \
        \
        vmovd krr, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor 4 * 16(r), t0, t0; \
        vpor 5 * 16(r), t1, t1; \
        vpor 6 * 16(r), t2, t2; \
        vpor 7 * 16(r), t3, t3; \
        \
        vpxor 0 * 16(r), t0, t0; \
        vpxor 1 * 16(r), t1, t1; \
        vpxor 2 * 16(r), t2, t2; \
        vpxor 3 * 16(r), t3, t3; \
        vmovdqu t0, 0 * 16(r); \
        vmovdqu t1, 1 * 16(r); \
        vmovdqu t2, 2 * 16(r); \
        vmovdqu t3, 3 * 16(r); \
        \
        /* \
         * t2 = krl; \
         * t2 &= rl; \
         * rr ^= rol32(t2, 1); \
         */ \
        vmovd krl, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand 0 * 16(r), t0, t0; \
        vpand 1 * 16(r), t1, t1; \
        vpand 2 * 16(r), t2, t2; \
        vpand 3 * 16(r), t3, t3; \
        \
        rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor 4 * 16(r), t0, t0; \
        vpxor 5 * 16(r), t1, t1; \
        vpxor 6 * 16(r), t2, t2; \
        vpxor 7 * 16(r), t3, t3; \
        vmovdqu t0, 4 * 16(r); \
        vmovdqu t1, 5 * 16(r); \
        vmovdqu t2, 6 * 16(r); \
        vmovdqu t3, 7 * 16(r); \
        \
        /* \
         * t0 = klr; \
         * t0 |= lr; \
         * ll ^= t0; \
         */ \
        \
        vmovd klr, t0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vpor l6, t2, t2; \
        vpor l7, t3, t3; \
        \
        vpxor l0, t0, l0; \
        vmovdqu l0, 0 * 16(l); \
        vpxor l1, t1, l1; \
        vmovdqu l1, 1 * 16(l); \
        vpxor l2, t2, l2; \
        vmovdqu l2, 2 * 16(l); \
        vpxor l3, t3, l3; \
        vmovdqu l3, 3 * 16(l);

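/*
 * For reference, the scalar FL/FL⁻¹ functions that fls16 applies to the two
 * byte-sliced halves (x split into 32-bit words x1||x2, key into k1||k2):
 *
 *   FL(x, k):    x2 ^= rol32(x1 & k1, 1);  x1 ^= (x2 | k2);
 *   FL⁻¹(x, k):  x1 ^= (x2 | k2);          x2 ^= rol32(x1 & k1, 1);
 *
 * The l half gets FL with kll||klr and the r half gets FL⁻¹ with krl||krr.
 * (Note that the decryption code below passes the two 64-bit key halves to
 * fls16 in swapped order.)
 */
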
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

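/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit lanes and
 * transposes it with punpck: afterwards lane j of output register i is
 * lane i of input register j.  t1/t2 are clobbered as scratch.
 */
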
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
                         b3, c3, d3, st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vmovdqu .Lshufb_16x16b, a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

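/*
 * Byte-slicing layout used below: the 16 blocks are transposed so that each
 * 16-byte vector holds one byte position of all 16 blocks (one byte per
 * block) rather than one full block.  Eight such vectors make up the AB
 * half of the state and another eight the CD half, which is what lets a
 * single xmm instruction apply the same sbox/diffusion step to all 16
 * blocks at once.
 */
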
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio, key) \
        vmovq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor 0 * 16(rio), x0, y7; \
        vpxor 1 * 16(rio), x0, y6; \
        vpxor 2 * 16(rio), x0, y5; \
        vpxor 3 * 16(rio), x0, y4; \
        vpxor 4 * 16(rio), x0, y3; \
        vpxor 5 * 16(rio), x0, y2; \
        vpxor 6 * 16(rio), x0, y1; \
        vpxor 7 * 16(rio), x0, y0; \
        vpxor 8 * 16(rio), x0, x7; \
        vpxor 9 * 16(rio), x0, x6; \
        vpxor 10 * 16(rio), x0, x5; \
        vpxor 11 * 16(rio), x0, x4; \
        vpxor 12 * 16(rio), x0, x3; \
        vpxor 13 * 16(rio), x0, x2; \
        vpxor 14 * 16(rio), x0, x1; \
        vpxor 15 * 16(rio), x0, x0;

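/*
 * inpack16_pre loads 64 bits of key material with vmovq, byte-swaps each
 * 32-bit word of it and zero-extends it to 128 bits via .Lpack_bswap, and
 * XORs the result into every block as it is loaded from rio.
 */
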
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                         y5, y6, y7, (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 16(mem_ab); \
        vmovdqu x1, 1 * 16(mem_ab); \
        vmovdqu x2, 2 * 16(mem_ab); \
        vmovdqu x3, 3 * 16(mem_ab); \
        vmovdqu x4, 4 * 16(mem_ab); \
        vmovdqu x5, 5 * 16(mem_ab); \
        vmovdqu x6, 6 * 16(mem_ab); \
        vmovdqu x7, 7 * 16(mem_ab); \
        vmovdqu y0, 0 * 16(mem_cd); \
        vmovdqu y1, 1 * 16(mem_cd); \
        vmovdqu y2, 2 * 16(mem_cd); \
        vmovdqu y3, 3 * 16(mem_cd); \
        vmovdqu y4, 4 * 16(mem_cd); \
        vmovdqu y5, 5 * 16(mem_cd); \
        vmovdqu y6, 6 * 16(mem_cd); \
        vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
        byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
                         y7, x3, x7, stack_tmp0, stack_tmp1); \
        \
        vmovdqu x0, stack_tmp0; \
        \
        vmovq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor x0, y7, y7; \
        vpxor x0, y6, y6; \
        vpxor x0, y5, y5; \
        vpxor x0, y4, y4; \
        vpxor x0, y3, y3; \
        vpxor x0, y2, y2; \
        vpxor x0, y1, y1; \
        vpxor x0, y0, y0; \
        vpxor x0, x7, x7; \
        vpxor x0, x6, x6; \
        vpxor x0, x5, x5; \
        vpxor x0, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x0, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio) \
        vmovdqu x0, 0 * 16(rio); \
        vmovdqu x1, 1 * 16(rio); \
        vmovdqu x2, 2 * 16(rio); \
        vmovdqu x3, 3 * 16(rio); \
        vmovdqu x4, 4 * 16(rio); \
        vmovdqu x5, 5 * 16(rio); \
        vmovdqu x6, 6 * 16(rio); \
        vmovdqu x7, 7 * 16(rio); \
        vmovdqu y0, 8 * 16(rio); \
        vmovdqu y1, 9 * 16(rio); \
        vmovdqu y2, 10 * 16(rio); \
        vmovdqu y3, 11 * 16(rio); \
        vmovdqu y4, 12 * 16(rio); \
        vmovdqu y5, 13 * 16(rio); \
        vmovdqu y6, 14 * 16(rio); \
        vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section        .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

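/*
 * pshufb mask used on the whitening key loaded with vmovq: byte-swap each
 * 32-bit word of the low 64 bits and zero the upper 64 bits (0x80 entries
 * select zero).
 */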
.Lpack_bswap:
        .long 0x00010203
        .long 0x04050607
        .long 0x80808080
        .long 0x80808080

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
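/*
 * (AESENCLAST does ShiftRows, then SubBytes, then XOR with the round key;
 * shuffling the state with .Linv_shift_row first and using an all-zero
 * round key therefore leaves pure AES SubBytes, which the pre/post filters
 * above turn into the Camellia sboxes.)
 */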

/* 4-bit mask */
.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 256 bytes
         *      %xmm0..%xmm15: 16 plaintext blocks
         * output:
         *      %xmm0..%xmm15: 16 encrypted blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 16(%rax), %rcx;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %rcx);

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 0);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX),
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 8);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX),
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 16);

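        /*
         * 18 rounds for 128-bit keys, 24 rounds for 192/256-bit keys;
         * %r8d holds the index of the post-whitening subkey
         * (key_table + %r8 * 8), used by outunpack16 below.
         */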
        movl $24, %r8d;
        cmpl $16, key_length(CTX);
        jne .Lenc_max32;

.Lenc_done:
        /* load CD for output */
        vmovdqu 0 * 16(%rcx), %xmm8;
        vmovdqu 1 * 16(%rcx), %xmm9;
        vmovdqu 2 * 16(%rcx), %xmm10;
        vmovdqu 3 * 16(%rcx), %xmm11;
        vmovdqu 4 * 16(%rcx), %xmm12;
        vmovdqu 5 * 16(%rcx), %xmm13;
        vmovdqu 6 * 16(%rcx), %xmm14;
        vmovdqu 7 * 16(%rcx), %xmm15;

        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

        FRAME_END
        ret;

.align 8
.Lenc_max32:
        movl $32, %r8d;

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX),
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX));

        enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 24);

        jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 256 bytes
         *      %r8d: 24 for 16 byte key, 32 for larger
         *      %xmm0..%xmm15: 16 encrypted blocks
         * output:
         *      %xmm0..%xmm15: 16 plaintext blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 16(%rax), %rcx;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %rcx);

        cmpl $32, %r8d;
        je .Ldec_max32;

.Ldec_max24:
        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 16);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX),
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX));

        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 8);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX),
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX));

        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 0);

        /* load CD for output */
        vmovdqu 0 * 16(%rcx), %xmm8;
        vmovdqu 1 * 16(%rcx), %xmm9;
        vmovdqu 2 * 16(%rcx), %xmm10;
        vmovdqu 3 * 16(%rcx), %xmm11;
        vmovdqu 4 * 16(%rcx), %xmm12;
        vmovdqu 5 * 16(%rcx), %xmm13;
        vmovdqu 6 * 16(%rcx), %xmm14;
        vmovdqu 7 * 16(%rcx), %xmm15;

        outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

        FRAME_END
        ret;

.align 8
.Ldec_max32:
        dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %rcx, 24);

        fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
              %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
              %xmm15,
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX),
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX));

        jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_FUNC_START(camellia_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq    %rsi, %rax;

        call __camellia_enc_blk16;

        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_FUNC_START(camellia_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq    %rsi, %rax;

        call __camellia_dec_blk16;

        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_FUNC_START(camellia_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

        /*
         * dst might still be in-use (in case dst == src), so use stack for
         * temporary storage.
         */
        subq $(16 * 16), %rsp;
        movq %rsp, %rax;

        call __camellia_dec_blk16;

        addq $(16 * 16), %rsp;

        vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
        vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
        vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
        vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
        vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
        vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
        vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
        vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
        vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
        vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
        vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
        vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
        vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
        vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
        vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
        write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
                     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
                     %xmm8, %rsi);

        FRAME_END
        ret;
SYM_FUNC_END(camellia_cbc_dec_16way)