linux/arch/x86/crypto/serpent-avx2-asm_64.S

/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section        .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.section        .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

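/*
 * Editor's note (added commentary, not in the original file): these two
 * masks drive the vectorized XTS tweak update t := t * α over GF(2¹²⁸)
 * with reduction polynomial x¹²⁸ + x⁷ + x² + x + 1. Each mask appears
 * to pack a shift count (high qword: 1 resp. 2) together with the
 * matching reduction constant (low qword: 0x87 for x¹²⁸, 0x10e for
 * x¹²⁹), letting the helpers in glue_helper-asm-avx2.S advance two
 * tweaks per step. A scalar C sketch of a single doubling, for
 * illustration only (the helper name is hypothetical):
 *
 *	static void xts_mul_alpha(u64 t[2])		// t little endian
 *	{
 *		u64 carry = t[1] >> 63;			// bit 127 falls out
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);	// fold back into low bits
 *	}
 */
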
.text

#define CTX %rdi

#define RNOT %ymm0
#define tp  %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

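/*
 * Editor's note (added commentary): this implementation processes 16
 * blocks at once. After the transpose below, each of the RA/RB/RC/RD
 * registers holds one 32-bit word position taken from eight different
 * blocks (one dword per ymm lane), the usual word-sliced Serpent
 * layout; the *1/*2 register pairs carry blocks 0-7 and 8-15
 * respectively. RNOT is kept all-ones for the whole run so that NOT
 * can be expressed as a single vpxor against it.
 */
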
#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

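/*
 * Editor's note (added commentary): S0-S7 above and below (and SI0-SI7
 * further down) are the eight Serpent S-boxes and their inverses
 * expressed as branch-free AND/OR/XOR/NOT sequences over the sliced
 * registers, so one expansion evaluates the 4-bit S-box for all eight
 * blocks of a register group at once. Each S-box is split into _1/_2
 * halves so the two 8-block halves can be interleaved; tp and x4 act
 * as scratch, and NOT is implemented as XOR with RNOT.
 */
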
#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

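/*
 * Editor's note (assumption, based on the generic serpent code): CTX
 * points at the expanded key, 33 round subkeys of four 32-bit words
 * stored flat, roughly:
 *
 *	struct serpent_ctx {
 *		u32 expkey[33 * 4];	// round i, word j at expkey[4*i + j]
 *	};
 *
 * so get_key(i, j, t) broadcasts subkey word j of round i into all
 * eight dword lanes of t, matching the sliced block layout.
 */
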
#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

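/*
 * Editor's note (added commentary): K2 XORs the four words of round
 * subkey i into both 8-block halves. Here and in the macros below, the
 * deeper-indented lines are the second half's copy of the instruction
 * stream, interleaved by hand with the first half's to hide latency.
 */
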
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

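/*
 * Editor's note (added commentary): LK2 is Serpent's linear
 * transformation followed by the key mixing for round i, i.e. per
 * 32-bit word:
 *
 *	x0 = rol32(x0, 13);            x2 = rol32(x2, 3);
 *	x1 = rol32(x1 ^ x0 ^ x2, 1);   x3 = rol32(x3 ^ x2 ^ (x0 << 3), 7);
 *	x0 = rol32(x0 ^ x1 ^ x3, 5);   x2 = rol32(x2 ^ x3 ^ (x1 << 7), 22);
 *	x0 ^= k[4*i]; x1 ^= k[4*i+1]; x2 ^= k[4*i+2]; x3 ^= k[4*i+3];
 *
 * AVX2 has no dword rotate instruction, so each rol32 is emulated by
 * the vpslld/vpsrld/vpor triples above; the get_key() broadcasts are
 * scattered through the sequence to overlap the loads with ALU work.
 */
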
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

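/*
 * Editor's note (added commentary): KL2 is the inverse of LK2, used on
 * the decryption path: it first undoes the key mixing for round i and
 * then applies the inverse linear transformation, running the same
 * shift/rotate network in reverse (vpsrld and vpslld swap roles,
 * giving right rotates).
 */
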
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

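/*
 * Editor's note (added commentary): a 4x4 transpose of 32-bit words.
 * The vpunpck{l,h}dq/vpunpck{l,h}qdq pairs operate within each 128-bit
 * lane, so this turns four consecutive 16-byte blocks per lane (rows)
 * into one word position per register (columns). The transpose is its
 * own inverse, which is why read_blocks and write_blocks below can
 * share it.
 */
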
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

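/*
 * Editor's note (assumption): the C glue code is expected to declare
 * the ENTRY points below roughly as follows; the exact parameter types
 * are illustrative, not taken from this file:
 *
 *	asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst,
 *					      const u8 *src);
 *	asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst,
 *					      const u8 *src);
 *	asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst,
 *					      const u8 *src);
 *	asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst,
 *					  const u8 *src, le128 *iv);
 *	asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst,
 *					      const u8 *src, le128 *iv);
 *	asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst,
 *					      const u8 *src, le128 *iv);
 *
 * All of them follow the SysV convention assumed by the %rdi/%rsi/
 * %rdx/%rcx comments in each function header.
 */
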
ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_16way)

ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
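        /*
         * Editor's note (added commentary): the glue code keeps the
         * CTR IV little endian; load_ctr_16way uses .Lbswap128_mask to
         * convert to the big-endian counter bytes that actually get
         * encrypted, generates 16 consecutive counter blocks, and
         * appears to write the incremented counter back through %rcx
         * for the caller.
         */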
        FRAME_BEGIN

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ctr_16way)

ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_dec_16way)
