/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.data
.align 16

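/*
 * .Lbswap128_mask byte-reverses each 128-bit lane; load_ctr_16way (from
 * glue_helper-asm-avx2.S) uses it to convert the CTR counter between its
 * in-memory byte order and the integer order needed for incrementing.
 * The two xts_gf128mul masks drive the tweak computation in load_xts_16way:
 * with two 128-bit tweaks packed per ymm register, mask_0 multiplies a
 * tweak by α (shift left by 1, conditional xor with 0x87, the low byte of
 * the GF(2^128) reduction polynomial x^128 + x^7 + x^2 + x + 1) and mask_1
 * by α² (shift left by 2, with 0x10e = 2·0x87 covering the extra carry bit).
 */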
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

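/*
 * RNOT is kept filled with all-ones (vpcmpeqd at the top of the round
 * functions): AVX2 has no vector NOT, so the s-box macros use
 * "vpxor RNOT, x, x" instead.  tp is a scratch register for the s-boxes.
 */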
#define RNOT %ymm0
#define tp  %ymm1

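/*
 * Sixteen blocks are processed at once: after read_blocks, RA1/RB1/RC1/RD1
 * hold the four 32-bit state words of one set of eight blocks in word-sliced
 * form, and the *2 registers hold the other eight blocks.  RK0-RK3 carry the
 * broadcast round key words; RK0x-RK3x are their xmm views, passed to the
 * 128-bit halves of the CTR/XTS load helpers.
 */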
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

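/*
 * S0-S7 and SI0-SI7 are the Serpent s-boxes and their inverses as bitsliced
 * boolean networks (and/or/xor/not only).  Each is split into a _1 and a _2
 * half so the SP macro below can schedule round-key loads between the halves.
 */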
#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

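/*
 * get_key broadcasts 32-bit word j of the round-i subkey from the expanded
 * key at CTX into all eight lanes of t.
 */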
#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

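/*
 * K2 mixes the round-i key into both eight-block sets (the ##1/##2 suffixes
 * select the two register sets); it is used for the initial and final key
 * mixing, where no linear transformation follows.
 */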
#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

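/*
 * LK2: Serpent linear transformation (x0 <<< 13; x2 <<< 3; x1 ^= x0 ^ x2;
 * x3 ^= x2 ^ (x0 << 3); x1 <<< 1; x3 <<< 7; x0 ^= x1 ^ x3;
 * x2 ^= x3 ^ (x1 << 7); x0 <<< 5; x2 <<< 22) followed by mixing in the
 * round-i key.  The extra-indented block is the same computation on the
 * second eight-block set, interleaved with the first for instruction-level
 * parallelism; the get_key loads are likewise spread through the sequence.
 */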
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

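/*
 * KL2: inverse of LK2, for decryption: first undo the round-i key mixing,
 * then apply the inverse linear transformation (rotate right by 5 and 22,
 * undo the xors and the << 3 / << 7 shifts, rotate right by 1, 7, 13 and 3).
 */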
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

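/*
 * S applies s-box SBOX to both block sets.  SP does the same but schedules
 * the get_key loads for the following K2/KL2 between the s-box halves,
 * hiding the broadcast latency inside the round.
 */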
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

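/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit words within each 128-bit
 * lane, so read_blocks/write_blocks convert between the packed in-memory
 * block layout and the word-sliced layout the s-box networks operate on.
 */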
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

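        /* 32 rounds: key mixing, s-box (S0..S7 cycling) and linear
         * transform.  The shuffled register arguments track the permutation
         * of state words each LK2 leaves behind; K2/S/LK2 expand to both
         * block sets via the ##1/##2 suffixes. */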
                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

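        /* 32 rounds in reverse: inverse s-boxes SI7..SI0, with KL2 undoing
         * the key mixing and the linear transformation of each round. */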
                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

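/*
 * For reference, the C-side declarations of the entry points below, as
 * found in the matching serpent_avx2_glue.c (shown here for convenience;
 * consult the glue code for the authoritative prototypes):
 *
 *  asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
 *                                        const u8 *src);
 *  asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
 *                                        const u8 *src);
 *  asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst,
 *                                        const u128 *src);
 *  asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
 *                                    le128 *iv);
 *  asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
 *                                        const u8 *src, le128 *iv);
 *  asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
 *                                        const u8 *src, le128 *iv);
 */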
ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

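        /* store_cbc_16way (glue_helper-asm-avx2.S) xors each decrypted block
         * with the preceding ciphertext block still in src; the xor of the
         * first block with the IV is left to the C glue code. */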
        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_16way)

ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
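        /* load_ctr_16way (glue_helper-asm-avx2.S) expands the counter at
         * %rcx into 16 consecutive counter blocks, byte-swapping via
         * .Lbswap128_mask, and advances the IV in memory; store_ctr_16way
         * then xors the encrypted counters with src into dst. */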
        FRAME_BEGIN

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ctr_16way)

ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
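        /* load_xts_16way (glue_helper-asm-avx2.S) derives the 16 tweaks from
         * the IV with the gf128mul masks above, xors them into the source
         * blocks, and saves them for the second tweak xor that
         * store_xts_16way applies after encryption.  serpent_xts_dec_16way
         * below is identical except for calling the decryption core. */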
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_dec_16way)