/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

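/*
 * vpshufb shuffle mask that reverses the byte order of a 128-bit value;
 * apparently not referenced by any of the routines below.
 */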
.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

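/*
 * Register usage:
 *   CTX          - pointer to the expanded round keys (%rdi)
 *   RNOT         - all-ones constant, used to implement NOT as XOR
 *   tp           - scratch register for the S-box formulas
 *   RA..RE{1,2}  - two Serpent states of eight blocks each (16 blocks
 *                  total); in the macros below, instructions for the
 *                  second state are indented one extra level so the
 *                  interleaving of the two states is visible
 *   RK0..RK3     - broadcast round-key words (the RK*x xmm aliases are
 *                  not referenced by the routines below)
 */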
#define CTX %rdi

#define RNOT %ymm0
#define tp  %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

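/*
 * The Serpent S-boxes S0-S7 and their inverses SI0-SI7, expressed as
 * branchless sequences of bitwise operations on the vectorized state
 * words.  NOT is implemented as XOR with the all-ones RNOT.  Each
 * S-box is split into a _1 and a _2 half so that round-key loads can
 * be scheduled between the halves (see the SP macro below).
 */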
#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

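/*
 * get_key(i, j, t): broadcast 32-bit word j of round key i (rounds are
 * four words apart in the expanded key at CTX) into all dword lanes
 * of t.
 *
 * K2: XOR round key i into both eight-block states.
 */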
#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

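/*
 * LK2: apply the Serpent linear transformation and then XOR in round
 * key i, for both eight-block states.  AVX2 has no dword rotate, so
 * each rotation is built from vpslld/vpsrld/vpor.  Operations on the
 * two states are interleaved, and the get_key() loads are spread
 * through the sequence, to hide instruction latencies.
 */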
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

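/*
 * KL2: XOR in round key i and then apply the inverse of the Serpent
 * linear transformation, for both eight-block states.  This is the
 * mirror image of LK2 and is used on the decryption path.
 */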
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

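/*
 * S: apply one S-box to both eight-block states.
 *
 * SP: as S, but additionally preload the four words of round key i,
 * which the KL2 invocation that follows each SP then consumes.
 */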
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

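/*
 * transpose_4x4: transpose a 4x4 matrix of dwords independently within
 * each 128-bit lane.  read_blocks/write_blocks use it to convert eight
 * blocks between block order and the word-sliced order the round
 * macros expect, where each ymm register holds one 32-bit state word
 * of all eight blocks.
 */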
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

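/*
 * Encryption core: 32 rounds of key mixing, an S-box (S0..S7, cycled
 * four times) and the linear transformation, finished by the final key
 * mixing with round key 32.  The register lists passed to S/LK2 rotate
 * from round to round because the S-box formulas leave their outputs
 * in permuted registers.
 */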
.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        RET;
SYM_FUNC_END(__serpent_enc_blk16)

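/*
 * Decryption core: the encryption sequence run backwards, starting
 * from round key 32 and applying the inverse S-boxes SI7..SI0 with
 * KL2 (key mixing followed by the inverse linear transformation).
 */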
.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        RET;
SYM_FUNC_END(__serpent_dec_blk16)

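/*
 * The exported 16-way helpers below wrap the block cipher cores.
 * load_16way/store_16way and store_cbc_16way come from
 * glue_helper-asm-avx2.S; store_cbc_16way additionally XORs each
 * decrypted block with the preceding ciphertext block as required by
 * CBC decryption (the first block's IV is handled by the C glue code).
 * vzeroupper avoids AVX-to-SSE transition penalties when returning to
 * C code, and FRAME_BEGIN/FRAME_END maintain a frame pointer around
 * the internal call.
 */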
SYM_FUNC_START(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(serpent_ecb_enc_16way)

SYM_FUNC_START(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(serpent_ecb_dec_16way)

SYM_FUNC_START(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(serpent_cbc_dec_16way)