linux/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section        .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
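
/*
 * Doubling constant for the XTS tweak in GF(2¹²⁸): 0x87 in the low
 * quadword encodes the reduction polynomial x^7 + x^2 + x + 1 (the low
 * bits of x^128 + x^7 + x^2 + x + 1), and the 1 in byte 8 carries the
 * bit shifted out of the low 64-bit half into the high half.  It is
 * consumed by load_xts_8way from glue_helper-asm-avx.S when deriving
 * consecutive tweaks.
 */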

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15
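
/*
 * Register roles: RA1..RE1 hold the four Serpent state words (plus one
 * scratch) for the first group of four blocks, RA2..RE2 the same for
 * the second group; tp is a shared temporary, RNOT is kept all-ones so
 * that NOT can be expressed as XOR with RNOT, and RK0..RK3 hold the
 * broadcast subkey words.  In the macros below, the extra-indented
 * instructions form the second group's stream, interleaved with the
 * first purely for instruction-level parallelism.  Each S-box is a
 * bitsliced boolean circuit over the four state words, split into
 * _1/_2 halves so that key loads can be scheduled between them (see SP
 * below).
 */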


#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

#define get_key(i, j, t) \
        vbroadcastss (4*(i)+(j))*4(CTX), t;
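
/*
 * The expanded key at CTX is laid out as 33 round keys of four 32-bit
 * words each, so subkey word j of round i lives at byte offset
 * (4*i + j) * 4.  vbroadcastss splats that word into all four lanes of
 * the destination, applying the same subkey to every block in a group.
 */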

#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;
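
/*
 * K2: Serpent key mixing for round i, XORing the four broadcast subkey
 * words into both block groups.  It is used standalone only for the
 * initial and final key mixing (rounds 0 and 32); within the rounds the
 * key mixing is folded into LK2/KL2 below.
 */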

#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;
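
/*
 * LK2 above is the Serpent linear transformation followed by the key
 * mixing for round i, applied to both block groups with the get_key
 * loads hoisted between the arithmetic:
 *
 *      x0 = rol32(x0, 13);   x2 = rol32(x2, 3);
 *      x1 ^= x0 ^ x2;        x3 ^= x2 ^ (x0 << 3);
 *      x1 = rol32(x1, 1);    x3 = rol32(x3, 7);
 *      x0 ^= x1 ^ x3;        x2 ^= x3 ^ (x1 << 7);
 *      x0 = rol32(x0, 5);    x2 = rol32(x2, 22);
 *      x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
 *
 * Each rotate is synthesized as a vpslld/vpsrld/vpor triple because AVX
 * has no packed-rotate instruction.
 */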

#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;
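
/*
 * KL2 is the exact inverse of LK2, for the decryption path: the round-i
 * subkeys are XORed out first, then the inverse linear transformation
 * is applied (the same operations as above, run backwards with the
 * rotate directions reversed).
 */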

#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
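
/*
 * S applies one S-box to both block groups back to back; SP
 * additionally interleaves the get_key loads for the subkeys consumed
 * by the KL2 that follows it between the S-box halves, hiding the
 * broadcast-load latency behind the arithmetic.
 */
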
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
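
/*
 * read_blocks/write_blocks convert between the memory layout (one whole
 * 128-bit block per xmm register) and the computation layout (register
 * n holding state word n of four different blocks): a 4x4 transpose of
 * 32-bit words in either direction.
 */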

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
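
        /*
         * 32 rounds: each line applies the next S-box (S0..S7, cycling)
         * to both block groups, then LK2 performs the linear transform
         * plus the following round's key mixing.  The rotation of
         * register names from line to line threads the S-box output
         * permutation through the macro arguments instead of moving
         * data.
         */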
                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
SYM_FUNC_END(__serpent_enc_blk8_avx)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
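
        /*
         * Decryption runs the rounds in reverse: inverse S-boxes
         * SI7..SI0 (cycling) with KL2 undoing each round's key mixing
         * and linear transform.  SP preloads the subkeys consumed by
         * the KL2 that follows it.
         */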
                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
SYM_FUNC_END(__serpent_dec_blk8_avx)
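
/*
 * The exported entry points below use the System V AMD64 convention
 * (ctx in %rdi, dst in %rsi, src in %rdx, iv in %rcx) and rely on the
 * load/store helpers from glue_helper-asm-avx.S for the per-mode data
 * movement.  FRAME_BEGIN/FRAME_END (asm/frame.h) set up a stack frame
 * around the inner call when frame pointers are configured.
 */
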
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk8_avx;

        store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)

SYM_FUNC_START(serpent_ecb_dec_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk8_avx;

        store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)

SYM_FUNC_START(serpent_cbc_dec_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk8_avx;
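
        /*
         * store_cbc_8way (glue_helper-asm-avx.S) does the CBC chaining
         * on the way out: each decrypted block except the first is
         * XORed with the preceding ciphertext block still in src; the
         * IV XOR for the first block is left to the C glue code.
         */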
        store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)

SYM_FUNC_START(serpent_ctr_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN
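
        /*
         * load_ctr_8way (glue_helper-asm-avx.S) expands the little-
         * endian counter at %rcx into eight consecutive big-endian
         * counter blocks, byte-swapping via .Lbswap128_mask, and stores
         * the incremented counter back for the next call.
         */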
        load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                      RD2, RK0, RK1, RK2);

        call __serpent_enc_blk8_avx;

        store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_ctr_8way_avx)

SYM_FUNC_START(serpent_xts_enc_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN
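
        /*
         * load_xts_8way (glue_helper-asm-avx.S) reads the initial tweak
         * from %rcx, derives the next seven by successive GF(2¹²⁸)
         * doublings with .Lxts_gf128mul_and_shl1_mask, XORs them into
         * the source blocks, and parks them in dst for the output
         * whitening below.
         */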
        /* regs <= src, dst <= IVs, regs <= regs xor IVs */
        load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
                      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

        call __serpent_enc_blk8_avx;

        /* dst <= regs xor IVs(in dst) */
        store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_xts_enc_8way_avx)

SYM_FUNC_START(serpent_xts_dec_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        /* regs <= src, dst <= IVs, regs <= regs xor IVs */
        load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
                      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

        call __serpent_dec_blk8_avx;

        /* dst <= regs xor IVs(in dst) */
        store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        FRAME_END
        ret;
SYM_FUNC_END(serpent_xts_dec_8way_avx)