linux/arch/x86/crypto/serpent-avx2-asm_64.S
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.data
.align 16

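/*
 * Constants consumed by the glue macros: .Lbswap128_mask byte-swaps a
 * 128-bit lane (the CTR IV argument is little endian, the counters on
 * the wire are big endian).  The two xts_gf128mul masks feed the tweak
 * computation in glue_helper-asm-avx2.S; as used there, the low
 * quadword carries the GF(2^128) reduction constant (0x87 for one
 * doubling, 0x10e = 0x87 << 1 for two) and the high quadword the
 * inter-quadword carry bit(s), so a single ymm register can step the
 * tweak by alpha and by alpha^2 for its two 128-bit lanes.
 */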
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

#define RNOT %ymm0
#define tp  %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15
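/*
 * The 16 blocks are handled as two sets of eight: registers suffixed 1
 * hold blocks 0-7 and registers suffixed 2 hold blocks 8-15, letting
 * the round macros interleave two independent dependency chains.
 * RK0x-RK3x alias the low 128 bits of the round-key registers for use
 * as xmm temporaries by the CTR/XTS glue macros.
 */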
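/*
 * Serpent S-boxes S0-S7 and inverses SI0-SI7 as sequences of bitwise
 * operations.  Each ymm register holds one 32-bit state word from each
 * of eight blocks, so the S-box is evaluated bit-sliced across the
 * four state words for all blocks at once.  Each S-box is split into
 * _1/_2 halves so S() and SP() below can alternate between the two
 * block sets.  RNOT is kept all-ones, making "vpxor RNOT, x, x" a
 * bitwise NOT.
 */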
#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;
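/*
 * The Serpent context begins with the expanded key: 33 round subkeys
 * of four 32-bit words each.  get_key(i, j, t) broadcasts word j of
 * subkey i into all eight dword lanes of t, so K2() below can XOR a
 * whole round key into both block sets with four vpxors per set.
 */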

#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

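/*
 * LK2 applies Serpent's linear transformation (rotations by 13, 3, 1,
 * 7, 5 and 22 mixed with XORs and shifts) to both block sets and then
 * XORs in round i's subkey.  The get_key() loads are interleaved with
 * the arithmetic to hide their latency.
 */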
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

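/*
 * KL2 inverts LK2: the subkey (already loaded by the preceding SP()
 * round) is XORed out first, then the linear transformation is run
 * backwards, rotating right where LK2 rotated left.
 */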
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

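/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit words within each
 * 128-bit lane.  Loaded blocks occupy four consecutive dwords each;
 * the transpose regroups them so every register holds the same state
 * word of eight blocks, the layout the S-box macros expect.  The
 * operation is its own inverse, so read_blocks and write_blocks share
 * the same macro.
 */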
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

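/*
 * Serpent encryption proper: 31 rounds of key mixing, S-box and linear
 * transformation, then a final S7 round capped with the 33rd subkey
 * via K2.  The rotated register arguments from one line to the next
 * track where each S-box leaves its outputs, avoiding any register
 * moves between rounds.
 */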
.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

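/*
 * Decryption applies the inverse S-boxes SI7..SI0 with the subkeys in
 * reverse order; SP()/KL2() mirror K2()/LK2() above.  Note the final
 * state lands in RC, RD, RB, RE rather than RA..RD, as the output
 * comment below records.
 */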
.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

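/*
 * Public 16-way entry points.  Each brackets the 16-block inner
 * routine with vzeroupper to avoid AVX/SSE transition penalties in
 * surrounding code, and uses the 16-way load/store macros from
 * glue_helper-asm-avx2.S for the bulk data movement.
 */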
ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        ret;
ENDPROC(serpent_cbc_dec_16way)

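/*
 * CTR mode: load_ctr_16way expands the little-endian 128-bit counter
 * at %rcx into 16 consecutive counter blocks (byte-swapped through
 * .Lbswap128_mask); after encryption, store_ctr_16way XORs the
 * keystream with the source blocks.
 */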
ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        ret;
ENDPROC(serpent_ctr_16way)

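/*
 * XTS mode: load_xts_16way reads the initial tweak from %rcx, derives
 * the following tweaks with the gf128mul masks defined above, and
 * XORs each source block with its tweak (the tweaks are parked at the
 * destination so store_xts_16way can apply them again to the cipher
 * output).
 */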
ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        ret;
ENDPROC(serpent_xts_dec_16way)