linux/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
 * USA
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section        .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
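/*
 * The mask above is handed to the XTS helpers in glue_helper-asm-avx.S to
 * step the 128-bit tweak, i.e. multiply it by α (x) in GF(2¹²⁸): roughly
 * t·α = (t << 1) ^ 0x87 when bit 127 of t was set, else just t << 1.  For
 * example t = 2¹²⁷ + 1 doubles to 2 and, since bit 127 was set, becomes
 * 2 ^ 0x87 = 0x85.  The 1 at byte offset 8 carries bit 63 into bit 64,
 * because the helper doubles the two 64-bit halves independently (vpaddq).
 */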

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

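/*
 * Register roles below: RA1..RD1 and RA2..RD2 carry two groups of four
 * blocks each (kept in bitsliced 32-bit-word order by read_blocks), RE1/RE2
 * and tp supply the fifth temporary needed by the S-box formulas, RNOT is
 * held all-ones so that "vpxor RNOT, x, x" acts as bitwise NOT, and
 * RK0..RK3 receive the four broadcast words of the current round key.
 *
 * S0..S7 / SI0..SI7 below are the bitsliced Serpent S-boxes (SI* being the
 * inverses used for decryption), each split into a _1/_2 half so that the
 * two block groups and the key loads can be interleaved.
 */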

#define S0_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x4; \
        vpxor           RNOT, x4, x4; \
        vpxor           x1,   tp, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpxor           x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x0, x0; \
        vpor            x0,   x4, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x1,   x2, x2; \
        vpxor           x2,   x3, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x4,   x2, x2; \
        vpxor           x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           RNOT, x3, x3; \
        vpand           tp,   x1, x4; \
        vpor            tp,   x0, x0; \
        vpxor           x2,   x3, x3; \
        vpxor           x3,   x0, x0; \
        vpxor           x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpor            x4,   x1, x1; \
        vpxor           x2,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x2, x2; \
        vpor            x0,   x1, x1; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x0, x0; \
        vpxor           x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, tp; \
        vpxor           x3,   tp, tp; \
        vpor            x0,   x3, x3; \
        vpxor           x1,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpand           tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
        vpxor           x2,   tp, tp; \
        vpand           x3,   x2, x2; \
        vpor            x1,   x3, x3; \
        vpxor           RNOT, tp, tp; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x0, x4; \
        vpxor           x2,   tp, x0; \
        vpor            x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
        vpxor           x3,   x1, tp; \
        vpor            x0,   x3, x3; \
        vpand           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpxor           tp,   x2, x2; \
        vpand           x3,   tp, x1; \
        vpxor           x3,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x1, x1; \
        vpand           x3,   x0, x0; \
        vpand           x4,   x3, x3; \
        vpxor           x2,   x3, x3; \
        vpor            x1,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x4, x4; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
        vpand           x0,   x3, tp; \
        vpxor           x3,   x0, x0; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x0, x0; \
        vpxor           tp,   x3, x4; \
        vpor            x0,   x2, x2; \
        vpxor           x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x1, x1; \
        vpxor           x4,   x1, x1; \
        vpand           x2,   x4, x4; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   x4, x4; \
        vpor            x1,   tp, x3; \
        vpxor           RNOT, x1, x1; \
        vpxor           x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
        vpor            x0,   x1, tp; \
        vpxor           tp,   x2, x2; \
        vpxor           RNOT, x3, x3; \
        vpxor           x0,   x1, x4; \
        vpxor           x2,   x0, x0; \
        vpand           x4,   tp, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
        vpand           x3,   x0, x0; \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x3, x3; \
        vpxor           x1,   x0, x0; \
        vpand           x4,   x2, x2; \
        vpxor           x2,   x1, x1; \
        vpand           x0,   x2, x2; \
        vpxor           x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, tp; \
        vpxor           x0,   x2, x2; \
        vpand           x3,   x0, x0; \
        vpor            x3,   tp, tp; \
        vpxor           RNOT, x1, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4; \
        vpand           x0,   x2, x2; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x3, x3; \
        vpxor           x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
        vpxor           RNOT, x1, tp; \
        vpxor           RNOT, x0, x0; \
        vpand           x2,   tp, x1; \
        vpxor           x3,   x1, x1; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x3; \
        vpor            x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
        vpand           x0,   x2, x2; \
        vpxor           x4,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpand           x0,   x3, x3; \
        vpxor           x1,   x4, x4; \
        vpxor           x4,   x2, x2; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x4, x4; \
        vpxor           x1,   x4, x4;

#define SI0_1(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x1, x1; \
        vpor            x1,   x3, tp; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpxor           tp,   x2, x2; \
        vpxor           x0,   tp, x3; \
        vpand           x1,   x0, x0; \
        vpxor           x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
        vpand           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpxor           x3,   x1, x1; \
        vpand           x0,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, tp; \
        vpxor           RNOT, x2, x2; \
        vpor            x1,   x0, x4; \
        vpxor           x3,   x4, x4; \
        vpand           x1,   x3, x3; \
        vpxor           x2,   x1, x1; \
        vpand           x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x4, x4; \
        vpor            x3,   x1, x1; \
        vpxor           tp,   x3, x3; \
        vpxor           tp,   x2, x2; \
        vpor            x4,   tp, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x0,   x1, x1; \
        vpxor           x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpxor           RNOT, x3, tp; \
        vpor            x2,   tp, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x0,   x3, x4; \
        vpxor           x1,   tp, x3; \
        vpor            x2,   x1, x1; \
        vpxor           x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
        vpxor           x4,   x1, x1; \
        vpor            x3,   x4, x4; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x4, x4; \
        vpand           x1,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpxor           x4,   x3, x3; \
        vpxor           x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
        vpxor           x1,   x2, x2; \
        vpand           x2,   x1, tp; \
        vpxor           x0,   tp, tp; \
        vpor            x1,   x0, x0; \
        vpxor           x3,   x1, x4; \
        vpxor           x3,   x0, x0; \
        vpor            tp,   x3, x3; \
        vpxor           x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x1, x1; \
        vpxor           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x3, x3; \
        vpxor           x0,   x1, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x4, x4; \
        vpxor           x0,   x3, x3; \
        vpxor           x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
        vpxor           x3,   x2, x2; \
        vpand           x1,   x0, tp; \
        vpxor           x2,   tp, tp; \
        vpor            x3,   x2, x2; \
        vpxor           RNOT, x0, x4; \
        vpxor           tp,   x1, x1; \
        vpxor           x2,   tp, x0; \
        vpand           x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
        vpxor           x0,   x2, x2; \
        vpor            x4,   x0, x0; \
        vpxor           x3,   x0, x0; \
        vpand           x2,   x3, x3; \
        vpxor           x3,   x4, x4; \
        vpxor           x1,   x3, x3; \
        vpand           x0,   x1, x1; \
        vpxor           x1,   x4, x4; \
        vpxor           x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
        vpor            x2,   x1, tp; \
        vpxor           x1,   x2, x2; \
        vpxor           x3,   tp, tp; \
        vpand           x1,   x3, x3; \
        vpxor           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           RNOT, x0, x0; \
        vpxor           x2,   x3, x3; \
        vpor            x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
        vpxor           tp,   x1, x4; \
        vpxor           x4,   x2, x2; \
        vpand           x0,   x4, x4; \
        vpxor           tp,   x0, x0; \
        vpxor           x3,   tp, x1; \
        vpand           x2,   x0, x0; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   x0, x0; \
        vpxor           x4,   x2, x2; \
        vpxor           x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
        vpxor           x2,   x0, x0; \
        vpand           x3,   x0, tp; \
        vpxor           x3,   x2, x2; \
        vpxor           x2,   tp, tp; \
        vpxor           x1,   x3, x3; \
        vpor            x0,   x2, x2; \
        vpxor           x3,   x2, x2; \
        vpand           tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
        vpxor           RNOT, tp, tp; \
        vpxor           x1,   x3, x3; \
        vpand           x2,   x1, x1; \
        vpxor           tp,   x0, x4; \
        vpxor           x4,   x3, x3; \
        vpxor           x2,   x4, x4; \
        vpxor           x1,   tp, x0; \
        vpxor           x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
        vpand           x0,   x3, tp; \
        vpxor           x2,   x0, x0; \
        vpor            x3,   x2, x2; \
        vpxor           x1,   x3, x4; \
        vpxor           RNOT, x0, x0; \
        vpor            tp,   x1, x1; \
        vpxor           x0,   x4, x4; \
        vpand           x2,   x0, x0; \
        vpxor           x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
        vpand           x2,   x1, x1; \
        vpxor           x2,   tp, x3; \
        vpxor           x3,   x4, x4; \
        vpand           x3,   x2, x2; \
        vpor            x0,   x3, x3; \
        vpxor           x4,   x1, x1; \
        vpxor           x4,   x3, x3; \
        vpand           x0,   x4, x4; \
        vpxor           x2,   x4, x4;

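/*
 * get_key: broadcast one 32-bit word of the expanded key into all four
 * lanes of t.  The key behind CTX is an array of 32-bit words, four per
 * round, so word j of subkey i lives at byte offset (4*i + j)*4; e.g.
 * get_key(2, 3, RK3) loads from 44(CTX) since (4*2 + 3)*4 = 44.
 */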
#define get_key(i, j, t) \
        vbroadcastss (4*(i)+(j))*4(CTX), t;

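/*
 * K2: plain key mixing - XOR the four words of subkey i into both block
 * groups.  Used on its own for the initial and final (i = 32) key
 * applications, where no linear transformation is involved.
 */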
#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0,      x0 ## 1, x0 ## 1; \
        vpxor RK1,      x1 ## 1, x1 ## 1; \
        vpxor RK2,      x2 ## 1, x2 ## 1; \
        vpxor RK3,      x3 ## 1, x3 ## 1; \
                vpxor RK0,      x0 ## 2, x0 ## 2; \
                vpxor RK1,      x1 ## 2, x1 ## 2; \
                vpxor RK2,      x2 ## 2, x2 ## 2; \
                vpxor RK3,      x3 ## 2, x3 ## 2;

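/*
 * LK2: Serpent's linear transformation followed by mixing of subkey i,
 * applied to both block groups.  The rotations (by 13, 3, 1, 7, 5 and 22
 * bits) are built from vpslld/vpsrld/vpor triples because AVX has no
 * vector rotate; the get_key() calls are interleaved with the arithmetic
 * so the vbroadcastss loads can overlap with it.
 */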
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13,             x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13,             x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1,              x1 ## 1, x4 ## 1;          \
        vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1,              x1 ## 2, x4 ## 2;          \
                vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7,              x3 ## 1, x4 ## 1;          \
        vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7,              x3 ## 2, x4 ## 2;          \
                vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpslld $5,              x0 ## 1, x4 ## 1;          \
        vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22,             x2 ## 1, x4 ## 1;          \
        vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpslld $5,              x0 ## 2, x4 ## 2;          \
                vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22,             x2 ## 2, x4 ## 2;          \
                vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;

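/*
 * KL2: the inverse of LK2 - XOR subkey i first, then undo the linear
 * transformation (the same rotation amounts applied in the opposite
 * direction and reverse order).  Used on the decryption path.
 */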
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor                   RK0, x0 ## 1, x0 ## 1;     \
        vpxor                   RK2, x2 ## 1, x2 ## 1;     \
        vpsrld $5,              x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   RK3, x3 ## 1, x3 ## 1;     \
        vpxor                   RK1, x1 ## 1, x1 ## 1;     \
        vpsrld $22,             x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor                   RK0, x0 ## 2, x0 ## 2;     \
                vpxor                   RK2, x2 ## 2, x2 ## 2;     \
                vpsrld $5,              x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   RK3, x3 ## 2, x3 ## 2;     \
                vpxor                   RK1, x1 ## 2, x1 ## 2;     \
                vpsrld $22,             x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7,              x1 ## 1, x4 ## 1;          \
        vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1,              x1 ## 1, x4 ## 1;          \
        vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
        vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7,              x1 ## 2, x4 ## 2;          \
                vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1,              x1 ## 2, x4 ## 2;          \
                vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
                vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7,              x3 ## 1, x4 ## 1;          \
        vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
        vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3,              x0 ## 1, x4 ## 1;          \
        vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7,              x3 ## 2, x4 ## 2;          \
                vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
                vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3,              x0 ## 2, x4 ## 2;          \
                vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13,             x0 ## 1, x4 ## 1;          \
        vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
        vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3,              x2 ## 1, x4 ## 1;          \
        vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
        vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13,             x0 ## 2, x4 ## 2;          \
                vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
                vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3,              x2 ## 2, x4 ## 2;          \
                vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
                vpor                    x4 ## 2, x2 ## 2, x2 ## 2;

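/* S: apply one S-box (both halves) to both four-block groups. */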
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

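/*
 * SP: like S, but interleave the get_key() broadcasts for subkey i between
 * the S-box halves to hide the key-load latency; the decryption rounds use
 * it because KL2 consumes that subkey immediately afterwards.
 */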
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

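/*
 * transpose_4x4: treat four xmm registers as a 4x4 matrix of 32-bit words
 * and transpose it, converting between the "one block per register" layout
 * in memory and the "same word of four blocks per register" layout that the
 * bitsliced S-boxes operate on.
 */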
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t2; \
        vpunpckldq              x3, x2, t1; \
        vpunpckhdq              x3, x2, x3; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1; \
        vpunpcklqdq             x3, t2, x2; \
        vpunpckhqdq             x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk8_avx:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
         */

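        /* RNOT := all ones; vpxor with RNOT is used as bitwise NOT */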
        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

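        /*
         * 32 rounds: each line applies S-box (round mod 8) followed by LK2
         * with the next subkey.  The shifting register order in the argument
         * lists is the renaming that keeps each round's outputs in place for
         * the next one.
         */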
                                                 K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk8_avx)

.align 8
__serpent_dec_blk8_avx:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
         */

        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

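        /*
         * Decryption runs the schedule backwards: subkey 32 first, then the
         * inverse S-boxes SI7..SI0 with KL2 (key mixing followed by the
         * inverse linear transformation), finishing with subkey 0.
         */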
                                                 K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk8_avx)

ENTRY(serpent_ecb_enc_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk8_avx;

        store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk8_avx;

        store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN

        load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk8_avx;

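        /*
         * CBC chaining: store_cbc_8way() XORs the decrypted blocks with the
         * preceding ciphertext blocks it reads back from the source buffer;
         * the first block of the batch is expected to be chained with the IV
         * by the C glue code.
         */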
        store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

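        /*
         * load_ctr_8way() (in glue_helper-asm-avx.S) expands the little-endian
         * counter at %rcx into eight consecutive counter blocks, using
         * .Lbswap128_mask to put them into block byte order, and leaves the
         * incremented counter behind for the next call.
         */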
        load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                      RD2, RK0, RK1, RK2);

        call __serpent_enc_blk8_avx;

        store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        FRAME_END
        ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        /* regs <= src, dst <= IVs, regs <= regs xor IVs */
        load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
                      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
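        /*
         * load_xts_8way() also derives the next seven tweaks from the one at
         * %rcx by repeated multiplication by α (via the gf128mul mask above),
         * parks all eight in the destination buffer and XORs them into the
         * loaded source blocks; store_xts_8way() applies them again below.
         */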
 766
 767        call __serpent_enc_blk8_avx;
 768
 769        /* dst <= regs xor IVs(in dst) */
 770        store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 771
 772        FRAME_END
 773        ret;
 774ENDPROC(serpent_xts_enc_8way_avx)
 775
 776ENTRY(serpent_xts_dec_8way_avx)
 777        /* input:
 778         *      %rdi: ctx, CTX
 779         *      %rsi: dst
 780         *      %rdx: src
 781         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 782         */
 783        FRAME_BEGIN
 784
 785        /* regs <= src, dst <= IVs, regs <= regs xor IVs */
 786        load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
 787                      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
 788
 789        call __serpent_dec_blk8_avx;
 790
 791        /* dst <= regs xor IVs(in dst) */
 792        store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
 793
 794        FRAME_END
 795        ret;
 796ENDPROC(serpent_xts_dec_8way_avx)
 797