linux/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
   4 *
   5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6 *
   7 * Based on crypto/serpent.c by
   8 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
   9 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
  10 */
  11
  12#include <linux/linkage.h>
  13
  14.file "serpent-sse2-x86_64-asm_64.S"
  15.text
  16
  /* Cipher context pointer; both entry points below receive it in %rdi. */
  17#define CTX %rdi
  18
  19/**********************************************************************
  20  8-way SSE2 serpent
  21 **********************************************************************/
  /*
   * State registers, set 1.  The round macros are given the un-suffixed
   * names (RA, RB, ...) and token-paste "1" or "2" to reach each set.
   * Five names per set: four hold state, the fifth is the scratch register.
   */
  22#define RA1 %xmm0
  23#define RB1 %xmm1
  24#define RC1 %xmm2
  25#define RD1 %xmm3
  26#define RE1 %xmm4
  27
  /* State registers, set 2 (same roles as set 1). */
  28#define RA2 %xmm5
  29#define RB2 %xmm6
  30#define RC2 %xmm7
  31#define RD2 %xmm8
  32#define RE2 %xmm9
  33
  /*
   * All-ones constant: both entry points initialise it with
   * "pcmpeqd RNOT, RNOT", so "pxor RNOT, x" computes the bitwise NOT of x.
   */
  34#define RNOT %xmm10
  35
  /*
   * Key words as filled in by get_key(): one 32-bit word replicated into all
   * four dword lanes.  Also passed as temporaries to the block load/store
   * macros by the entry points.
   */
  36#define RK0 %xmm11
  37#define RK1 %xmm12
  38#define RK2 %xmm13
  39#define RK3 %xmm14
  40
  /*
   * S0: S-box used by the encryption routine, split into two macro halves
   * (S0_1, S0_2) that S() runs back to back.  Works in place on x0..x3 with
   * SSE2 logic ops only; x4 is scratch and is loaded by the opening movdqa
   * before anything reads it.  "pxor RNOT, x" is a bitwise NOT (RNOT is
   * all ones).  Call sites in this file read the result from
   * (x2, x1, x3, x0) and reuse x4 as the next scratch register.
   */
  41#define S0_1(x0, x1, x2, x3, x4) \
  42        movdqa x3,              x4; \
  43        por x0,                 x3; \
  44        pxor x4,                x0; \
  45        pxor x2,                x4; \
  46        pxor RNOT,              x4; \
  47        pxor x1,                x3; \
  48        pand x0,                x1; \
  49        pxor x4,                x1; \
  50        pxor x0,                x2;
  51#define S0_2(x0, x1, x2, x3, x4) \
  52        pxor x3,                x0; \
  53        por x0,                 x4; \
  54        pxor x2,                x0; \
  55        pand x1,                x2; \
  56        pxor x2,                x3; \
  57        pxor RNOT,              x1; \
  58        pxor x4,                x2; \
  59        pxor x2,                x1;
  60
  /*
   * S1: S-box used by the encryption routine, in two halves (S1_1, S1_2)
   * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
   * loaded by the opening movdqa.  "pxor RNOT, x" is a bitwise NOT.
   * Call sites in this file read the result from (x4, x2, x3, x0) and
   * reuse x1 as the next scratch register.
   */
  61#define S1_1(x0, x1, x2, x3, x4) \
  62        movdqa x1,              x4; \
  63        pxor x0,                x1; \
  64        pxor x3,                x0; \
  65        pxor RNOT,              x3; \
  66        pand x1,                x4; \
  67        por x1,                 x0; \
  68        pxor x2,                x3; \
  69        pxor x3,                x0; \
  70        pxor x3,                x1;
  71#define S1_2(x0, x1, x2, x3, x4) \
  72        pxor x4,                x3; \
  73        por x4,                 x1; \
  74        pxor x2,                x4; \
  75        pand x0,                x2; \
  76        pxor x1,                x2; \
  77        por x0,                 x1; \
  78        pxor RNOT,              x0; \
  79        pxor x2,                x0; \
  80        pxor x1,                x4;
  81
  /*
   * S2: S-box used by the encryption routine, in two halves (S2_1, S2_2)
   * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
   * written by movdqa before it is first read.  "pxor RNOT, x" is a
   * bitwise NOT.  Call sites in this file read the result from
   * (x4, x1, x0, x3) and reuse x2 as the next scratch register.
   */
  82#define S2_1(x0, x1, x2, x3, x4) \
  83        pxor RNOT,              x3; \
  84        pxor x0,                x1; \
  85        movdqa x0,              x4; \
  86        pand x2,                x0; \
  87        pxor x3,                x0; \
  88        por x4,                 x3; \
  89        pxor x1,                x2; \
  90        pxor x1,                x3; \
  91        pand x0,                x1;
  92#define S2_2(x0, x1, x2, x3, x4) \
  93        pxor x2,                x0; \
  94        pand x3,                x2; \
  95        por x1,                 x3; \
  96        pxor RNOT,              x0; \
  97        pxor x0,                x3; \
  98        pxor x0,                x4; \
  99        pxor x2,                x0; \
 100        por x2,                 x1;
 101
 /*
  * S3: S-box used by the encryption routine, in two halves (S3_1, S3_2)
  * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
  * loaded by the opening movdqa.  Call sites in this file read the result
  * from (x3, x4, x1, x0) and reuse x2 as the next scratch register.
  */
 102#define S3_1(x0, x1, x2, x3, x4) \
 103        movdqa x1,              x4; \
 104        pxor x3,                x1; \
 105        por x0,                 x3; \
 106        pand x0,                x4; \
 107        pxor x2,                x0; \
 108        pxor x1,                x2; \
 109        pand x3,                x1; \
 110        pxor x3,                x2; \
 111        por x4,                 x0; \
 112        pxor x3,                x4;
 113#define S3_2(x0, x1, x2, x3, x4) \
 114        pxor x0,                x1; \
 115        pand x3,                x0; \
 116        pand x4,                x3; \
 117        pxor x2,                x3; \
 118        por x1,                 x4; \
 119        pand x1,                x2; \
 120        pxor x3,                x4; \
 121        pxor x3,                x0; \
 122        pxor x2,                x3;
 123
 /*
  * S4: S-box used by the encryption routine, in two halves (S4_1, S4_2)
  * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
  * loaded by the opening movdqa.  "pxor RNOT, x" is a bitwise NOT.
  * Call sites in this file read the result from (x1, x2, x3, x4) and
  * reuse x0 as the next scratch register.
  */
 124#define S4_1(x0, x1, x2, x3, x4) \
 125        movdqa x3,              x4; \
 126        pand x0,                x3; \
 127        pxor x4,                x0; \
 128        pxor x2,                x3; \
 129        por x4,                 x2; \
 130        pxor x1,                x0; \
 131        pxor x3,                x4; \
 132        por x0,                 x2; \
 133        pxor x1,                x2;
 134#define S4_2(x0, x1, x2, x3, x4) \
 135        pand x0,                x1; \
 136        pxor x4,                x1; \
 137        pand x2,                x4; \
 138        pxor x3,                x2; \
 139        pxor x0,                x4; \
 140        por x1,                 x3; \
 141        pxor RNOT,              x1; \
 142        pxor x0,                x3;
 143
 /*
  * S5: S-box used by the encryption routine, in two halves (S5_1, S5_2)
  * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
  * loaded by the opening movdqa.  "pxor RNOT, x" is a bitwise NOT.
  * Call sites in this file read the result from (x4, x0, x1, x3) and
  * reuse x2 as the next scratch register.
  */
 144#define S5_1(x0, x1, x2, x3, x4) \
 145        movdqa x1,              x4; \
 146        por x0,                 x1; \
 147        pxor x1,                x2; \
 148        pxor RNOT,              x3; \
 149        pxor x0,                x4; \
 150        pxor x2,                x0; \
 151        pand x4,                x1; \
 152        por x3,                 x4; \
 153        pxor x0,                x4;
 154#define S5_2(x0, x1, x2, x3, x4) \
 155        pand x3,                x0; \
 156        pxor x3,                x1; \
 157        pxor x2,                x3; \
 158        pxor x1,                x0; \
 159        pand x4,                x2; \
 160        pxor x2,                x1; \
 161        pand x0,                x2; \
 162        pxor x2,                x3;
 163
 /*
  * S6: S-box used by the encryption routine, in two halves (S6_1, S6_2)
  * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
  * loaded by the opening movdqa.  "pxor RNOT, x" is a bitwise NOT.
  * Call sites in this file read the result from (x2, x4, x1, x3) and
  * reuse x0 as the next scratch register.
  */
 164#define S6_1(x0, x1, x2, x3, x4) \
 165        movdqa x1,              x4; \
 166        pxor x0,                x3; \
 167        pxor x2,                x1; \
 168        pxor x0,                x2; \
 169        pand x3,                x0; \
 170        por x3,                 x1; \
 171        pxor RNOT,              x4; \
 172        pxor x1,                x0; \
 173        pxor x2,                x1;
 174#define S6_2(x0, x1, x2, x3, x4) \
 175        pxor x4,                x3; \
 176        pxor x0,                x4; \
 177        pand x0,                x2; \
 178        pxor x1,                x4; \
 179        pxor x3,                x2; \
 180        pand x1,                x3; \
 181        pxor x0,                x3; \
 182        pxor x2,                x1;
 183
 /*
  * S7: S-box used by the encryption routine, in two halves (S7_1, S7_2)
  * that S() runs back to back.  In place on x0..x3; x4 is scratch and is
  * written by movdqa before it is first read.  "pxor RNOT, x" is a
  * bitwise NOT.  Call sites in this file read the result from
  * (x4, x2, x3, x0) and reuse x1 as the next scratch register.
  */
 184#define S7_1(x0, x1, x2, x3, x4) \
 185        pxor RNOT,              x1; \
 186        movdqa x1,              x4; \
 187        pxor RNOT,              x0; \
 188        pand x2,                x1; \
 189        pxor x3,                x1; \
 190        por x4,                 x3; \
 191        pxor x2,                x4; \
 192        pxor x3,                x2; \
 193        pxor x0,                x3; \
 194        por x1,                 x0;
 195#define S7_2(x0, x1, x2, x3, x4) \
 196        pand x0,                x2; \
 197        pxor x4,                x0; \
 198        pxor x3,                x4; \
 199        pand x0,                x3; \
 200        pxor x1,                x4; \
 201        pxor x4,                x2; \
 202        pxor x1,                x3; \
 203        por x0,                 x4; \
 204        pxor x1,                x4;
 205
 /*
  * SI0: S-box used by the decryption routine (presumably the inverse of
  * S0 -- confirm against crypto/serpent.c).  Two halves (SI0_1, SI0_2):
  * SP() slots get_key() loads between them, S() runs them back to back.
  * In place on x0..x3; x4 is scratch and is loaded by the opening movdqa.
  * "pxor RNOT, x" is a bitwise NOT.  Call sites in this file read the
  * result from (x2, x4, x1, x0); x3 becomes the next scratch register.
  */
 206#define SI0_1(x0, x1, x2, x3, x4) \
 207        movdqa x3,              x4; \
 208        pxor x0,                x1; \
 209        por x1,                 x3; \
 210        pxor x1,                x4; \
 211        pxor RNOT,              x0; \
 212        pxor x3,                x2; \
 213        pxor x0,                x3; \
 214        pand x1,                x0; \
 215        pxor x2,                x0;
 216#define SI0_2(x0, x1, x2, x3, x4) \
 217        pand x3,                x2; \
 218        pxor x4,                x3; \
 219        pxor x3,                x2; \
 220        pxor x3,                x1; \
 221        pand x0,                x3; \
 222        pxor x0,                x1; \
 223        pxor x2,                x0; \
 224        pxor x3,                x4;
 225
 /*
  * SI1: S-box used by the decryption routine (presumably the inverse of
  * S1 -- confirm against crypto/serpent.c).  Two halves (SI1_1, SI1_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is written by movdqa before it
  * is first read.  "pxor RNOT, x" is a bitwise NOT.  Call sites in this
  * file read the result from (x4, x1, x2, x3); x0 becomes the next scratch.
  */
 226#define SI1_1(x0, x1, x2, x3, x4) \
 227        pxor x3,                x1; \
 228        movdqa x0,              x4; \
 229        pxor x2,                x0; \
 230        pxor RNOT,              x2; \
 231        por x1,                 x4; \
 232        pxor x3,                x4; \
 233        pand x1,                x3; \
 234        pxor x2,                x1; \
 235        pand x4,                x2;
 236#define SI1_2(x0, x1, x2, x3, x4) \
 237        pxor x1,                x4; \
 238        por x3,                 x1; \
 239        pxor x0,                x3; \
 240        pxor x0,                x2; \
 241        por x4,                 x0; \
 242        pxor x4,                x2; \
 243        pxor x0,                x1; \
 244        pxor x1,                x4;
 245
 /*
  * SI2: S-box used by the decryption routine (presumably the inverse of
  * S2 -- confirm against crypto/serpent.c).  Two halves (SI2_1, SI2_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is written by movdqa before it
  * is first read.  "pxor RNOT, x" is a bitwise NOT.  Call sites in this
  * file read the result from (x1, x4, x3, x2); x0 becomes the next scratch.
  */
 246#define SI2_1(x0, x1, x2, x3, x4) \
 247        pxor x1,                x2; \
 248        movdqa x3,              x4; \
 249        pxor RNOT,              x3; \
 250        por x2,                 x3; \
 251        pxor x4,                x2; \
 252        pxor x0,                x4; \
 253        pxor x1,                x3; \
 254        por x2,                 x1; \
 255        pxor x0,                x2;
 256#define SI2_2(x0, x1, x2, x3, x4) \
 257        pxor x4,                x1; \
 258        por x3,                 x4; \
 259        pxor x3,                x2; \
 260        pxor x2,                x4; \
 261        pand x1,                x2; \
 262        pxor x3,                x2; \
 263        pxor x4,                x3; \
 264        pxor x0,                x4;
 265
 /*
  * SI3: S-box used by the decryption routine (presumably the inverse of
  * S3 -- confirm against crypto/serpent.c).  Two halves (SI3_1, SI3_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is written by movdqa before it
  * is first read.  Call sites in this file read the result from
  * (x2, x0, x4, x3); x1 becomes the next scratch register.
  */
 266#define SI3_1(x0, x1, x2, x3, x4) \
 267        pxor x1,                x2; \
 268        movdqa x1,              x4; \
 269        pand x2,                x1; \
 270        pxor x0,                x1; \
 271        por x4,                 x0; \
 272        pxor x3,                x4; \
 273        pxor x3,                x0; \
 274        por x1,                 x3; \
 275        pxor x2,                x1;
 276#define SI3_2(x0, x1, x2, x3, x4) \
 277        pxor x3,                x1; \
 278        pxor x2,                x0; \
 279        pxor x3,                x2; \
 280        pand x1,                x3; \
 281        pxor x0,                x1; \
 282        pand x2,                x0; \
 283        pxor x3,                x4; \
 284        pxor x0,                x3; \
 285        pxor x1,                x0;
 286
 /*
  * SI4: S-box used by the decryption routine (presumably the inverse of
  * S4 -- confirm against crypto/serpent.c).  Two halves (SI4_1, SI4_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is written by movdqa before it
  * is first read.  "pxor RNOT, x" is a bitwise NOT.  Call sites in this
  * file read the result from (x0, x2, x4, x3); x1 becomes the next scratch.
  */
 287#define SI4_1(x0, x1, x2, x3, x4) \
 288        pxor x3,                x2; \
 289        movdqa x0,              x4; \
 290        pand x1,                x0; \
 291        pxor x2,                x0; \
 292        por x3,                 x2; \
 293        pxor RNOT,              x4; \
 294        pxor x0,                x1; \
 295        pxor x2,                x0; \
 296        pand x4,                x2;
 297#define SI4_2(x0, x1, x2, x3, x4) \
 298        pxor x0,                x2; \
 299        por x4,                 x0; \
 300        pxor x3,                x0; \
 301        pand x2,                x3; \
 302        pxor x3,                x4; \
 303        pxor x1,                x3; \
 304        pand x0,                x1; \
 305        pxor x1,                x4; \
 306        pxor x3,                x0;
 307
 /*
  * SI5: S-box used by the decryption routine (presumably the inverse of
  * S5 -- confirm against crypto/serpent.c).  Two halves (SI5_1, SI5_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is loaded by the opening movdqa.
  * "pxor RNOT, x" is a bitwise NOT.  Call sites in this file read the
  * result from (x1, x4, x0, x2); x3 becomes the next scratch register.
  */
 308#define SI5_1(x0, x1, x2, x3, x4) \
 309        movdqa x1,              x4; \
 310        por x2,                 x1; \
 311        pxor x4,                x2; \
 312        pxor x3,                x1; \
 313        pand x4,                x3; \
 314        pxor x3,                x2; \
 315        por x0,                 x3; \
 316        pxor RNOT,              x0; \
 317        pxor x2,                x3; \
 318        por x0,                 x2;
 319#define SI5_2(x0, x1, x2, x3, x4) \
 320        pxor x1,                x4; \
 321        pxor x4,                x2; \
 322        pand x0,                x4; \
 323        pxor x1,                x0; \
 324        pxor x3,                x1; \
 325        pand x2,                x0; \
 326        pxor x3,                x2; \
 327        pxor x2,                x0; \
 328        pxor x4,                x2; \
 329        pxor x3,                x4;
 330
 /*
  * SI6: S-box used by the decryption routine (presumably the inverse of
  * S6 -- confirm against crypto/serpent.c).  Two halves (SI6_1, SI6_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is written by movdqa before it
  * is first read.  "pxor RNOT, x" is a bitwise NOT.  Call sites in this
  * file read the result from (x2, x4, x3, x0); x1 becomes the next scratch.
  */
 331#define SI6_1(x0, x1, x2, x3, x4) \
 332        pxor x2,                x0; \
 333        movdqa x0,              x4; \
 334        pand x3,                x0; \
 335        pxor x3,                x2; \
 336        pxor x2,                x0; \
 337        pxor x1,                x3; \
 338        por x4,                 x2; \
 339        pxor x3,                x2; \
 340        pand x0,                x3;
 341#define SI6_2(x0, x1, x2, x3, x4) \
 342        pxor RNOT,              x0; \
 343        pxor x1,                x3; \
 344        pand x2,                x1; \
 345        pxor x0,                x4; \
 346        pxor x4,                x3; \
 347        pxor x2,                x4; \
 348        pxor x1,                x0; \
 349        pxor x0,                x2;
 350
 /*
  * SI7: S-box used by the decryption routine (presumably the inverse of
  * S7 -- confirm against crypto/serpent.c).  Two halves (SI7_1, SI7_2)
  * with get_key() loads slotted between them by SP().
  * In place on x0..x3; x4 is scratch and is loaded by the opening movdqa.
  * "pxor RNOT, x" is a bitwise NOT.  Call sites in this file read the
  * result from (x1, x3, x0, x4); x2 becomes the next scratch register.
  */
 351#define SI7_1(x0, x1, x2, x3, x4) \
 352        movdqa x3,              x4; \
 353        pand x0,                x3; \
 354        pxor x2,                x0; \
 355        por x4,                 x2; \
 356        pxor x1,                x4; \
 357        pxor RNOT,              x0; \
 358        por x3,                 x1; \
 359        pxor x0,                x4; \
 360        pand x2,                x0; \
 361        pxor x1,                x0;
 362#define SI7_2(x0, x1, x2, x3, x4) \
 363        pand x2,                x1; \
 364        pxor x2,                x3; \
 365        pxor x3,                x4; \
 366        pand x3,                x2; \
 367        por x0,                 x3; \
 368        pxor x4,                x1; \
 369        pxor x4,                x3; \
 370        pand x0,                x4; \
 371        pxor x2,                x4;
 372
 /*
  * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
  * CTX into xmm register t, then replicate it into all four dword lanes
  * with pshufd $0.
  * Presumably this is word j of round key i, with the expanded key stored
  * at the start of the context -- confirm against the C-side ctx layout.
  */
 373#define get_key(i, j, t) \
 374        movd (4*(i)+(j))*4(CTX), t; \
 375        pshufd $0, t, t;
 376
 /*
  * K2(x0, x1, x2, x3, x4, i): plain key addition for both register sets.
  * Loads key words 0..3 of index i into RK0..RK3 with get_key(), then XORs
  * RKn into xn for set 1 (names suffixed "1") and set 2 (suffixed "2").
  * x4 is accepted for signature symmetry with LK2/KL2 but is not touched.
  */
 377#define K2(x0, x1, x2, x3, x4, i) \
 378        get_key(i, 0, RK0); \
 379        get_key(i, 1, RK1); \
 380        get_key(i, 2, RK2); \
 381        get_key(i, 3, RK3); \
 382        pxor RK0,               x0 ## 1; \
 383        pxor RK1,               x1 ## 1; \
 384        pxor RK2,               x2 ## 1; \
 385        pxor RK3,               x3 ## 1; \
 386                pxor RK0,               x0 ## 2; \
 387                pxor RK1,               x1 ## 2; \
 388                pxor RK2,               x2 ## 2; \
 389                pxor RK3,               x3 ## 2;
 390
 /*
  * LK2(x0, x1, x2, x3, x4, i): linear mixing followed by key addition with
  * key index i, applied to both register sets.  Set 1 uses the less
  * indented lines, set 2 the more indented ones; the two are interleaved.
  *
  * SSE2 has no packed rotate, so each rol (32-bit rotate left) is built as
  * copy-to-x4 / pslld / psrld / por.  Per 32-bit lane the net effect is:
  *
  *   x0 = rol(x0, 13);   x2 = rol(x2, 3);
  *   x1 ^= x0 ^ x2;      x3 ^= x2 ^ (x0 << 3);
  *   x1 = rol(x1, 1);    x3 = rol(x3, 7);
  *   x0 ^= x1 ^ x3;      x2 ^= x3 ^ (x1 << 7);
  *   x0 = rol(x0, 5);    x2 = rol(x2, 22);
  *   x0 ^= RK0;  x1 ^= RK1;  x2 ^= RK2;  x3 ^= RK3;
  *
  * The four get_key(i, ...) loads are spread through the body instead of
  * being grouped at the top.  x4 of each set is clobbered.
  */
 391#define LK2(x0, x1, x2, x3, x4, i) \
 392        movdqa x0 ## 1,         x4 ## 1; \
 393        pslld $13,              x0 ## 1; \
 394        psrld $(32 - 13),       x4 ## 1; \
 395        por x4 ## 1,            x0 ## 1; \
 396        pxor x0 ## 1,           x1 ## 1; \
 397        movdqa x2 ## 1,         x4 ## 1; \
 398        pslld $3,               x2 ## 1; \
 399        psrld $(32 - 3),        x4 ## 1; \
 400        por x4 ## 1,            x2 ## 1; \
 401        pxor x2 ## 1,           x1 ## 1; \
 402                movdqa x0 ## 2,         x4 ## 2; \
 403                pslld $13,              x0 ## 2; \
 404                psrld $(32 - 13),       x4 ## 2; \
 405                por x4 ## 2,            x0 ## 2; \
 406                pxor x0 ## 2,           x1 ## 2; \
 407                movdqa x2 ## 2,         x4 ## 2; \
 408                pslld $3,               x2 ## 2; \
 409                psrld $(32 - 3),        x4 ## 2; \
 410                por x4 ## 2,            x2 ## 2; \
 411                pxor x2 ## 2,           x1 ## 2; \
 412        movdqa x1 ## 1,         x4 ## 1; \
 413        pslld $1,               x1 ## 1; \
 414        psrld $(32 - 1),        x4 ## 1; \
 415        por x4 ## 1,            x1 ## 1; \
 416        movdqa x0 ## 1,         x4 ## 1; \
 417        pslld $3,               x4 ## 1; \
 418        pxor x2 ## 1,           x3 ## 1; \
 419        pxor x4 ## 1,           x3 ## 1; \
 420        movdqa x3 ## 1,         x4 ## 1; \
 421        get_key(i, 1, RK1); \
 422                movdqa x1 ## 2,         x4 ## 2; \
 423                pslld $1,               x1 ## 2; \
 424                psrld $(32 - 1),        x4 ## 2; \
 425                por x4 ## 2,            x1 ## 2; \
 426                movdqa x0 ## 2,         x4 ## 2; \
 427                pslld $3,               x4 ## 2; \
 428                pxor x2 ## 2,           x3 ## 2; \
 429                pxor x4 ## 2,           x3 ## 2; \
 430                movdqa x3 ## 2,         x4 ## 2; \
 431                get_key(i, 3, RK3); \
 432        pslld $7,               x3 ## 1; \
 433        psrld $(32 - 7),        x4 ## 1; \
 434        por x4 ## 1,            x3 ## 1; \
 435        movdqa x1 ## 1,         x4 ## 1; \
 436        pslld $7,               x4 ## 1; \
 437        pxor x1 ## 1,           x0 ## 1; \
 438        pxor x3 ## 1,           x0 ## 1; \
 439        pxor x3 ## 1,           x2 ## 1; \
 440        pxor x4 ## 1,           x2 ## 1; \
 441        get_key(i, 0, RK0); \
 442                pslld $7,               x3 ## 2; \
 443                psrld $(32 - 7),        x4 ## 2; \
 444                por x4 ## 2,            x3 ## 2; \
 445                movdqa x1 ## 2,         x4 ## 2; \
 446                pslld $7,               x4 ## 2; \
 447                pxor x1 ## 2,           x0 ## 2; \
 448                pxor x3 ## 2,           x0 ## 2; \
 449                pxor x3 ## 2,           x2 ## 2; \
 450                pxor x4 ## 2,           x2 ## 2; \
 451                get_key(i, 2, RK2); \
 452        pxor RK1,               x1 ## 1; \
 453        pxor RK3,               x3 ## 1; \
 454        movdqa x0 ## 1,         x4 ## 1; \
 455        pslld $5,               x0 ## 1; \
 456        psrld $(32 - 5),        x4 ## 1; \
 457        por x4 ## 1,            x0 ## 1; \
 458        movdqa x2 ## 1,         x4 ## 1; \
 459        pslld $22,              x2 ## 1; \
 460        psrld $(32 - 22),       x4 ## 1; \
 461        por x4 ## 1,            x2 ## 1; \
 462        pxor RK0,               x0 ## 1; \
 463        pxor RK2,               x2 ## 1; \
 464                pxor RK1,               x1 ## 2; \
 465                pxor RK3,               x3 ## 2; \
 466                movdqa x0 ## 2,         x4 ## 2; \
 467                pslld $5,               x0 ## 2; \
 468                psrld $(32 - 5),        x4 ## 2; \
 469                por x4 ## 2,            x0 ## 2; \
 470                movdqa x2 ## 2,         x4 ## 2; \
 471                pslld $22,              x2 ## 2; \
 472                psrld $(32 - 22),       x4 ## 2; \
 473                por x4 ## 2,            x2 ## 2; \
 474                pxor RK0,               x0 ## 2; \
 475                pxor RK2,               x2 ## 2;
 476
 /*
  * KL2(x0, x1, x2, x3, x4, i): key addition followed by the inverse of the
  * LK2 mixing, applied to both register sets (set 1 on the less indented
  * lines, set 2 on the more indented ones).
  *
  * Each ror (32-bit rotate right) is built as copy-to-x4 / psrld / pslld /
  * por.  Per 32-bit lane the net effect is:
  *
  *   x0 ^= RK0;  x1 ^= RK1;  x2 ^= RK2;  x3 ^= RK3;
  *   x0 = ror(x0, 5);    x2 = ror(x2, 22);
  *   x2 ^= x3 ^ (x1 << 7);   x0 ^= x1 ^ x3;
  *   x3 = ror(x3, 7);    x1 = ror(x1, 1);
  *   x3 ^= x2 ^ (x0 << 3);   x1 ^= x0 ^ x2;
  *   x2 = ror(x2, 3);    x0 = ror(x0, 13);
  *
  * No key is loaded here and the parameter i is not referenced: RK0..RK3
  * must already hold the key words.  In this file the SP(..., i) placed
  * just before each KL2 loads them.  x4 of each set is clobbered.
  */
 477#define KL2(x0, x1, x2, x3, x4, i) \
 478        pxor RK0,               x0 ## 1; \
 479        pxor RK2,               x2 ## 1; \
 480        movdqa x0 ## 1,         x4 ## 1; \
 481        psrld $5,               x0 ## 1; \
 482        pslld $(32 - 5),        x4 ## 1; \
 483        por x4 ## 1,            x0 ## 1; \
 484        pxor RK3,               x3 ## 1; \
 485        pxor RK1,               x1 ## 1; \
 486        movdqa x2 ## 1,         x4 ## 1; \
 487        psrld $22,              x2 ## 1; \
 488        pslld $(32 - 22),       x4 ## 1; \
 489        por x4 ## 1,            x2 ## 1; \
 490        pxor x3 ## 1,           x2 ## 1; \
 491                pxor RK0,               x0 ## 2; \
 492                pxor RK2,               x2 ## 2; \
 493                movdqa x0 ## 2,         x4 ## 2; \
 494                psrld $5,               x0 ## 2; \
 495                pslld $(32 - 5),        x4 ## 2; \
 496                por x4 ## 2,            x0 ## 2; \
 497                pxor RK3,               x3 ## 2; \
 498                pxor RK1,               x1 ## 2; \
 499                movdqa x2 ## 2,         x4 ## 2; \
 500                psrld $22,              x2 ## 2; \
 501                pslld $(32 - 22),       x4 ## 2; \
 502                por x4 ## 2,            x2 ## 2; \
 503                pxor x3 ## 2,           x2 ## 2; \
 504        pxor x3 ## 1,           x0 ## 1; \
 505        movdqa x1 ## 1,         x4 ## 1; \
 506        pslld $7,               x4 ## 1; \
 507        pxor x1 ## 1,           x0 ## 1; \
 508        pxor x4 ## 1,           x2 ## 1; \
 509        movdqa x1 ## 1,         x4 ## 1; \
 510        psrld $1,               x1 ## 1; \
 511        pslld $(32 - 1),        x4 ## 1; \
 512        por x4 ## 1,            x1 ## 1; \
 513                pxor x3 ## 2,           x0 ## 2; \
 514                movdqa x1 ## 2,         x4 ## 2; \
 515                pslld $7,               x4 ## 2; \
 516                pxor x1 ## 2,           x0 ## 2; \
 517                pxor x4 ## 2,           x2 ## 2; \
 518                movdqa x1 ## 2,         x4 ## 2; \
 519                psrld $1,               x1 ## 2; \
 520                pslld $(32 - 1),        x4 ## 2; \
 521                por x4 ## 2,            x1 ## 2; \
 522        movdqa x3 ## 1,         x4 ## 1; \
 523        psrld $7,               x3 ## 1; \
 524        pslld $(32 - 7),        x4 ## 1; \
 525        por x4 ## 1,            x3 ## 1; \
 526        pxor x0 ## 1,           x1 ## 1; \
 527        movdqa x0 ## 1,         x4 ## 1; \
 528        pslld $3,               x4 ## 1; \
 529        pxor x4 ## 1,           x3 ## 1; \
 530        movdqa x0 ## 1,         x4 ## 1; \
 531                movdqa x3 ## 2,         x4 ## 2; \
 532                psrld $7,               x3 ## 2; \
 533                pslld $(32 - 7),        x4 ## 2; \
 534                por x4 ## 2,            x3 ## 2; \
 535                pxor x0 ## 2,           x1 ## 2; \
 536                movdqa x0 ## 2,         x4 ## 2; \
 537                pslld $3,               x4 ## 2; \
 538                pxor x4 ## 2,           x3 ## 2; \
 539                movdqa x0 ## 2,         x4 ## 2; \
 540        psrld $13,              x0 ## 1; \
 541        pslld $(32 - 13),       x4 ## 1; \
 542        por x4 ## 1,            x0 ## 1; \
 543        pxor x2 ## 1,           x1 ## 1; \
 544        pxor x2 ## 1,           x3 ## 1; \
 545        movdqa x2 ## 1,         x4 ## 1; \
 546        psrld $3,               x2 ## 1; \
 547        pslld $(32 - 3),        x4 ## 1; \
 548        por x4 ## 1,            x2 ## 1; \
 549                psrld $13,              x0 ## 2; \
 550                pslld $(32 - 13),       x4 ## 2; \
 551                por x4 ## 2,            x0 ## 2; \
 552                pxor x2 ## 2,           x1 ## 2; \
 553                pxor x2 ## 2,           x3 ## 2; \
 554                movdqa x2 ## 2,         x4 ## 2; \
 555                psrld $3,               x2 ## 2; \
 556                pslld $(32 - 3),        x4 ## 2; \
 557                por x4 ## 2,            x2 ## 2;
 558
 /*
  * S(SBOX, x0..x4): apply one S-box to both register sets.
  * SBOX is a macro-name prefix (S0..S7, SI0..SI7); "_1" and "_2" are pasted
  * on to pick its two halves.  Both halves run on set 1 first, then on
  * set 2, with no key loads in between (compare SP()).
  */
 559#define S(SBOX, x0, x1, x2, x3, x4) \
 560        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 561        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 562        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 563        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
 564
 /*
  * SP(SBOX, x0..x4, i): apply one S-box to both register sets while
  * loading the four key words of index i into RK0..RK3.
  *
  * Order of work: half _1 on set 1, half _1 on set 2, half _2 on set 1,
  * half _2 on set 2, with one get_key() ahead of each step (RK0, RK2, RK3,
  * RK1 in that order).  The S-box halves never touch RK0..RK3, so on exit
  * all four key registers are ready for a following KL2().
  *
  * The final body line deliberately carries no line continuation, so the
  * definition ends there and cannot absorb whatever follows it.
  */
 565#define SP(SBOX, x0, x1, x2, x3, x4, i) \
 566        get_key(i, 0, RK0); \
 567        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 568        get_key(i, 2, RK2); \
 569        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 570        get_key(i, 3, RK3); \
 571        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 572        get_key(i, 1, RK1); \
 573        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
 574
 /*
  * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose, in place, the 4x4
  * matrix of 32-bit words whose rows are x0..x3.
  *
  * With rows a, b, c, d on entry, the dword unpacks build the pairs
  * [a0 b0 a1 b1], [a2 b2 a3 b3], [c0 d0 c1 d1], [c2 d2 c3 d3]; the qword
  * unpacks then join them so that on exit xN = [aN bN cN dN].
  *
  * Clobbers t1 and t2.  t0 is accepted but never referenced here.
  */
 575#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 576        movdqa x0,              t2; \
 577        punpckldq x1,           x0; \
 578        punpckhdq x1,           t2; \
 579        movdqa x2,              t1; \
 580        punpckhdq x3,           x2; \
 581        punpckldq x3,           t1; \
 582        movdqa x0,              x1; \
 583        punpcklqdq t1,          x0; \
 584        punpckhqdq t1,          x1; \
 585        movdqa t2,              x3; \
 586        punpcklqdq x2,          t2; \
 587        punpckhqdq x2,          x3; \
 588        movdqa t2,              x2;
 589
 /*
  * read_blocks(in, x0..x3, t0, t1, t2): load four consecutive 16-byte rows
  * from "in" (offsets 0, 16, 32, 48) with unaligned loads, then transpose.
  * Afterwards xN holds 32-bit word N of each of the four rows, one row per
  * lane.  Clobbers t1 and t2 (via transpose_4x4).
  */
 590#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 591        movdqu (0*4*4)(in),     x0; \
 592        movdqu (1*4*4)(in),     x1; \
 593        movdqu (2*4*4)(in),     x2; \
 594        movdqu (3*4*4)(in),     x3; \
 595        \
 596        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 597
 /*
  * write_blocks(out, x0..x3, t0, t1, t2): inverse of read_blocks.
  * Transposes x0..x3 back to one 16-byte row per register, then stores the
  * four rows to "out" at offsets 0, 16, 32, 48 with unaligned stores.
  * Previous contents of "out" are overwritten.  Clobbers t1 and t2.
  */
 598#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 599        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 600        \
 601        movdqu x0,              (0*4*4)(out); \
 602        movdqu x1,              (1*4*4)(out); \
 603        movdqu x2,              (2*4*4)(out); \
 604        movdqu x3,              (3*4*4)(out);
 605
 /*
  * xor_blocks(out, x0..x3, t0, t1, t2): like write_blocks, but each 16-byte
  * row is XORed with what is already stored at "out" before being written
  * back (read-modify-write at offsets 0, 16, 32, 48).
  * t0 is the load temporary; t1 and t2 are clobbered by transpose_4x4.
  */
 606#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 607        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 608        \
 609        movdqu (0*4*4)(out),    t0; \
 610        pxor t0,                x0; \
 611        movdqu x0,              (0*4*4)(out); \
 612        movdqu (1*4*4)(out),    t0; \
 613        pxor t0,                x1; \
 614        movdqu x1,              (1*4*4)(out); \
 615        movdqu (2*4*4)(out),    t0; \
 616        pxor t0,                x2; \
 617        movdqu x2,              (2*4*4)(out); \
 618        movdqu (3*4*4)(out),    t0; \
 619        pxor t0,                x3; \
 620        movdqu x3,              (3*4*4)(out);
 621
 622SYM_FUNC_START(__serpent_enc_blk_8way)
 623        /* input:
 624         *      %rdi: ctx, CTX
 625         *      %rsi: dst
 626         *      %rdx: src
 627         *      %rcx: bool, if true: xor output
 628         */
            /*
             * Encrypts 128 bytes (eight 16-byte blocks) from src in one
             * pass: four blocks in register set 1, four in set 2.
             * Writes 128 bytes at dst, either overwriting them or XORing
             * into them depending on %rcx.
             * Clobbers %rax, %xmm0-%xmm14 and the flags; uses no stack.
             */
 629
            /* RNOT = all ones: every dword compares equal to itself. */
 630        pcmpeqd RNOT, RNOT;
 631
            /* %rax = src + 64, i.e. the second group of four blocks. */
 632        leaq (4*4*4)(%rdx), %rax;
 633        read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 634        read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 635
            /*
             * Key index 0 is added first; then 31 steps of
             * "S-box, LK2 with key index 1..31"; then S7 and a plain key
             * addition with index 32.  The S-box cycles S0..S7.
             * The register names passed to each step are permuted to match
             * where the previous S-box left its outputs, so no register
             * moves are needed between steps.
             */
 636                                                 K2(RA, RB, RC, RD, RE, 0);
 637        S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
 638        S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
 639        S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
 640        S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
 641        S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
 642        S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
 643        S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
 644        S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
 645        S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
 646        S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
 647        S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
 648        S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
 649        S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
 650        S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
 651        S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
 652        S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
 653        S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
 654        S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
 655        S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
 656        S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
 657        S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
 658        S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
 659        S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
 660        S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
 661        S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
 662        S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
 663        S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
 664        S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
 665        S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
 666        S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
 667        S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
 668        S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);
 669
            /* %rax = dst + 64, destination of the second group. */
 670        leaq (4*4*4)(%rsi), %rax;
 671
            /* Only the low byte of %rcx is tested; non-zero selects XOR. */
 672        testb %cl, %cl;
 673        jnz .L__enc_xor8;
 674
            /* Plain store: dst is overwritten with the result. */
 675        write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 676        write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 677
 678        ret;
 679
 680.L__enc_xor8:
            /* XOR store: dst ^= result, 16 bytes at a time. */
 681        xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 682        xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 683
 684        ret;
 685SYM_FUNC_END(__serpent_enc_blk_8way)
 686
 687SYM_FUNC_START(serpent_dec_blk_8way)
 688        /* input:
 689         *      %rdi: ctx, CTX
 690         *      %rsi: dst
 691         *      %rdx: src
 692         */
            /*
             * Decrypts 128 bytes (eight 16-byte blocks) from src in one
             * pass: four blocks in register set 1, four in set 2.
             * Always overwrites 128 bytes at dst; unlike the encrypt entry
             * point there is no XOR-output mode.
             * Clobbers %rax, %xmm0-%xmm14; uses no stack.
             */
 693
            /* RNOT = all ones: every dword compares equal to itself. */
 694        pcmpeqd RNOT, RNOT;
 695
            /* %rax = src + 64, i.e. the second group of four blocks. */
 696        leaq (4*4*4)(%rdx), %rax;
 697        read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 698        read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 699
            /*
             * Mirror image of the encrypt schedule: key index 32 first,
             * then 31 steps of "SP (S-box + load key i), KL2" for
             * i = 31..1 with the S-box cycling SI7..SI0, then SI0 and a
             * plain key addition with index 0.
             * SP() loads RK0..RK3 for the KL2() on the same line; KL2()
             * itself loads nothing.  As in encryption, operand names are
             * permuted to follow each S-box's output placement.
             */
 700                                                 K2(RA, RB, RC, RD, RE, 32);
 701        SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
 702        SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
 703        SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
 704        SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
 705        SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
 706        SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
 707        SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
 708        SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
 709        SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
 710        SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
 711        SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
 712        SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
 713        SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
 714        SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
 715        SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
 716        SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
 717        SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
 718        SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
 719        SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
 720        SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
 721        SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
 722        SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
 723        SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
 724        SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
 725        SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
 726        SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
 727        SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
 728        SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
 729        SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
 730        SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
 731        SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
 732        S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);
 733
            /*
             * The final state sits in (RC, RD, RB, RE) of each set, not in
             * RA..RD, so those are the registers written out.
             * %rax = dst + 64 for the second group.
             */
 734        leaq (4*4*4)(%rsi), %rax;
 735        write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
 736        write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
 737
 738        ret;
 739SYM_FUNC_END(serpent_dec_blk_8way)
 740