#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
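
/*
 * All routines below compute p1 ^= p2 [^ p3 ^ p4 ^ p5] over `bytes'
 * bytes.  The caller must pass a size that is a multiple of the
 * per-iteration line size: 128 bytes for the pII_mmx routines, 64 for
 * the p5_mmx ones and 256 for the SSE ones (note the bytes >> 7 /
 * >> 6 / >> 8 line counts below); no tail handling is done here.
 */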

#define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
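
/*
 * For illustration: LD(0, 0) expands to "movq 8*(0)(%1), %%mm0", i.e.
 * load the first quadword of the destination buffer (asm operand %1)
 * into %mm0, and XO1(0, 0) XORs in the corresponding quadword of the
 * first source (%2).  The staircase indentation in the BLOCK() macros
 * below only visualizes the four independent register streams; it does
 * not change the emitted instructions.
 */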

#include <asm/i387.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
        ST(i, 0)                                \
                XO1(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO1(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO1(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
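
/*
 * A minimal scalar sketch of what xor_pII_mmx_2 computes, for reference
 * only (hypothetical helper, not used anywhere in the kernel): the MMX
 * version above does exactly this, 128 bytes per loop iteration.
 */
static inline void
xor_scalar_2_sketch(unsigned long bytes, unsigned long *p1,
                    unsigned long *p2)
{
        unsigned long i;

        /* assumes bytes is a multiple of sizeof(unsigned long) */
        for (i = 0; i < bytes / sizeof(unsigned long); i++)
                p1[i] ^= p2[i];
}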

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
        ST(i, 0)                                \
                XO2(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO2(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO2(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
        ST(i, 0)                                \
                XO3(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO3(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO3(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
                XO3(i + 1, 1)                   \
                        XO3(i + 2, 2)           \
                                XO3(i + 3, 3)   \
        XO4(i, 0)                               \
        ST(i, 0)                                \
                XO4(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO4(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO4(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}
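
/*
 * Note on the empty-asm idiom used above: the "+r" constraint tells GCC
 * the asm both reads and writes the variable, so the compiler forgets
 * any cached value and must not share the register with another live
 * variable.  Illustrative sketch only:
 *
 *      int v = compute();
 *      asm("" : "+r" (v));     barrier: GCC now assumes v changed
 */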

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

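/*
 * The p5_mmx variants below target the in-order, dual-pipe Pentium:
 * instead of the macro-unrolled 128-byte blocks above, they hand-
 * interleave loads, pxors and stores over 64-byte lines so both pipes
 * stay busy and load latency is hidden.
 */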
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};
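
/*
 * Illustrative usage sketch (hypothetical helper, not part of the
 * kernel API): how a caller would invoke one of these templates
 * directly.  Real users go through xor_blocks() in crypto/xor.c, which
 * dispatches to whichever template calibration selected.
 */
static inline void
xor_three_buffers_sketch(unsigned long bytes, unsigned long *dst,
                         unsigned long *src1, unsigned long *src2)
{
        /* dst ^= src1 ^ src2; bytes must be a multiple of 64 here */
        xor_block_p5_mmx.do_3(bytes, dst, src1, src2);
}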

/*
 * Cache-avoiding checksumming functions utilizing KNI (SSE) instructions.
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"
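
/*
 * PF_OFFS(x) evaluates to 256+16*x, so each prefetchnta pulls in data
 * 256 bytes (one full loop iteration) ahead of the current position.
 * prefetchnta uses a non-temporal hint, which keeps these streaming
 * buffers from displacing useful data in the outer cache levels.
 */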

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
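
/*
 * Each SSE BLOCK(i) moves 64 bytes (four 16-byte movaps loads), and the
 * loop body runs four blocks per iteration, matching the addl $256
 * pointer advances.  The PF0()/PF1() prefetches inside the blocks walk
 * 256 bytes ahead of the data being processed, while the two PF0()
 * lines ahead of the loop prime the first iteration.
 */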

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       addl $256, %5           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        xor_speed(&xor_block_8regs);                    \
        xor_speed(&xor_block_8regs_p);                  \
        xor_speed(&xor_block_32regs);                   \
        xor_speed(&xor_block_32regs_p);                 \
        AVX_XOR_SPEED;                                  \
        if (cpu_has_xmm)                                \
                xor_speed(&xor_block_pIII_sse);         \
        if (cpu_has_mmx) {                              \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        }                                               \
} while (0)
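
/*
 * XOR_TRY_TEMPLATES is expanded by the calibration code in crypto/xor.c
 * (calibrate_xor_blocks()), where xor_speed() benchmarks each candidate
 * template; xor_blocks() then uses the winner, subject to the
 * XOR_SELECT_TEMPLATE override below.
 */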

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 cache only, depending on how
   the CPU deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST)                    \
        AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */