linux/arch/x86/include/asm/xor_32.h
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
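
/*
 * RAID-5 parity is plain XOR: for data blocks D1..Dn the parity block
 * is P = D1 ^ D2 ^ ... ^ Dn.  Accordingly, each routine below XORs the
 * contents of 2..5 equally sized buffers into the first buffer (p1),
 * eight bytes at a time through the MMX registers mm0-mm7.
 */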

#define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
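
/*
 * For illustration, LD(2, 1) expands to
 *
 *      "       movq   8*(2)(%1), %%mm1   ;\n"
 *
 * i.e. load the third quadword of the destination buffer (asm operand
 * %1) into register mm1.  XO1(2, 1) XORs the matching quadword of the
 * first source (%2) into mm1, and ST(2, 1) stores the result back.
 */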

#include <asm/fpu/api.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;       /* 128 bytes per iteration */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
        ST(i, 0)                                \
                XO1(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO1(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO1(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
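
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 *
 *      static unsigned long dst[512], src[512];  // 4096 bytes each
 *
 *      xor_pII_mmx_2(4096, dst, src);            // dst[i] ^= src[i]
 *
 * bytes must be a non-zero multiple of 128 here (each loop pass
 * consumes 16 quadwords); in practice the xor_blocks() machinery
 * passes page-sized buffers.
 */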

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
        ST(i, 0)                                \
                XO2(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO2(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO2(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
        ST(i, 0)                                \
                XO3(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO3(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO3(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
                XO3(i + 1, 1)                   \
                        XO3(i + 2, 2)           \
                                XO3(i + 3, 3)   \
        XO4(i, 0)                               \
        ST(i, 0)                                \
                XO4(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO4(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO4(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

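/*
 * The p5_mmx variants below do the same job as the pII_mmx ones but
 * unroll only 64 bytes per loop iteration and interleave the loads,
 * pxors and stores by hand rather than via BLOCK(), presumably to suit
 * the original Pentium's in-order dual pipes.  Note the shift:
 * lines = bytes >> 6 here versus >> 7 above.
 */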
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;       /* 64 bytes per iteration */

        kernel_fpu_begin();

        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};
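
/*
 * xor_sse_2..xor_sse_5 and (below) xor_block_sse_pf64 are not defined
 * in this file; they are expected to come from <asm/xor.h>, the header
 * that includes this one.
 */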

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        AVX_XOR_SPEED;                                  \
        if (boot_cpu_has(X86_FEATURE_XMM)) {            \
                xor_speed(&xor_block_pIII_sse);         \
                xor_speed(&xor_block_sse_pf64);         \
        } else if (boot_cpu_has(X86_FEATURE_MMX)) {     \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        } else {                                        \
                xor_speed(&xor_block_8regs);            \
                xor_speed(&xor_block_8regs_p);          \
                xor_speed(&xor_block_32regs);           \
                xor_speed(&xor_block_32regs_p);         \
        }                                               \
} while (0)
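
/*
 * Sketch of how this hook is consumed (simplified from
 * calibrate_xor_blocks() in crypto/xor.c): each xor_speed() call
 * benchmarks one template, and the fastest one measured becomes the
 * active template that xor_blocks() dispatches to.
 */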

#endif /* _ASM_X86_XOR_32_H */