linux/arch/x86/include/asm/xor_32.h
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_32_H
#define _ASM_X86_XOR_32_H

/*
 * Optimized RAID-5 checksumming functions for MMX.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

#define LD(x, y)        "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x, y)        "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x, y)       "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x, y)       "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x, y)       "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x, y)       "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
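/*
 * LD/ST move the x-th quadword of buffer p1 (asm operand %1) into/out of
 * MMX register y; XO1..XO4 XOR in the matching quadword from source
 * operands %2..%5 (p2..p5).
 */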

#include <asm/fpu/api.h>

static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
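        /* each iteration of the unrolled loop below processes 128 bytes */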
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
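/*
 * BLOCK(i) handles quadwords i..i+3 of the 128-byte chunk: all four loads
 * from p1 are issued first, then each XOR with p2 is paired with its store
 * back to p1, staggered across MMX registers %mm0-%mm3.
 */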
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
        ST(i, 0)                                \
                XO1(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO1(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO1(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
        ST(i, 0)                                \
                XO2(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO2(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO2(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
        ST(i, 0)                                \
                XO3(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO3(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO3(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}


static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                \
        LD(i, 0)                                \
                LD(i + 1, 1)                    \
                        LD(i + 2, 2)            \
                                LD(i + 3, 3)    \
        XO1(i, 0)                               \
                XO1(i + 1, 1)                   \
                        XO1(i + 2, 2)           \
                                XO1(i + 3, 3)   \
        XO2(i, 0)                               \
                XO2(i + 1, 1)                   \
                        XO2(i + 2, 2)           \
                                XO2(i + 3, 3)   \
        XO3(i, 0)                               \
                XO3(i + 1, 1)                   \
                        XO3(i + 2, 2)           \
                                XO3(i + 3, 3)   \
        XO4(i, 0)                               \
        ST(i, 0)                                \
                XO4(i + 1, 1)                   \
                ST(i + 1, 1)                    \
                        XO4(i + 2, 2)           \
                        ST(i + 2, 2)            \
                                XO4(i + 3, 3)   \
                                ST(i + 3, 3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

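/*
 * The xor_p5_mmx_*() variants below use a flat, hand-interleaved
 * instruction stream that processes 64 bytes per loop iteration; as the
 * template name suggests, the scheduling is aimed at P5-class
 * (Pentium/Pentium MMX) cores.
 */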
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}

static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
           because we modify p4 and p5 there, but we can't mark them
           as read/write, otherwise we'd overflow the 10-asm-operands
           limit of GCC < 3.1.  */
        asm("" : "+r" (p4), "+r" (p5));

        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        /* p4 and p5 were modified, and now the variables are dead.
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));

        kernel_fpu_end();
}

static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};

static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};
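
/*
 * Rough usage sketch (hypothetical direct call, not how in-tree code uses
 * these): each do_N hook XORs N-1 source buffers into the first buffer,
 * and the byte count must be a non-zero multiple of the routine's chunk
 * size (128 bytes for the pII_mmx variants, 64 bytes for p5_mmx):
 *
 *      unsigned long dst[64], src[64];                 // 512 bytes each
 *      xor_block_p5_mmx.do_2(sizeof(dst), dst, src);   // dst ^= src
 *
 * Normally these templates are only reached through xor_blocks(), after
 * the boot-time benchmarking selected by XOR_TRY_TEMPLATES below.
 */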

/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* Also try the generic routines.  */
#include <asm-generic/xor.h>

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
do {                                                    \
        AVX_XOR_SPEED;                                  \
        if (boot_cpu_has(X86_FEATURE_XMM)) {            \
                xor_speed(&xor_block_pIII_sse);         \
                xor_speed(&xor_block_sse_pf64);         \
        } else if (boot_cpu_has(X86_FEATURE_MMX)) {     \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
        } else {                                        \
                xor_speed(&xor_block_8regs);            \
                xor_speed(&xor_block_8regs_p);          \
                xor_speed(&xor_block_32regs);           \
                xor_speed(&xor_block_32regs_p);         \
        }                                               \
} while (0)
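
/*
 * xor_speed() benchmarks each candidate template when the xor machinery
 * initializes; the generic xor_blocks() path then uses whichever template
 * measured fastest (see crypto/xor.c).
 */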

#endif /* _ASM_X86_XOR_32_H */