/* linux/arch/x86/include/asm/xor.h */
#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
/* Include guard folded into the kmemcheck check via #elif. */
#define _ASM_X86_XOR_H
   6
   7/*
   8 * Optimized RAID-5 checksumming functions for SSE.
   9 *
  10 * This program is free software; you can redistribute it and/or modify
  11 * it under the terms of the GNU General Public License as published by
  12 * the Free Software Foundation; either version 2, or (at your option)
  13 * any later version.
  14 *
  15 * You should have received a copy of the GNU General Public License
  16 * (for example /usr/src/linux/COPYING); if not, write to the Free
  17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18 */
  19
  20/*
  21 * Cache avoiding checksumming functions utilizing KNI instructions
  22 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  23 */
  24
  25/*
  26 * Based on
  27 * High-speed RAID5 checksumming functions utilizing SSE instructions.
  28 * Copyright (C) 1998 Ingo Molnar.
  29 */
  30
  31/*
  32 * x86-64 changes / gcc fixes from Andi Kleen.
  33 * Copyright 2002 Andi Kleen, SuSE Labs.
  34 *
  35 * This hasn't been optimized for the hammer yet, but there are likely
  36 * no advantages to be gotten from x86-64 here anyways.
  37 */
  38
  39#include <asm/i387.h>
  40
/*
 * Constraint for the 256-byte stride operand of the asm loops below:
 * on 32-bit force an immediate ("i") to free up a register, on 64-bit
 * allow a register or immediate ("re").
 */
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/* OFFS(x): byte offset of 16-byte chunk x inside the current 256-byte line. */
#define OFFS(x)         "16*("#x")"
/* PF_OFFS(x): same chunk, one full 256-byte line ahead (prefetch distance). */
#define PF_OFFS(x)      "256+16*("#x")"
/* PFn(x): non-temporal prefetch of buffer n+1, one line ahead of chunk x. */
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
/* LD/ST move 16-byte chunk x of p1 into/out of register %xmm<y>. */
#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
/* XOn(x, y): %xmm<y> ^= 16-byte chunk x of buffer n+1 (p2..p5). */
#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
/* NOP(x): empty placeholder so BLK64 can skip its prefetch slot. */
#define NOP(x)

/*
 * BLK64(pf, op, i): one prefetch followed by op applied to four
 * consecutive 16-byte chunks (i .. i+3) in %xmm0-%xmm3 -- i.e. one
 * 64-byte sub-block per prefetch.
 */
#define BLK64(pf, op, i)                                \
                pf(i)                                   \
                op(i, 0)                                \
                        op(i + 1, 1)                    \
                                op(i + 2, 2)            \
                                        op(i + 3, 3)
/*
 * p1[] ^= p2[] over 'bytes' bytes, processed 256 bytes per loop
 * iteration in four 16-byte XMM chunks.  Uses prefetchnta so the
 * streamed data bypasses the caches.  Assumes 'bytes' is a multiple
 * of 256 and buffers are 16-byte aligned -- TODO confirm callers
 * guarantee this.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* One 64-byte sub-block: load p1, prefetch, xor in p2, store back. */
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* warm up the prefetcher on the first p1 line */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 120
/*
 * Same result as xor_sse_2() (p1[] ^= p2[]) but scheduled for CPUs
 * that prefetch 64 bytes at a time: one prefetchnta per 64-byte
 * sub-block via BLK64.  Same alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* 64 bytes: prefetch+load p1, prefetch+xor p2, store back to p1. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 154
/*
 * p1[] ^= p2[] ^ p3[] over 'bytes' bytes, 256 bytes per iteration,
 * with cache-bypassing prefetchnta on all three streams.  Same
 * alignment/size assumptions as xor_sse_2().
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* One 64-byte sub-block: load p1, xor in p2 and p3, store back. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* warm up the prefetcher on the first p1 line */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 213
/*
 * Same result as xor_sse_3() (p1[] ^= p2[] ^ p3[]) but with the
 * simpler 64-byte-prefetch schedule (one prefetchnta per BLK64).
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* 64 bytes: load p1, xor in p2 and p3, store back to p1. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 250
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] over 'bytes' bytes, 256 bytes per
 * iteration, prefetchnta on all four streams.  Same alignment/size
 * assumptions as xor_sse_2().
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* One 64-byte sub-block: load p1, xor in p2..p4, store back. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* warm up the prefetcher on the first p1 line */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 316
/*
 * Same result as xor_sse_4() (p1[] ^= p2[] ^ p3[] ^ p4[]) but with
 * the 64-byte-prefetch schedule (one prefetchnta per BLK64).
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* 64 bytes: load p1, xor in p2..p4, store back to p1. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 355
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[] over 'bytes' bytes, 256 bytes per
 * iteration, prefetchnta on all five streams.  Same alignment/size
 * assumptions as xor_sse_2().
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* One 64-byte sub-block: load p1, xor in p2..p5, store back. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* warm up the prefetcher on the first p1 line */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 428
/*
 * Same result as xor_sse_5() (p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[]) but
 * with the 64-byte-prefetch schedule (one prefetchnta per BLK64).
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;       /* 256-byte lines */

        kernel_fpu_begin();     /* we clobber XMM state */

        asm volatile(
#undef BLOCK
/* 64 bytes: load p1, xor in p2..p5, store back to p1. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
 469
 470static struct xor_block_template xor_block_sse_pf64 = {
 471        .name = "prefetch64-sse",
 472        .do_2 = xor_sse_2_pf64,
 473        .do_3 = xor_sse_3_pf64,
 474        .do_4 = xor_sse_4_pf64,
 475        .do_5 = xor_sse_5_pf64,
 476};
 477
/* Scrub the helper macros so they don't leak into files including us. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT
 489
 490#ifdef CONFIG_X86_32
 491# include <asm/xor_32.h>
 492#else
 493# include <asm/xor_64.h>
 494#endif
 495
/*
 * Final template choice: AVX_SELECT() comes from the xor_32.h/xor_64.h
 * include above -- presumably it substitutes an AVX implementation for
 * FASTEST when the CPU supports it; verify against those headers.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)
 498
 499#endif /* _ASM_X86_XOR_H */
 500