linux/arch/x86/include/asm/xor.h
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

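/*
 * Assembler building blocks, one SSE instruction per 16-byte chunk:
 * OFFS(x) is the byte offset of chunk x within the current 256-byte
 * stride and PF_OFFS(x) points 256 bytes ahead of it.  LD/ST move data
 * between memory and %xmm registers, XOn xors in source buffer n, and
 * PFn issues non-temporal prefetches so the streamed data does not
 * pollute the caches.
 */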
#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
#define NOP(x)

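/*
 * BLK64(pf, op, i) handles one 64-byte block: a single prefetch for the
 * block followed by the given operation on chunks i..i+3 in %xmm0-%xmm3.
 */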
#define BLK64(pf, op, i)                                \
                pf(i)                                   \
                op(i, 0)                                \
                        op(i + 1, 1)                    \
                                op(i + 2, 2)            \
                                        op(i + 3, 3)

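/*
 * xor_sse_N() xors the N-1 source buffers p2..pN into p1, working on
 * 256 bytes per loop iteration (hence lines = bytes >> 8), with the SSE
 * register state protected by kernel_fpu_begin()/kernel_fpu_end().
 */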
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

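/*
 * The *_pf64 variants below do the same work but, via BLK64, issue one
 * prefetch per 64-byte block for each buffer instead of two prefetches
 * per block as in the unrolled BLOCK above.
 */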
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

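/*
 * Only the prefetch64 template is defined here; the remaining templates
 * come from the 32/64-bit headers included below, and the generic xor
 * code benchmarks all of them at boot to pick the fastest.
 */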
static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

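/*
 * AVX_SELECT() (defined in <asm/xor_avx.h>, pulled in via the headers
 * above) overrides the benchmarked winner with the AVX template when
 * the CPU supports it, and otherwise returns FASTEST unchanged.
 */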
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */