/* linux/arch/x86/include/asm/xor.h */
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2#ifndef _ASM_X86_XOR_H
   3#define _ASM_X86_XOR_H
   4
   5/*
   6 * Optimized RAID-5 checksumming functions for SSE.
   7 */
   8
   9/*
  10 * Cache avoiding checksumming functions utilizing KNI instructions
  11 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  12 */
  13
  14/*
  15 * Based on
  16 * High-speed RAID5 checksumming functions utilizing SSE instructions.
  17 * Copyright (C) 1998 Ingo Molnar.
  18 */
  19
  20/*
  21 * x86-64 changes / gcc fixes from Andi Kleen.
  22 * Copyright 2002 Andi Kleen, SuSE Labs.
  23 *
  24 * This hasn't been optimized for the hammer yet, but there are likely
  25 * no advantages to be gotten from x86-64 here anyways.
  26 */
  27
  28#include <asm/fpu/api.h>
  29
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * Assembler fragments.  Each 256-byte line is handled as 16 chunks of
 * 16 bytes, one XMM register's worth each.  OFFS(x) is the byte offset
 * of chunk x within the current line; PF_OFFS(x) is the same offset one
 * full 256-byte line ahead, used for prefetching.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
/* PFn(x): non-temporal prefetch of chunk x of source n, one line ahead */
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
/* LD/ST(x, y): load/store chunk x of destination p1 via register %xmmY */
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
/* XOn(x, y): XOR chunk x of source n into %xmmY */
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
/* NOP: expands to nothing, so BLK64() can skip its prefetch stage */
#define NOP(x)

/*
 * One prefetch for a 64-byte sub-block, then op applied to its four
 * 16-byte chunks in registers %xmm0-%xmm3.
 */
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)
  58
/*
 * p1 ^= p2, 'bytes' long, 16 bytes per XMM op, one 256-byte line per
 * loop iteration.  'bytes' must be a multiple of 256 - the shift below
 * silently drops any remainder.  prefetchnta pulls both streams one
 * line ahead without polluting the cache hierarchy.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): chunks i..i+3 (64 bytes): load from p1, XOR in p2, store
 * back to p1, with prefetches hand-interleaved between the data ops.
 * The staircase indentation tracks which %xmm register each op uses.
 */
#define BLOCK(i)                                        \
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* prime the prefetcher for the first line of p1 */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 109
/*
 * Same operation as xor_sse_2() (p1 ^= p2, 'bytes' a multiple of 256),
 * but with a simpler schedule: one prefetchnta per 64-byte sub-block of
 * each stream (hence the _pf64 name), via the BLK64() stages.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/* BLOCK(i): chunks i..i+3: prefetch+load p1, prefetch+XOR p2, store */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 143
/*
 * p1 ^= p2 ^ p3, 'bytes' long (must be a multiple of 256 - the shift
 * below drops any remainder).  One 256-byte line per loop iteration;
 * all three streams are prefetched non-temporally one line ahead.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): chunks i..i+3: load p1, XOR in p2 then p3, store back to
 * p1; prefetches hand-interleaved between the data ops.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* prime the prefetcher for the first line of p1 */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 202
/*
 * Same operation as xor_sse_3() (p1 ^= p2 ^ p3, 'bytes' a multiple of
 * 256), but issuing one prefetchnta per 64-byte sub-block of each
 * stream via the BLK64() stages.
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/* BLOCK(i): chunks i..i+3: load p1, XOR in p2 and p3, store to p1 */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 239
/*
 * p1 ^= p2 ^ p3 ^ p4, 'bytes' long (must be a multiple of 256 - the
 * shift below drops any remainder).  One 256-byte line per loop
 * iteration; all four streams prefetched non-temporally one line ahead.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): chunks i..i+3: load p1, XOR in p2, p3 and p4, store back
 * to p1; prefetches hand-interleaved between the data ops.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* prime the prefetcher for the first line of p1 */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 305
/*
 * Same operation as xor_sse_4() (p1 ^= p2 ^ p3 ^ p4, 'bytes' a
 * multiple of 256), but issuing one prefetchnta per 64-byte sub-block
 * of each stream via the BLK64() stages.
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/* BLOCK(i): chunks i..i+3: load p1, XOR in p2..p4, store to p1 */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 344
/*
 * p1 ^= p2 ^ p3 ^ p4 ^ p5, 'bytes' long (must be a multiple of 256 -
 * the shift below drops any remainder).  One 256-byte line per loop
 * iteration; all five streams prefetched non-temporally one line ahead.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/*
 * BLOCK(i): chunks i..i+3: load p1, XOR in p2..p5, store back to p1;
 * prefetches hand-interleaved between the data ops.
 */
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


	/* prime the prefetcher for the first line of p1 */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 417
/*
 * Same operation as xor_sse_5() (p1 ^= p2 ^ p3 ^ p4 ^ p5, 'bytes' a
 * multiple of 256), but issuing one prefetchnta per 64-byte sub-block
 * of each stream via the BLK64() stages.
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* we clobber %xmm0-%xmm3 below */

	asm volatile(
#undef BLOCK
/* BLOCK(i): chunks i..i+3: load p1, XOR in p2..p5, store to p1 */
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
 458
/*
 * Template wiring the *_pf64 routines above under the name
 * "prefetch64-sse".  struct xor_block_template is declared elsewhere;
 * NOTE(review): presumably consumed by the generic xor template
 * selection machinery - verify against the headers included below.
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
 466
/* The assembler-fragment macros are private to this header; drop them. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the remaining width-specific xor templates. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * NOTE(review): AVX_SELECT() is defined in the header included above;
 * presumably it substitutes an AVX template when one is usable,
 * falling back to FASTEST otherwise - confirm in xor_avx.h.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
 489