linux/arch/x86/include/asm/xor_64.h
#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * (KNI, the "Katmai New Instructions", was the pre-release name of SSE)
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer (AMD K8) yet, but there
 * are likely no advantages to be gotten from x86-64 here anyway.
 */

/*
 * Scratch space the size of one 128-bit XMM register; xmm_save[4] in
 * the routines below holds %xmm0-%xmm3 across the unrolled loops.
 */
typedef struct {
        unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/*
 * gcc is not used to save the XMM registers, because there is no easy
 * way to tell it to execute clts (clear CR0.TS, so that SSE use does
 * not fault) before the register saving.  Both macros expect a local
 * `cr0' and an xmm_store_t xmm_save[4] to be in scope.
 */
#define XMMS_SAVE                               \
do {                                            \
        preempt_disable();                      \
        asm volatile(                           \
                "movq %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory");                    \
} while (0)

#define XMMS_RESTORE                            \
do {                                            \
        asm volatile(                           \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movq   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory");                    \
        preempt_enable();                       \
} while (0)
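
/*
 * Typical use, as in the routines below (a usage sketch, not a new
 * interface):
 *
 *        unsigned long cr0;
 *        xmm_store_t xmm_save[4];
 *
 *        XMMS_SAVE;
 *        asm volatile( ... reads/writes %xmm0-%xmm3 ... );
 *        XMMS_RESTORE;
 *
 * XMMS_SAVE disables preemption and XMMS_RESTORE re-enables it, so
 * the code in between must not sleep.
 */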

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
#define ST(x, y)        "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
#define XO1(x, y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
#define XO2(x, y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
#define XO3(x, y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
#define XO4(x, y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
#define XO5(x, y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"

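/*
 * Each BLOCK(i) moves 4 x 16 bytes through %xmm0-%xmm3, and every loop
 * iteration runs four BLOCKs, i.e. 256 bytes per source -- hence
 * `lines = bytes >> 8' and the 256UL increment in the routines below.
 * PF_OFFS() therefore prefetches exactly one iteration ahead, using
 * prefetchnta's non-temporal hint to minimize cache pollution.  The
 * staggered indentation inside each BLOCK() groups operations on
 * independent cache lines, to make the intended interleaving visible.
 * PF5()/XO5() (a sixth source, %[p6]) are defined for symmetry but
 * unused in this file.
 */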
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 8;
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}
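
/*
 * Functionally, each xor_sse_N() computes
 * p1[i] ^= p2[i] ^ ... ^ pN[i] over `bytes' bytes.  A plain-C sketch
 * of the two-source case, equivalent modulo the prefetching:
 *
 *        for (i = 0; i < bytes / sizeof(*p1); i++)
 *                p1[i] ^= p2[i];
 *
 * The routines below extend the same loop to three, four and five
 * source buffers.
 */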

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 8;
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addq %[inc], %[p1]           ;\n"
        "       addq %[inc], %[p2]           ;\n"
        "       addq %[inc], %[p3]           ;\n"
        "       addq %[inc], %[p4]           ;\n"
        "       addq %[inc], %[p5]           ;\n"
        "       decl %[cnt] ; jnz 1b"
        : [cnt] "+c" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        : [inc] "r" (256UL)
        : "memory");

        XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
        .name = "generic_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
do {                                            \
        xor_speed(&xor_block_sse);              \
} while (0)
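
/*
 * The generic xor code (crypto/xor.c) runs XOR_TRY_TEMPLATES when it
 * initializes, benchmarking each registered template via xor_speed().
 * Normally the fastest result would then be picked through
 * XOR_SELECT_TEMPLATE; here the benchmark is overridden, as explained
 * below.
 */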

/*
 * We force the use of the SSE xor block because it can write around
 * the L2 cache.  We may also be able to load into the L1 cache only,
 * depending on how the CPU deals with a load to a line that is being
 * prefetched.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */