/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 59 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

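/*
 * 0x1d is the low byte of the RAID-6 generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), replicated across all 16 bytes of an
 * XMM register so that a whole register of GF(2^8) elements can be
 * multiplied by {02} at once.
 */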
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
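/*
 * The gen_syndrome routines compute, 16 bytes at a time:
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_z0
 *	Q = D_0 ^ {02}.D_1 ^ {02}^2.D_2 ^ ... ^ {02}^z0.D_z0	(in GF(2^8))
 *
 * Q is evaluated by Horner's rule, starting from the highest data disk.
 * The multiply-by-{02} step works per byte b of an XMM register:
 *
 *	mask = (b & 0x80) ? 0xff : 0x00		(pcmpgtb against zero)
 *	b    = (b + b) ^ (mask & 0x1d)		(paddb, pand, pxor)
 *
 * i.e. a left shift followed by conditional reduction by the generator
 * polynomial.
 */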
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
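			/*
			 * Q *= {02} in GF(2^8); then P ^= D and Q ^= D,
			 * where D (%xmm6) was loaded last iteration.
			 */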
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

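	/* Make the non-temporal (movntdq) stores globally visible */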
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


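/*
 * The xor_syndrome variants fold the P/Q contribution of data disks
 * start..stop into existing P/Q pages (partial, incremental update):
 *
 *	P ^= D_start ^ ... ^ D_stop
 *	Q ^= {02}^start.D_start ^ ... ^ {02}^stop.D_stop
 *
 * The Horner recurrence starts at disk "stop" rather than the highest disk
 * ("right side" optimization), and the "left side" loop only keeps
 * multiplying Q by {02} to account for the {02}^z weights of disks below
 * "start" without touching their data.
 */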
static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 SSE2 implementation
 */
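/*
 * Same algorithm, but two 16-byte lanes (32 bytes) are processed per
 * iteration using two independent register chains, presumably to hide
 * instruction latency.
 */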
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
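/*
 * Four 16-byte lanes per iteration.  This uses %xmm8-%xmm15, which are
 * only available in 64-bit mode, hence the CONFIG_X86_64 guard.
 */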
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4");	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6");	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7");	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12");	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14");	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15");	/* Zero temp */

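	/*
	 * Unlike the 1x/2x versions, the P/Q accumulators start out zeroed
	 * (and are re-zeroed after each store below), so the inner loop runs
	 * over every data disk including z0.
	 */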
	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}


const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif /* CONFIG_X86_64 */