linux/lib/raid6/avx2.c
// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

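/*
 * 0x1d is the low byte of the RAID-6 generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), replicated across a full 32-byte
 * YMM register so the GF(2^8) multiply-by-2 reduction can be applied
 * to all byte lanes with a single vpand/vpxor pair.
 */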
static const struct raid6_avx2_constants {
        u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
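/*
 * One 32-byte chunk per iteration: P is the plain XOR of all data
 * blocks, while Q is accumulated Horner-style from the highest data
 * disk down, multiplying the running sum by g = 0x02 in GF(2^8)
 * before each new block is XORed in.  Results are written with
 * non-temporal stores so the parity blocks bypass the cache.
 */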
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm3,%ymm3,%ymm3");        /* Zero temp */

        for (d = 0; d < bytes; d += 32) {
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
                asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
                asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
                asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
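                /*
                 * Horner step: multiply the Q accumulator (ymm4) by
                 * g = 0x02 in GF(2^8).  vpcmpgtb against zero (ymm3)
                 * yields 0xff in every byte whose top bit is set,
                 * vpaddb shifts each byte left by one, and the masked
                 * vpxor with 0x1d folds the polynomial reduction back
                 * in.  The next data block is then XORed into both P
                 * (ymm2) and Q (ymm4).
                 */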
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm6,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm6,%ymm4,%ymm4");
                        asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
                asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                asm volatile("vpand %ymm0,%ymm5,%ymm5");
                asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                asm volatile("vpxor %ymm6,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm4,%ymm4");

                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vpxor %ymm2,%ymm2,%ymm2");
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vpxor %ymm4,%ymm4,%ymm4");
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

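/*
 * Partial P/Q update for the data disks in [start, stop]: the existing
 * P and Q blocks are read back, the blocks in that range are folded in
 * the same way as in gen_syndrome, and for disks below @start the
 * "left side" loop only keeps multiplying the partial Q by g, with no
 * data load, so its contribution is weighted by the correct power of
 * the generator before being XORed into the stored syndrome.
 */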
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 32) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                }
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                /* Don't use movntdq for r/w memory area < cache line */
                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

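/*
 * Descriptor handed to the raid6 core: syndrome generation and
 * partial-update routines, the availability check, the algorithm name,
 * and a flag noting that these routines issue their own cache hints
 * (prefetchnta / non-temporal stores).
 */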
const struct raid6_calls raid6_avx2x1 = {
        raid6_avx21_gen_syndrome,
        raid6_avx21_xor_syndrome,
        raid6_have_avx2,
        "avx2x1",
        1                       /* Has cache hints */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
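/*
 * Same scheme as the x1 variant, but two 32-byte lanes (64 bytes) are
 * processed per iteration as independent dependency chains, so more
 * independent work is in flight per loop trip.
 */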
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

        /* We uniformly assume a single prefetch covers at least 32 bytes */
        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
                asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
                asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
                        asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7"
                                     :: "m" (dptr[z][d+32]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
                /* Don't use movntdq for r/w memory area < cache line */
                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
        raid6_avx22_gen_syndrome,
        raid6_avx22_xor_syndrome,
        raid6_have_avx2,
        "avx2x2",
        1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

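/*
 * The unrolled-by-4 variant also needs ymm8-ymm15, and those registers
 * only exist in 64-bit mode, hence the CONFIG_X86_64 guard.
 */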
/*
 * Unrolled-by-4 AVX2 implementation
 */
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm1,%ymm1,%ymm1");        /* Zero temp */
        asm volatile("vpxor %ymm2,%ymm2,%ymm2");        /* P[0] */
        asm volatile("vpxor %ymm3,%ymm3,%ymm3");        /* P[1] */
        asm volatile("vpxor %ymm4,%ymm4,%ymm4");        /* Q[0] */
        asm volatile("vpxor %ymm6,%ymm6,%ymm6");        /* Q[1] */
        asm volatile("vpxor %ymm10,%ymm10,%ymm10");     /* P[2] */
        asm volatile("vpxor %ymm11,%ymm11,%ymm11");     /* P[3] */
        asm volatile("vpxor %ymm12,%ymm12,%ymm12");     /* Q[2] */
        asm volatile("vpxor %ymm14,%ymm14,%ymm14");     /* Q[3] */

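        /*
         * Unlike the x1/x2 variants, all eight P/Q accumulators are
         * zeroed up front and the inner loop runs over every data disk
         * (z0 down to 0) instead of seeding from the highest disk;
         * each accumulator is re-zeroed right after its non-temporal
         * store below.
         */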
        for (d = 0; d < bytes; d += 128) {
                for (z = z0; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
                        asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                        asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
                        asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
                        asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vpxor %ymm2,%ymm2,%ymm2");
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vpxor %ymm3,%ymm3,%ymm3");
                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
                asm volatile("vpxor %ymm10,%ymm10,%ymm10");
                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
                asm volatile("vpxor %ymm11,%ymm11,%ymm11");
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vpxor %ymm4,%ymm4,%ymm4");
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vpxor %ymm6,%ymm6,%ymm6");
                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
                asm volatile("vpxor %ymm12,%ymm12,%ymm12");
                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
                asm volatile("vpxor %ymm14,%ymm14,%ymm14");
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
                asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
                asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
                asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
                asm volatile("vpxor %ymm12,%ymm10,%ymm10");
                asm volatile("vpxor %ymm14,%ymm11,%ymm11");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7"
                                     :: "m" (dptr[z][d+32]));
                        asm volatile("vmovdqa %0,%%ymm13"
                                     :: "m" (dptr[z][d+64]));
                        asm volatile("vmovdqa %0,%%ymm15"
                                     :: "m" (dptr[z][d+96]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
                asm volatile("prefetchnta %0" :: "m" (q[d]));
                asm volatile("prefetchnta %0" :: "m" (q[d+64]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
                asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
                asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
        }
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
        raid6_avx24_gen_syndrome,
        raid6_avx24_xor_syndrome,
        raid6_have_avx2,
        "avx2x4",
        1                       /* Has cache hints */
};
#endif