linux/lib/raid6/avx2.c
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

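/*
 * 0x1d is the low byte of the GF(2^8) generator polynomial used by the
 * RAID-6 code (x^8 + x^4 + x^3 + x^2 + 1).  Replicating it across all 32
 * byte lanes of a ymm register lets the syndrome loops below perform the
 * polynomial reduction for a byte-wise multiply-by-2 with one vpand/vpxor.
 */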
static const struct raid6_avx2_constants {
        u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm3,%ymm3,%ymm3");        /* Zero temp */

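        /*
         * Horner-style evaluation: for each 32-byte chunk, start with the
         * highest data disk and work down.  P accumulates a plain XOR of
         * all data blocks, while Q is multiplied by 2 in GF(2^8) before
         * each new block is XORed in.  The multiply-by-2 is the usual
         * SSE/AVX trick: vpcmpgtb against zero extracts the sign bits as
         * a 0xff mask, vpaddb shifts every byte left by one, and
         * vpand/vpxor fold the 0x1d reduction back into the bytes that
         * overflowed.
         */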
        for (d = 0; d < bytes; d += 32) {
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
                asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
                asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
                asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm6,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm6,%ymm4,%ymm4");
                        asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
                asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                asm volatile("vpand %ymm0,%ymm5,%ymm5");
                asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                asm volatile("vpxor %ymm6,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm4,%ymm4");

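                /*
                 * Stream the finished P and Q chunks straight to memory:
                 * vmovntdq bypasses the cache since the parity blocks will
                 * not be read again soon, and the sfence after the loop
                 * makes the non-temporal stores globally visible.
                 */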
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vpxor %ymm2,%ymm2,%ymm2");
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vpxor %ymm4,%ymm4,%ymm4");
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

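        /*
         * Partial update for data disks start..stop: P is just XORed with
         * the supplied blocks, but Q is weighted by powers of 2, so after
         * folding in those blocks (the "right side" loop) the running Q
         * delta must still be multiplied by 2 once for every disk index
         * below start (the "left side" loop) before it is XORed into the
         * old Q read from memory.
         */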
        for (d = 0 ; d < bytes ; d += 32) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                }
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                /* Don't use movntdq for r/w memory area < cache line */
                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

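/*
 * Positional initializer for struct raid6_calls (see <linux/raid/pq.h>):
 * gen_syndrome, xor_syndrome, valid (CPU feature check), name, and a flag
 * noting that the routines use cache hints.
 */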
const struct raid6_calls raid6_avx2x1 = {
        raid6_avx21_gen_syndrome,
        raid6_avx21_xor_syndrome,
        raid6_have_avx2,
        "avx2x1",
        1                       /* Has cache hints */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
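/*
 * Same algorithm as above, but two 32-byte lanes are processed per
 * iteration, giving the CPU two independent dependency chains to overlap;
 * the inner loop starts at z0-1 because both lanes of P and Q are seeded
 * directly from the highest data disk.
 */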
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

        /* We uniformly assume a single prefetch covers at least 32 bytes */
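        /*
         * In practice every AVX2-capable CPU has 64-byte cache lines, so
         * one prefetchnta per 32-byte lane is already conservative.
         */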
        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
                asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
                asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
                asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
                        asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7"
                                     :: "m" (dptr[z][d+32]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                }
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
                /* Don't use movntdq for r/w memory area < cache line */
                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
        raid6_avx22_gen_syndrome,
        raid6_avx22_xor_syndrome,
        raid6_have_avx2,
        "avx2x2",
        1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
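/*
 * The x4 variant keeps four P and four Q accumulators live at once and
 * therefore needs ymm8-ymm15, which only exist in 64-bit mode; hence the
 * CONFIG_X86_64 guard above.
 */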
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
        asm volatile("vpxor %ymm1,%ymm1,%ymm1");        /* Zero temp */
        asm volatile("vpxor %ymm2,%ymm2,%ymm2");        /* P[0] */
        asm volatile("vpxor %ymm3,%ymm3,%ymm3");        /* P[1] */
        asm volatile("vpxor %ymm4,%ymm4,%ymm4");        /* Q[0] */
        asm volatile("vpxor %ymm6,%ymm6,%ymm6");        /* Q[1] */
        asm volatile("vpxor %ymm10,%ymm10,%ymm10");     /* P[2] */
        asm volatile("vpxor %ymm11,%ymm11,%ymm11");     /* P[3] */
        asm volatile("vpxor %ymm12,%ymm12,%ymm12");     /* Q[2] */
        asm volatile("vpxor %ymm14,%ymm14,%ymm14");     /* Q[3] */

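        /*
         * Unlike the x1/x2 versions, all eight P/Q accumulators are zeroed
         * up front and again right after each non-temporal store, and the
         * inner loop runs over every data disk from z0 down to 0, so each
         * 128-byte stripe segment is built entirely in registers.
         */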
        for (d = 0; d < bytes; d += 128) {
                for (z = z0; z >= 0; z--) {
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
                        asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
                        asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                        asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
                        asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
                        asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vpxor %ymm2,%ymm2,%ymm2");
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vpxor %ymm3,%ymm3,%ymm3");
                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
                asm volatile("vpxor %ymm10,%ymm10,%ymm10");
                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
                asm volatile("vpxor %ymm11,%ymm11,%ymm11");
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vpxor %ymm4,%ymm4,%ymm4");
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vpxor %ymm6,%ymm6,%ymm6");
                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
                asm volatile("vpxor %ymm12,%ymm12,%ymm12");
                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
                asm volatile("vpxor %ymm14,%ymm14,%ymm14");
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
                                     size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
                asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
                asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
                asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
                asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
                asm volatile("vpxor %ymm12,%ymm10,%ymm10");
                asm volatile("vpxor %ymm14,%ymm11,%ymm11");
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
                        asm volatile("vmovdqa %0,%%ymm7"
                                     :: "m" (dptr[z][d+32]));
                        asm volatile("vmovdqa %0,%%ymm13"
                                     :: "m" (dptr[z][d+64]));
                        asm volatile("vmovdqa %0,%%ymm15"
                                     :: "m" (dptr[z][d+96]));
                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
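                /*
                 * The old Q blocks are read back (vpxor from q[]) after the
                 * left-side loop below, so start pulling them in now while
                 * the register-only multiply loop runs.
                 */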
                asm volatile("prefetchnta %0" :: "m" (q[d]));
                asm volatile("prefetchnta %0" :: "m" (q[d+64]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
                }
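                /*
                 * Here a full 128 bytes of P and Q are rewritten per
                 * iteration, so non-temporal stores are worthwhile, unlike
                 * the smaller x1/x2 xor paths above which use vmovdqa.
                 */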
                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
                asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
                asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
        }
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
        raid6_avx24_gen_syndrome,
        raid6_avx24_xor_syndrome,
        raid6_have_avx2,
        "avx2x4",
        1                       /* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX2 */