linux/lib/raid6/avx512.c
// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */
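/*
 * The RAID-6 P block is the plain XOR of all data blocks; the Q block is
 * the GF(2^8) sum of g^z * D_z with generator g = 2 over the field
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  The kernels below
 * evaluate Q Horner-style: starting from the highest data disk, the
 * running value is multiplied by 2 in GF(2^8) and the next lower disk's
 * data is XOR-ed in.  As a rough scalar sketch of that multiply-by-2
 * step (illustrative only, the helper name is made up here):
 *
 *	static inline u8 raid6_gf_mul2(u8 a)
 *	{
 *		return (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
 *	}
 *
 * The vector code does the same thing branch-free on 64 bytes at a time:
 * a signed byte compare against zero flags the bytes whose top bit is
 * set, vpmovm2b expands that mask, and the doubled bytes are reduced by
 * XOR with the replicated 0x1d constant.
 */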

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

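/*
 * 0x1d is the low byte of the RAID-6 field polynomial 0x11d, replicated
 * to fill a 512-bit register; it is the value XOR-ed into bytes that
 * overflow during the multiply-by-2 reduction.
 */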
static const struct raid6_avx512_constants {
        u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

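/*
 * Require every feature the routines below rely on; AVX512BW in
 * particular provides the zmm-wide byte compares and vpmovm2b.
 */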
static int raid6_have_avx512(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) &&
                boot_cpu_has(X86_FEATURE_AVX) &&
                boot_cpu_has(X86_FEATURE_AVX512F) &&
                boot_cpu_has(X86_FEATURE_AVX512BW) &&
                boot_cpu_has(X86_FEATURE_AVX512VL) &&
                boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

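        /*
         * One 64-byte chunk per iteration: zmm2 accumulates P (plain XOR),
         * zmm4 accumulates Q.  Before each lower disk is folded in, Q is
         * multiplied by 2 in GF(2^8): vpcmpgtb against zero (zmm1) flags
         * bytes with the top bit set, vpaddb doubles every byte, and the
         * flagged bytes are reduced by XOR with 0x1d (zmm0).
         */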
        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %1,%%zmm6"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm4,%1\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4"
                             :
                             : "m" (p[d]), "m" (q[d]));
        }

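        /* Order the weakly-ordered, non-temporal stores before returning. */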
        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

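/*
 * Fold the contribution of data disks start..stop into the existing P
 * and Q buffers.  Disks below 'start' contribute no data here, but Q
 * still has to be multiplied by the generator once per skipped disk,
 * which is all the "left side" loops below do.
 */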
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm2\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2"
                             :
                             : "m" (dptr[z0][d]),  "m" (p[d]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                /* Don't use movntdq for r/w memory area < cache line */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm2,%1"
                             :
                             : "m" (q[d]), "m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x1 = {
        raid6_avx5121_gen_syndrome,
        raid6_avx5121_xor_syndrome,
        raid6_have_avx512,
        "avx512x1",
        1                       /* Has cache hints */
};

/*
 * Unrolled-by-2 AVX512 implementation
 */
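/*
 * Same math as avx512x1, but two independent 64-byte lanes per loop
 * iteration (P in zmm2/zmm3, Q in zmm4/zmm6).
 */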
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        /* We uniformly assume a single prefetch covers at least 64 bytes */
        for (d = 0; d < bytes; d += 128) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
                             "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
                             "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
                             "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm4,%2\n\t"
                             "vmovntdq %%zmm6,%3"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
                               "m" (q[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm2\n\t"
                             "vmovdqa64 %3,%%zmm3\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (p[d]), "m" (p[d+64]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             "vpxorq %1,%%zmm6,%%zmm6\n\t"
                             /* Don't use movntdq for r/w
                              * memory area < cache line
                              */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm6,%1\n\t"
                             "vmovdqa64 %%zmm2,%2\n\t"
                             "vmovdqa64 %%zmm3,%3"
                             :
                             : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
                               "m" (p[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
        raid6_avx5122_gen_syndrome,
        raid6_avx5122_xor_syndrome,
        raid6_have_avx512,
        "avx512x2",
        1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
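/*
 * This variant needs zmm10..zmm15 for the extra lanes; the upper SIMD
 * registers are only available in 64-bit mode, hence the CONFIG_X86_64
 * guard above.
 */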
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
                     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
                     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
                     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
                     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
                     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
                     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
                     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
                     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0; d < bytes; d += 256) {
                for (z = z0; z >= 0; z--) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "prefetchnta %2\n\t"
                             "prefetchnta %3\n\t"
                             "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                             "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
                             "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpmovm2b %%k2,%%zmm7\n\t"
                             "vpmovm2b %%k3,%%zmm13\n\t"
                             "vpmovm2b %%k4,%%zmm15\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                             "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                             "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                             "vmovdqa64 %0,%%zmm5\n\t"
                             "vmovdqa64 %1,%%zmm7\n\t"
                             "vmovdqa64 %2,%%zmm13\n\t"
                             "vmovdqa64 %3,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14"
                             :
                             : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                               "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vmovntdq %%zmm14,%7\n\t"
                             "vpxorq %%zmm14,%%zmm14,%%zmm14"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     :: "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 256) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm12\n\t"
                             "vmovdqa64 %3,%%zmm14\n\t"
                             "vmovdqa64 %4,%%zmm2\n\t"
                             "vmovdqa64 %5,%%zmm3\n\t"
                             "vmovdqa64 %6,%%zmm10\n\t"
                             "vmovdqa64 %7,%%zmm11\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm14,%%zmm11,%%zmm11"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
                               "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "prefetchnta %0\n\t"
                                     "prefetchnta %2\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vmovdqa64 %2,%%zmm13\n\t"
                                     "vmovdqa64 %3,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                                       "m" (dptr[z][d+128]),
                                       "m" (dptr[z][d+192]));
                }
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             :
                             : "m" (q[d]), "m" (q[d+128]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : );
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %4,%%zmm4,%%zmm4\n\t"
                             "vpxorq %5,%%zmm6,%%zmm6\n\t"
                             "vpxorq %6,%%zmm12,%%zmm12\n\t"
                             "vpxorq %7,%%zmm14,%%zmm14\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vmovntdq %%zmm14,%7"
                             :
                             : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
        raid6_avx5124_gen_syndrome,
        raid6_avx5124_xor_syndrome,
        raid6_have_avx512,
        "avx512x4",
        1                       /* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX512 */