linux/lib/raid6/avx512.c
/* -*- linux-c -*- --------------------------------------------------------
 *
 *   Copyright (C) 2016 Intel Corporation
 *
 *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
 *   Author: Megha Dey <megha.dey@linux.intel.com>
 *
 *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * -----------------------------------------------------------------------
 */

/*
 * AVX512 implementation of RAID-6 syndrome functions
 *
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

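/*
 * 0x1d is the low byte of the RAID-6 generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d).  Broadcast across a ZMM register,
 * it turns the GF(2^8) multiply-by-{02} into a shift plus a masked
 * XOR; the scalar equivalent of the vector sequence used throughout
 * this file is:
 *
 *   q = (q << 1) ^ ((q & 0x80) ? 0x1d : 0);
 */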
static const struct raid6_avx512_constants {
        u64 x1d[8];
} raid6_avx512_constants __aligned(512/8) = {
        { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
          0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

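/*
 * vpmovm2b and the byte-granular mask compares (vpcmpgtb into a k
 * register) used below are AVX-512BW instructions, so AVX-512F alone
 * is not enough; require the full F/BW/VL/DQ set before advertising
 * these routines.
 */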
static int raid6_have_avx512(void)
{
        return boot_cpu_has(X86_FEATURE_AVX2) &&
                boot_cpu_has(X86_FEATURE_AVX) &&
                boot_cpu_has(X86_FEATURE_AVX512F) &&
                boot_cpu_has(X86_FEATURE_AVX512BW) &&
                boot_cpu_has(X86_FEATURE_AVX512VL) &&
                boot_cpu_has(X86_FEATURE_AVX512DQ);
}

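/*
 * Compute both RAID-6 parities over all data disks: P is the plain
 * XOR of every data block, and Q is the Reed-Solomon syndrome,
 * evaluated by Horner's rule from the highest data disk down:
 *
 *   Q = (...((d[z0] * {02} ^ d[z0-1]) * {02} ^ d[z0-2]) ...) ^ d[0]
 *
 * with all arithmetic in GF(2^8).
 */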
static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0; d < bytes; d += 64) {
                asm volatile("prefetchnta %0\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
                             "vmovdqa64 %1,%%zmm6"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
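                /*
                 * One Horner step per data disk: multiply the running
                 * Q (zmm4) by {02} in GF(2^8), then XOR in the next
                 * data block.  vpcmpgtb against zero (zmm1) sets a
                 * mask bit for every byte whose top bit is set,
                 * vpmovm2b expands that mask to 0xff bytes, vpaddb
                 * doubles each byte, and the masked XOR with 0x1d
                 * (zmm0) folds the overflow back into the field.
                 */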
                for (z = z0-2; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]));
                }
                asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm4,%1\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4"
                             :
                             : "m" (p[d]), "m" (q[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

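/*
 * Fold the partial syndrome of data disks [start, stop] into the
 * existing P/Q pages: P ^= the XOR of those blocks and Q ^= their
 * Reed-Solomon contribution.  Disks below @start supply no data;
 * each one just costs the running Q term one more multiply by {02}
 * (the "left side" loop below).
 */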
static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 64) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm2\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2"
                             :
                             : "m" (dptr[z0][d]),  "m" (p[d]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : "m" (dptr[z][d]));
                }
                /* P/Q left side optimization */
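                /*
                 * No data changes below @start, so there is nothing
                 * to load or XOR here - only the multiply-by-{02}
                 * steps for the remaining disk positions.
                 */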
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                /* Don't use movntdq for r/w memory area < cache line */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm2,%1"
                             :
                             : "m" (q[d]), "m" (p[d]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x1 = {
        raid6_avx5121_gen_syndrome,
        raid6_avx5121_xor_syndrome,
        raid6_have_avx512,
        "avx512x1",
        1                       /* Has cache hints */
};
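
/*
 * Each raid6_calls table in this file is listed in raid6_algos[] and
 * benchmarked at boot by raid6_select_algo() in lib/raid6/algos.c,
 * which picks the fastest implementation whose ->valid() check passes.
 */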

/*
 * Unrolled-by-2 AVX512 implementation
 */
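/*
 * Two independent 64-byte lanes (128 bytes per iteration) keep two
 * multiply-by-{02} dependency chains in flight, which helps hide the
 * latency of the five-instruction GF(2^8) step.
 */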
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

        /* We uniformly assume a single prefetch covers at least 64 bytes */
        for (d = 0; d < bytes; d += 128) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
                             "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
                             "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
                             "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
                for (z = z0-1; z >= 0; z--) {
                        asm volatile("prefetchnta %0\n\t"
                                     "prefetchnta %1\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm4,%2\n\t"
                             "vmovntdq %%zmm6,%3"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
                               "m" (q[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
                                       size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = stop;              /* P/Q right side optimization */
        p = dptr[disks-2];      /* XOR parity */
        q = dptr[disks-1];      /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0"
                     : : "m" (raid6_avx512_constants.x1d[0]));

        for (d = 0 ; d < bytes ; d += 128) {
                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
                             "vmovdqa64 %1,%%zmm6\n\t"
                             "vmovdqa64 %2,%%zmm2\n\t"
                             "vmovdqa64 %3,%%zmm3\n\t"
                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm6,%%zmm3,%%zmm3"
                             :
                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
                               "m" (p[d]), "m" (p[d+64]));
                /* P/Q data pages */
                for (z = z0-1 ; z >= start ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
                }
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6"
                                     :
                                     : );
                }
                asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
                             "vpxorq %1,%%zmm6,%%zmm6\n\t"
                             /* Don't use movntdq for r/w
                              * memory area < cache line
                              */
                             "vmovdqa64 %%zmm4,%0\n\t"
                             "vmovdqa64 %%zmm6,%1\n\t"
                             "vmovdqa64 %%zmm2,%2\n\t"
                             "vmovdqa64 %%zmm3,%3"
                             :
                             : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
                               "m" (p[d+64]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x2 = {
        raid6_avx5122_gen_syndrome,
        raid6_avx5122_xor_syndrome,
        raid6_have_avx512,
        "avx512x2",
        1                       /* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX512 implementation
 */
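/*
 * The 4-wide unroll needs zmm8-zmm15 for the extra P/Q accumulators;
 * those registers are only encodable in 64-bit mode, hence the
 * CONFIG_X86_64 guard.
 */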
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
        u8 **dptr = (u8 **)ptrs;
        u8 *p, *q;
        int d, z, z0;

        z0 = disks - 3;         /* Highest data disk */
        p = dptr[z0+1];         /* XOR parity */
        q = dptr[z0+2];         /* RS syndrome */

        kernel_fpu_begin();

        asm volatile("vmovdqa64 %0,%%zmm0\n\t"
                     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
                     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
                     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
                     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
                     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
                     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
                     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
                     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
                     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
                     :
                     : "m" (raid6_avx512_constants.x1d[0]));

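        /*
         * Unlike the 1- and 2-wide versions, all eight P/Q
         * accumulators are zeroed up front (and re-zeroed after each
         * non-temporal store), so the inner loop runs uniformly from
         * z0 down to 0 with no separate first-disk load.
         */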
        for (d = 0; d < bytes; d += 256) {
                for (z = z0; z >= 0; z--) {
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             "prefetchnta %2\n\t"
                             "prefetchnta %3\n\t"
                             "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
                             "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
                             "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
                             "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
                             "vpmovm2b %%k1,%%zmm5\n\t"
                             "vpmovm2b %%k2,%%zmm7\n\t"
                             "vpmovm2b %%k3,%%zmm13\n\t"
                             "vpmovm2b %%k4,%%zmm15\n\t"
                             "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                             "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                             "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                             "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                             "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                             "vmovdqa64 %0,%%zmm5\n\t"
                             "vmovdqa64 %1,%%zmm7\n\t"
                             "vmovdqa64 %2,%%zmm13\n\t"
                             "vmovdqa64 %3,%%zmm15\n\t"
                             "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                             "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                             "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                             "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                             "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                             "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                             "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                             "vpxorq %%zmm15,%%zmm14,%%zmm14"
                             :
                             : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                               "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
                             "vmovntdq %%zmm14,%7\n\t"
                             "vpxorq %%zmm14,%%zmm14,%%zmm14"
                             :
                             : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}
 427
 428static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
 429                                       size_t bytes, void **ptrs)
 430{
 431        u8 **dptr = (u8 **)ptrs;
 432        u8 *p, *q;
 433        int d, z, z0;
 434
 435        z0 = stop;              /* P/Q right side optimization */
 436        p = dptr[disks-2];      /* XOR parity */
 437        q = dptr[disks-1];      /* RS syndrome */
 438
 439        kernel_fpu_begin();
 440
 441        asm volatile("vmovdqa64 %0,%%zmm0"
 442                     :: "m" (raid6_avx512_constants.x1d[0]));
 443
 444        for (d = 0 ; d < bytes ; d += 256) {
 445                asm volatile("vmovdqa64 %0,%%zmm4\n\t"
 446                             "vmovdqa64 %1,%%zmm6\n\t"
 447                             "vmovdqa64 %2,%%zmm12\n\t"
 448                             "vmovdqa64 %3,%%zmm14\n\t"
 449                             "vmovdqa64 %4,%%zmm2\n\t"
 450                             "vmovdqa64 %5,%%zmm3\n\t"
 451                             "vmovdqa64 %6,%%zmm10\n\t"
 452                             "vmovdqa64 %7,%%zmm11\n\t"
 453                             "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
 454                             "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
 455                             "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
 456                             "vpxorq %%zmm14,%%zmm11,%%zmm11"
 457                             :
 458                             : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
 459                               "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
 460                               "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
 461                               "m" (p[d+192]));
 462                /* P/Q data pages */
 463                for (z = z0-1 ; z >= start ; z--) {
 464                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
 465                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
 466                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
 467                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
 468                                     "prefetchnta %0\n\t"
 469                                     "prefetchnta %2\n\t"
 470                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
 471                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
 472                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
 473                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
 474                                     "vpmovm2b %%k1,%%zmm5\n\t"
 475                                     "vpmovm2b %%k2,%%zmm7\n\t"
 476                                     "vpmovm2b %%k3,%%zmm13\n\t"
 477                                     "vpmovm2b %%k4,%%zmm15\n\t"
 478                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
 479                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
 480                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
                                     "vmovdqa64 %0,%%zmm5\n\t"
                                     "vmovdqa64 %1,%%zmm7\n\t"
                                     "vmovdqa64 %2,%%zmm13\n\t"
                                     "vmovdqa64 %3,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
                                     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
                                     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
                                     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
                                       "m" (dptr[z][d+128]),
                                       "m" (dptr[z][d+192]));
                }
                asm volatile("prefetchnta %0\n\t"
                             "prefetchnta %1\n\t"
                             :
                             : "m" (q[d]), "m" (q[d+128]));
                /* P/Q left side optimization */
                for (z = start-1 ; z >= 0 ; z--) {
                        asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
                                     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
                                     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
                                     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
                                     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
                                     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
                                     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
                                     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
                                     "vpmovm2b %%k1,%%zmm5\n\t"
                                     "vpmovm2b %%k2,%%zmm7\n\t"
                                     "vpmovm2b %%k3,%%zmm13\n\t"
                                     "vpmovm2b %%k4,%%zmm15\n\t"
                                     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
                                     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
                                     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
                                     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
                                     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
                                     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
                                     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
                                     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
                                     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
                                     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
                                     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
                                     "vpxorq %%zmm15,%%zmm14,%%zmm14"
                                     :
                                     : );
                }
                asm volatile("vmovntdq %%zmm2,%0\n\t"
                             "vmovntdq %%zmm3,%1\n\t"
                             "vmovntdq %%zmm10,%2\n\t"
                             "vmovntdq %%zmm11,%3\n\t"
                             "vpxorq %4,%%zmm4,%%zmm4\n\t"
                             "vpxorq %5,%%zmm6,%%zmm6\n\t"
                             "vpxorq %6,%%zmm12,%%zmm12\n\t"
                             "vpxorq %7,%%zmm14,%%zmm14\n\t"
                             "vmovntdq %%zmm4,%4\n\t"
                             "vmovntdq %%zmm6,%5\n\t"
                             "vmovntdq %%zmm12,%6\n\t"
                             "vmovntdq %%zmm14,%7"
                             :
                             : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
                               "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
                               "m" (q[d+128]), "m" (q[d+192]));
        }

        asm volatile("sfence" : : : "memory");
        kernel_fpu_end();
}

const struct raid6_calls raid6_avx512x4 = {
        raid6_avx5124_gen_syndrome,
        raid6_avx5124_xor_syndrome,
        raid6_have_avx512,
        "avx512x4",
        1                       /* Has cache hints */
};
#endif /* CONFIG_X86_64 */

#endif /* CONFIG_AS_AVX512 */