/* linux/arch/x86/include/asm/xor_avx.h */
   1#ifndef _ASM_X86_XOR_AVX_H
   2#define _ASM_X86_XOR_AVX_H
   3
   4/*
   5 * Optimized RAID-5 checksumming functions for AVX
   6 *
   7 * Copyright (C) 2012 Intel Corporation
   8 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
   9 *
  10 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  11 *
  12 * This program is free software; you can redistribute it and/or
  13 * modify it under the terms of the GNU General Public License
  14 * as published by the Free Software Foundation; version 2
  15 * of the License.
  16 */
  17
  18#ifdef CONFIG_AS_AVX
  19
  20#include <linux/compiler.h>
  21#include <asm/i387.h>
  22
  23#define BLOCK4(i) \
  24                BLOCK(32 * i, 0) \
  25                BLOCK(32 * (i + 1), 1) \
  26                BLOCK(32 * (i + 2), 2) \
  27                BLOCK(32 * (i + 3), 3)
  28
  29#define BLOCK16() \
  30                BLOCK4(0) \
  31                BLOCK4(4) \
  32                BLOCK4(8) \
  33                BLOCK4(12)
  34
  35static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
  36{
  37        unsigned long lines = bytes >> 9;
  38
  39        kernel_fpu_begin();
  40
  41        while (lines--) {
  42#undef BLOCK
  43#define BLOCK(i, reg) \
  44do { \
  45        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
  46        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
  47                "m" (p0[i / sizeof(*p0)])); \
  48        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  49                "=m" (p0[i / sizeof(*p0)])); \
  50} while (0);
  51
  52                BLOCK16()
  53
  54                p0 = (unsigned long *)((uintptr_t)p0 + 512);
  55                p1 = (unsigned long *)((uintptr_t)p1 + 512);
  56        }
  57
  58        kernel_fpu_end();
  59}
  60
  61static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  62        unsigned long *p2)
  63{
  64        unsigned long lines = bytes >> 9;
  65
  66        kernel_fpu_begin();
  67
  68        while (lines--) {
  69#undef BLOCK
  70#define BLOCK(i, reg) \
  71do { \
  72        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
  73        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  74                "m" (p1[i / sizeof(*p1)])); \
  75        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  76                "m" (p0[i / sizeof(*p0)])); \
  77        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  78                "=m" (p0[i / sizeof(*p0)])); \
  79} while (0);
  80
  81                BLOCK16()
  82
  83                p0 = (unsigned long *)((uintptr_t)p0 + 512);
  84                p1 = (unsigned long *)((uintptr_t)p1 + 512);
  85                p2 = (unsigned long *)((uintptr_t)p2 + 512);
  86        }
  87
  88        kernel_fpu_end();
  89}
  90
  91static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  92        unsigned long *p2, unsigned long *p3)
  93{
  94        unsigned long lines = bytes >> 9;
  95
  96        kernel_fpu_begin();
  97
  98        while (lines--) {
  99#undef BLOCK
 100#define BLOCK(i, reg) \
 101do { \
 102        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
 103        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 104                "m" (p2[i / sizeof(*p2)])); \
 105        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 106                "m" (p1[i / sizeof(*p1)])); \
 107        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 108                "m" (p0[i / sizeof(*p0)])); \
 109        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 110                "=m" (p0[i / sizeof(*p0)])); \
 111} while (0);
 112
 113                BLOCK16();
 114
 115                p0 = (unsigned long *)((uintptr_t)p0 + 512);
 116                p1 = (unsigned long *)((uintptr_t)p1 + 512);
 117                p2 = (unsigned long *)((uintptr_t)p2 + 512);
 118                p3 = (unsigned long *)((uintptr_t)p3 + 512);
 119        }
 120
 121        kernel_fpu_end();
 122}
 123
 124static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 125        unsigned long *p2, unsigned long *p3, unsigned long *p4)
 126{
 127        unsigned long lines = bytes >> 9;
 128
 129        kernel_fpu_begin();
 130
 131        while (lines--) {
 132#undef BLOCK
 133#define BLOCK(i, reg) \
 134do { \
 135        asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
 136        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 137                "m" (p3[i / sizeof(*p3)])); \
 138        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 139                "m" (p2[i / sizeof(*p2)])); \
 140        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 141                "m" (p1[i / sizeof(*p1)])); \
 142        asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 143                "m" (p0[i / sizeof(*p0)])); \
 144        asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 145                "=m" (p0[i / sizeof(*p0)])); \
 146} while (0);
 147
 148                BLOCK16()
 149
 150                p0 = (unsigned long *)((uintptr_t)p0 + 512);
 151                p1 = (unsigned long *)((uintptr_t)p1 + 512);
 152                p2 = (unsigned long *)((uintptr_t)p2 + 512);
 153                p3 = (unsigned long *)((uintptr_t)p3 + 512);
 154                p4 = (unsigned long *)((uintptr_t)p4 + 512);
 155        }
 156
 157        kernel_fpu_end();
 158}
 159
 160static struct xor_block_template xor_block_avx = {
 161        .name = "avx",
 162        .do_2 = xor_avx_2,
 163        .do_3 = xor_avx_3,
 164        .do_4 = xor_avx_4,
 165        .do_5 = xor_avx_5,
 166};
 167
 168#define AVX_XOR_SPEED \
 169do { \
 170        if (cpu_has_avx) \
 171                xor_speed(&xor_block_avx); \
 172} while (0)
 173
 174#define AVX_SELECT(FASTEST) \
 175        (cpu_has_avx ? &xor_block_avx : FASTEST)
 176
 177#else
 178
 179#define AVX_XOR_SPEED {}
 180
 181#define AVX_SELECT(FASTEST) (FASTEST)
 182
 183#endif
 184#endif
 185