// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

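/*
 * Each routine below XORs one or more source buffers into p1 in place,
 * working on four 128-bit NEON vectors per loop iteration, i.e. 64
 * bytes at a time.  The vld1q_u64()/vst1q_u64() offsets are counted in
 * uint64_t elements, so "+ 2" steps by one 16-byte vector.  The code
 * assumes 'bytes' is a non-zero multiple of 64 (in practice whole
 * pages), and that the caller has made NEON usable in kernel context,
 * typically by bracketing the call with kernel_neon_begin()/
 * kernel_neon_end() as the arm64 <asm/xor.h> glue does.
 */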
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

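/* p1[] ^= p2[] ^ p3[]; same 64-bytes-per-iteration structure as above. */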
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

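/* p1[] ^= p2[] ^ p3[] ^ p4[]; 64 bytes per iteration. */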
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

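/* p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[]; 64 bytes per iteration. */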
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

		/* store */
		vst1q_u64(dp1 +  0, v0);
		vst1q_u64(dp1 +  2, v1);
		vst1q_u64(dp1 +  4, v2);
		vst1q_u64(dp1 +  6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

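/*
 * Operations table handed to the generic XOR code.  The arm64
 * <asm/xor.h> glue wraps these routines in kernel_neon_begin()/
 * kernel_neon_end() and offers the result for boot-time benchmarking,
 * so xor_blocks() (RAID456 parity and similar users) can pick the
 * fastest implementation available.
 */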
struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");

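/*
 * A minimal sketch of how the <asm/xor.h> glue is expected to expose
 * these helpers (illustrative only; the names and exact wiring live in
 * that header, not here):
 *
 *	static void xor_neon_2(unsigned long bytes, unsigned long *p1,
 *			       unsigned long *p2)
 *	{
 *		kernel_neon_begin();
 *		xor_block_inner_neon.do_2(bytes, p1, p2);
 *		kernel_neon_end();
 *	}
 *
 * with XOR_TRY_TEMPLATES only advertising the NEON template when
 * cpu_has_neon() reports NEON support.
 */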