1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
27#include "fpu/softfloat.h"
28#include "tcg/tcg.h"
29#include "vec_internal.h"
30
31
32
33
34
35
36
37
38
39
/*
 * Initial accumulator for the PredTest NZCV computation below:
 * C (bit 0) starts set; the "first active word seen" latch (bit 2)
 * starts clear.
 */
#define PREDTEST_INIT  1


/*
 * One step of the Arm PredTest pseudofunction, walking forward through
 * the predicate words.  D is a word of Pd, G the matching word of Pg,
 * FLAGS the accumulator (seeded with PREDTEST_INIT).
 *
 * Result encoding: bit 31 = N, bit 1 set when Z is clear, bit 0 = C.
 * Bit 2 is internal state: it latches that a nonzero guard word has
 * been seen, so that N is taken only from the first one.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from the first D & G; only once (bit-2 latch). */
        if (!(flags & 4)) {
            /* g & -g isolates the lowest (first) active predicate bit. */
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z: any active true element clears Z. */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from the last D & G so far; replaces any previous
         * value — pow2floor(g) isolates the highest active bit. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
63
64
65
66
/*
 * As iter_predtest_fwd, but walking the predicate words backward
 * (highest word first).  The "first" nonzero guard word seen is thus
 * the architecturally *last* one, and vice versa.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from the first (i.e. architecturally last) !(D & G);
         * only once, using bit 2 as the "seen" latch. */
        if (!(flags & 4)) {
            /* +4 sets the latch, -1 removes the C seeded by PREDTEST_INIT. */
            flags += 4 - 1;
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z: any active true element clears Z. */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from the last (i.e. architecturally first) D & G;
         * replaces any previous value. */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
85
86
/* PTEST over a single predicate word: one iteration from the seed. */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
91
92
/*
 * PTEST over a full predicate: fold iter_predtest_fwd over each 64-bit
 * word of Pd/Pg.  The do/while relies on the caller passing words >= 1.
 */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
105
106
107
108
109
/*
 * Expand one byte of predicate bits into a mask of 8-bit elements:
 * predicate bit i becomes 0xff in byte lane i.  The 256-entry table
 * is shared via vec_internal.h (included above).
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    return expand_pred_b_data[byte];
}
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
 * Expand a predicate byte into a mask of 16-bit elements: predicate
 * bit 2*i (the low bit of each element's byte pair) selects 0xffff in
 * lane i.  Bits 1, 3, 5, 7 are ignored.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    uint64_t mask = 0;
    int lane;

    for (lane = 0; lane < 4; ++lane) {
        if (byte & (1u << (lane * 2))) {
            mask |= 0xffffull << (lane * 16);
        }
    }
    return mask;
}
143
144
/*
 * Expand a predicate byte into a mask of 32-bit elements: predicate
 * bits 0 and 4 select 0xffffffff in lanes 0 and 1; other bits ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
154
/*
 * Expand a predicate-logical helper: apply FUNC to whole 64-bit words
 * of the Pn, Pm and Pg operands.  No per-element handling is needed
 * since predicate logicals operate bitwise under the governing mask.
 */
#define LOGICAL_PPPP(NAME, FUNC)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    uintptr_t opr_sz = simd_oprsz(desc);                                 \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                         \
    uintptr_t i;                                                         \
    for (i = 0; i < opr_sz / 8; ++i) {                                   \
        d[i] = FUNC(n[i], m[i], g[i]);                                   \
    }                                                                    \
}
165
/*
 * Predicate logical operations.  All but SEL clear the bits outside
 * the governing predicate G; SEL merges N (where G) with M (where !G).
 */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
193
194
195
196
197
198
199
200
201
/*
 * Fully general two-operand predicated expander for 8/16/32-bit
 * element types.  The predicate holds one bit per vector *byte*; the
 * bit governing an element is the lsb of the element's first byte
 * (hence pg >>= sizeof(TYPE) per element).  The vector is processed
 * in 16-byte chunks so one 16-bit predicate load covers each chunk;
 * H performs the host-endian address fixup for sub-64-bit accesses.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                TYPE mm = *(TYPE *)(vm + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
}
218
219
/*
 * As DO_ZPZZ, specialized for 64-bit elements: one predicate byte per
 * element, and no host-endian fixup is needed for full 64-bit loads.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; i += 1) {                                    \
        if (pg[H1(i)] & 1) {                                             \
            TYPE nn = n[i], mm = m[i];                                   \
            d[i] = OP(nn, mm);                                           \
        }                                                                \
    }                                                                    \
}
233
/* Elementary integer operations for the predicated expanders. */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * SVE division: divide-by-zero yields 0, and M == -1 is special-cased
 * so that the overflowing INT_MIN / -1 case (undefined in C) produces
 * -N, the architecturally wrapped result.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signedness of max/min/abd comes from the element TYPE. */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
315
316
317
/*
 * High-half multiply for 8/16/32-bit elements.  The parameter types
 * are wide enough that the product is exact; the caller's element
 * TYPE (signed or unsigned) performs the widening, so a single helper
 * serves both smulh and umulh at each width.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;

    return prod >> 32;
}
332
/*
 * 64-bit high-half multiplies via the host-utils 128-bit primitives
 * (these remain correct on 32-bit hosts where __int128 is unavailable).
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
346
/* Predicated multiply, high-half multiply and divide. */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/* smulh/umulh share do_mulh_* below 64 bits; TYPE supplies the sign. */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* SVE divide exists only for 32- and 64-bit elements. */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
367
368
369
370#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
371#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
372#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
373
374DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
375DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
376DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
377
378DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
379DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
380DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
381
382DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
383DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
384DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
385
386DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
387DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
388DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
389
/*
 * Signed add-and-accumulate-long pairwise: sign-extend the low and
 * high halves of each element of N and add both into the accumulator M.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    return m + (int8_t)n + (int8_t)(n >> 8);
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    return m + (int16_t)n + (int16_t)(n >> 16);
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    return m + (int32_t)n + (int32_t)(n >> 32);
}
407
/* SVE2 SADALP: widths start at halfword (pairs of bytes). */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
411
/*
 * Unsigned add-and-accumulate-long pairwise: zero-extend the low and
 * high halves of each element of N and add both into the accumulator M.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    return m + (uint8_t)n + (uint8_t)(n >> 8);
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    return m + (uint16_t)n + (uint16_t)(n >> 16);
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    return m + (uint32_t)n + (uint32_t)(n >> 32);
}
429
/* SVE2 UADALP. */
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
433
/*
 * SVE2 rounding shifts, via the vec_internal.h saturating-shift
 * primitives with rounding enabled and a NULL saturation pointer
 * (no saturation tracking).
 */
#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/* The casts narrow the (unsigned) element to a signed shift count
 * of the element width, as the bhs primitive expects. */
#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
453
454
455
456
457
458
459
460#define do_sqshl_b(n, m) \
461 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
462#define do_sqshl_h(n, m) \
463 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
464#define do_sqshl_s(n, m) \
465 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
466#define do_sqshl_d(n, m) \
467 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
468
469DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
470DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
471DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
472DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
473
474#define do_uqshl_b(n, m) \
475 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
476#define do_uqshl_h(n, m) \
477 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
478#define do_uqshl_s(n, m) \
479 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
480#define do_uqshl_d(n, m) \
481 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
482
483DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
484DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
485DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
486DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
487
488#define do_sqrshl_b(n, m) \
489 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
490#define do_sqrshl_h(n, m) \
491 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
492#define do_sqrshl_s(n, m) \
493 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
494#define do_sqrshl_d(n, m) \
495 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
496
497DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
498DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
499DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
500DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
501
502#undef do_sqrshl_d
503
504#define do_uqrshl_b(n, m) \
505 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
506#define do_uqrshl_h(n, m) \
507 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
508#define do_uqrshl_s(n, m) \
509 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
510#define do_uqrshl_d(n, m) \
511 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
512
513DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
514DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
515DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
516DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
517
518#undef do_uqrshl_d
519
/*
 * Halving add/sub.  Below 64 bits the sum is formed exactly in 64-bit
 * arithmetic and then halved; for 64-bit elements we halve each operand
 * first and correct with the carry (or borrow) bit that the pre-shift
 * discards.  RHADD rounds the result up rather than truncating.
 */
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/* The (~n & m & 1) term supplies the borrow lost by the pre-shifts. */
#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
558
/*
 * Clamp VAL into [MIN, MAX].  The 64-bit intermediate holds any sum or
 * difference of narrower operands exactly; the int32_t return narrows
 * as needed (unsigned 32-bit maxima round-trip through the implicit
 * conversion at the caller's element type).
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
563
/*
 * Saturating signed addition.  The narrower widths clamp an exact
 * 64-bit sum; 64-bit elements need an explicit overflow test.
 */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    /* Form the sum in unsigned arithmetic so the overflowing case does
     * not rely on signed wrap-around (undefined without -fwrapv). */
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);

    /* Overflow iff the operands agree in sign and the result does not. */
    if (((r ^ n) & ~(n ^ m)) < 0) {
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
577
/* SVE2 SQADD (per-element saturating signed add). */
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

/* Saturating unsigned addition: clamp an exact 64-bit sum to [0, MAX]. */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
586
/* Saturating unsigned 64-bit addition: wrap-around means overflow. */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    if (sum < n) {
        /* Wrapped: saturate to all-ones. */
        return UINT64_MAX;
    }
    return sum;
}
592
/* SVE2 UQADD. */
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

/* Saturating signed subtraction: clamp an exact 64-bit difference. */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
601
/* Saturating signed 64-bit subtraction. */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    /* Form the difference in unsigned arithmetic so the overflowing
     * case does not rely on signed wrap-around (UB without -fwrapv). */
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);

    /* Overflow iff the operands differ in sign and the result has the
     * sign of m rather than n. */
    if (((r ^ n) & (n ^ m)) < 0) {
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
611
/* SVE2 SQSUB. */
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

/* Saturating unsigned subtraction: clamp the difference at zero. */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
620
/* Saturating unsigned 64-bit subtraction: floor at zero. */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
625
/* SVE2 UQSUB. */
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

/*
 * SUQADD: signed n plus unsigned m, saturated to the signed range.
 * The cast re-signs the element (the instantiations use unsigned TYPE).
 */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
637
/*
 * SUQADD for 64 bits: signed n plus unsigned m, saturated to INT64_MAX
 * (saturation towards INT64_MIN is impossible since m >= 0).
 * r is computed modulo 2^64; r > INT64_MAX tests the sign bit.
 */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative; fall through and return r. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
660
/* SVE2 SUQADD. */
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

/*
 * USQADD: unsigned n plus signed m, saturated to the unsigned range.
 * The cast re-signs the addend element.
 */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
672
/*
 * USQADD for 64 bits: unsigned n plus signed m, saturated to
 * [0, UINT64_MAX].
 */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + (uint64_t)m;   /* modulo-2^64 sum */

    if (m < 0) {
        /* Negate in the unsigned domain so that m == INT64_MIN is
         * well defined (signed -m would overflow). */
        return n < -(uint64_t)m ? 0 : r;
    }
    /* Adding a non-negative value: wrap-around means overflow. */
    return r < n ? UINT64_MAX : r;
}
682
/* SVE2 USQADD. */
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
690
691
692
693
694
695
696
/*
 * Predicated pairwise expander: result element 2k is OP over the k-th
 * adjacent pair of vn, element 2k+1 is OP over the k-th pair of vm.
 * All four inputs are loaded before either store, so that vd may
 * alias vn or vm.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            TYPE n0 = *(TYPE *)(vn + H(i));                              \
            TYPE m0 = *(TYPE *)(vm + H(i));                              \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));               \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));               \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
            if (pg & 1) {                                                \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
}
719
720
/*
 * As DO_ZPZZ_PAIR, specialized for 64-bit elements (one predicate byte
 * per element).  Inputs are loaded before the stores for the same
 * aliasing reason.
 */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                      \
    uint8_t *pg = vg;                                                    \
    for (i = 0; i < opr_sz; i += 2) {                                    \
        TYPE n0 = n[i], n1 = n[i + 1];                                   \
        TYPE m0 = m[i], m1 = m[i + 1];                                   \
        if (pg[H1(i)] & 1) {                                             \
            d[i] = OP(n0, n1);                                           \
        }                                                                \
        if (pg[H1(i + 1)] & 1) {                                         \
            d[i + 1] = OP(m0, m1);                                       \
        }                                                                \
    }                                                                    \
}
738
/* SVE2 integer pairwise add / max / min. */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
766
/*
 * Floating-point pairwise expander: as DO_ZPZZ_PAIR, with the softfloat
 * status pointer threaded through to OP.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPE n0 = *(TYPE *)(vn + H(i));                             \
            TYPE m0 = *(TYPE *)(vm + H(i));                             \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}
790
/* SVE2 floating-point pairwise operations, via softfloat. */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP
812
813
814
815
816
/*
 * Predicated expander with a "wide" third operand: a single 64-bit
 * value (e.g. a shift count) loaded once per 8-byte group of vm and
 * applied to each of the narrower elements in that group; hence the
 * inner loop covers 8 bytes (i & 7) and one predicate byte suffices.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc);                               \
    for (i = 0; i < opr_sz; ) {                                          \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                      \
        TYPEW mm = *(TYPEW *)(vm + i);                                   \
        do {                                                             \
            if (pg & 1) {                                                \
                TYPE nn = *(TYPE *)(vn + H(i));                          \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                       \
            }                                                            \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 7);                                                 \
    }                                                                    \
}
833
/* Predicated shifts by a wide (64-bit) count per 8-byte group. */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
847
848
849
/*
 * Fully general predicated unary expander, for 8/16/32-bit elements;
 * same predicate layout and chunking as DO_ZPZZ.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
865
866
/* As DO_ZPZ, specialized for 64-bit elements. */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
880
/* Count leading sign bits: adjust the 32-bit primitive for narrow types. */
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, likewise adjusted for narrow types. */
#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element. */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical not: 1 if the element is zero, else 0. */
#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* FP absolute value: clear the sign (top) bit of the raw encoding. */
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* FP negate: flip the sign (top) bit of the raw encoding. */
#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* Bitwise not. */
#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension of the low part of each element. */
#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
948
/* Absolute value; abs(MIN) wraps back to MIN (non-saturating form). */
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

/* Negate (modulo element width; TYPE is unsigned). */
#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Byte reversal within each element. */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

/* Halfword reversal within each element. */
DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

/* Word reversal within each 64-bit element. */
DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

/* Bit reversal within each element. */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
976
/*
 * Saturating absolute value: min_ is the most negative value of the
 * type, and for that input the expression -min_ - 1 wraps to the
 * maximum, i.e. abs(MIN) saturates to MAX.
 */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/* Saturating negate: neg(MIN) likewise saturates to MAX. */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* Unsigned reciprocal / reciprocal-sqrt estimate, via shared helpers. */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
997
998
999
/*
 * Unpredicated expander with a wide third operand: one 64-bit value
 * per 8-byte group of vm, applied to each narrower element in the
 * group (inner loop covers 8 bytes).
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}
1013
/* Unpredicated shifts by a wide (64-bit) count per 8-byte group. */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1039
1040
1041
1042
1043
/*
 * SVE2 widening three-operand expander.  For each wide result element,
 * a narrow element of each source is selected by desc bits 0 and 1
 * (sel1/sel2, pre-scaled to a byte offset): 0 picks the even (bottom)
 * narrow element, sizeof(TYPEN) the odd (top) one.  The narrow value
 * is widened to TYPEW before OP.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
1056
1057DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1058DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1059DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1060
1061DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1062DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1063DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1064
1065DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1066DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1067DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1068
1069DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1070DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1071DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1072
1073DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1074DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1075DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1076
1077DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1078DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1079DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1080
1081DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1082DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1083DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1084
1085DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1086DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1087DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1088
1089
1090static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1091{
1092 int16_t val = n * m;
1093 return DO_SQADD_H(val, val);
1094}
1095
1096static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1097{
1098 int32_t val = n * m;
1099 return DO_SQADD_S(val, val);
1100}
1101
1102static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1103{
1104 int64_t val = n * m;
1105 return do_sqadd_d(val, val);
1106}
1107
1108DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1109DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1110DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1111
1112#undef DO_ZZZ_TB
1113
1114#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1115void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1116{ \
1117 intptr_t i, opr_sz = simd_oprsz(desc); \
1118 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1119 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1120 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1121 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1122 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1123 } \
1124}
1125
1126DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1127DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1128DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1129
1130DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1131DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1132DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1133
1134DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1135DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1136DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1137
1138DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1139DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1140DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1141
1142#undef DO_ZZZ_WTB
1143
1144#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1145void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1146{ \
1147 intptr_t i, opr_sz = simd_oprsz(desc); \
1148 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1149 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1150 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1151 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1152 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1153 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1154 } \
1155}
1156
1157DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1158DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1159DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1160DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1161
1162#undef DO_ZZZ_NTB
1163
1164#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1165void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1166{ \
1167 intptr_t i, opr_sz = simd_oprsz(desc); \
1168 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1169 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1170 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1171 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1172 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1173 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1174 } \
1175}
1176
1177DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1178DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1179DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1180
1181DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1182DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1183DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1184
1185DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1186DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1187DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1188
1189DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1190DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1191DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1192
1193#define DO_NMUL(N, M) -(N * M)
1194
1195DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1196DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1197DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1198
1199DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1200DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1201DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1202
1203#undef DO_ZZZW_ACC
1204
1205#define DO_XTNB(NAME, TYPE, OP) \
1206void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1207{ \
1208 intptr_t i, opr_sz = simd_oprsz(desc); \
1209 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1210 TYPE nn = *(TYPE *)(vn + i); \
1211 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1212 *(TYPE *)(vd + i) = nn; \
1213 } \
1214}
1215
1216#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1217void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1218{ \
1219 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1220 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1221 TYPE nn = *(TYPE *)(vn + i); \
1222 *(TYPEN *)(vd + i + odd) = OP(nn); \
1223 } \
1224}
1225
1226#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1227#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1228#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1229
1230DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1231DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1232DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1233
1234DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1235DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1236DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1237
1238#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1239#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1240#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1241
1242DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1243DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1244DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1245
1246DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1247DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1248DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1249
1250DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1251DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1252DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1253
1254DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1255DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1256DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1257
1258#undef DO_XTNB
1259#undef DO_XTNT
1260
1261void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1262{
1263 intptr_t i, opr_sz = simd_oprsz(desc);
1264 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1265 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1266 uint32_t *a = va, *n = vn;
1267 uint64_t *d = vd, *m = vm;
1268
1269 for (i = 0; i < opr_sz / 8; ++i) {
1270 uint32_t e1 = a[2 * i + H4(0)];
1271 uint32_t e2 = n[2 * i + sel] ^ inv;
1272 uint64_t c = extract64(m[i], 32, 1);
1273
1274 d[i] = c + e1 + e2;
1275 }
1276}
1277
1278void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1279{
1280 intptr_t i, opr_sz = simd_oprsz(desc);
1281 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1282 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1283 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1284
1285 for (i = 0; i < opr_sz / 8; i += 2) {
1286 Int128 e1 = int128_make64(a[i]);
1287 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1288 Int128 c = int128_make64(m[i + 1] & 1);
1289 Int128 r = int128_add(int128_add(e1, e2), c);
1290 d[i + 0] = int128_getlo(r);
1291 d[i + 1] = int128_gethi(r);
1292 }
1293}
1294
1295#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1296void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1297{ \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1300 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1301 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1302 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1303 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1304 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1305 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1306 } \
1307}
1308
1309DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1310 do_sqdmull_h, DO_SQADD_H)
1311DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1312 do_sqdmull_s, DO_SQADD_S)
1313DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1314 do_sqdmull_d, do_sqadd_d)
1315
1316DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1317 do_sqdmull_h, DO_SQSUB_H)
1318DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1319 do_sqdmull_s, DO_SQSUB_S)
1320DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1321 do_sqdmull_d, do_sqsub_d)
1322
1323#undef DO_SQDMLAL
1324
1325#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1326void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1327{ \
1328 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1329 int rot = simd_data(desc); \
1330 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1331 bool sub_r = rot == 1 || rot == 2; \
1332 bool sub_i = rot >= 2; \
1333 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1334 for (i = 0; i < opr_sz; i += 2) { \
1335 TYPE elt1_a = n[H(i + sel_a)]; \
1336 TYPE elt2_a = m[H(i + sel_a)]; \
1337 TYPE elt2_b = m[H(i + sel_b)]; \
1338 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1339 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1340 } \
1341}
1342
1343#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1344
1345DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1346DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1347DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1348DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1349
1350#define DO_SQRDMLAH_B(N, M, A, S) \
1351 do_sqrdmlah_b(N, M, A, S, true)
1352#define DO_SQRDMLAH_H(N, M, A, S) \
1353 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1354#define DO_SQRDMLAH_S(N, M, A, S) \
1355 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1356#define DO_SQRDMLAH_D(N, M, A, S) \
1357 do_sqrdmlah_d(N, M, A, S, true)
1358
1359DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1360DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1361DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1362DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1363
1364#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1365void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1366{ \
1367 intptr_t i, j, oprsz = simd_oprsz(desc); \
1368 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1369 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1370 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1371 bool sub_r = rot == 1 || rot == 2; \
1372 bool sub_i = rot >= 2; \
1373 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1374 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1375 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1376 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1377 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1378 TYPE elt1_a = n[H(i + j + sel_a)]; \
1379 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1380 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1381 } \
1382 } \
1383}
1384
1385DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1386DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1387
1388DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1389DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1390
1391#undef DO_CMLA
1392#undef DO_CMLA_FUNC
1393#undef DO_CMLA_IDX_FUNC
1394#undef DO_SQRDMLAH_B
1395#undef DO_SQRDMLAH_H
1396#undef DO_SQRDMLAH_S
1397#undef DO_SQRDMLAH_D
1398
1399
1400static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1401 int sel_a, int sel_b, int sub_i)
1402{
1403 for (int i = 0; i <= 1; i++) {
1404 int32_t elt1_r = (int8_t)(n >> (16 * i));
1405 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1406 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1407 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1408
1409 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1410 }
1411 return a;
1412}
1413
1414static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1415 int sel_a, int sel_b, int sub_i)
1416{
1417 for (int i = 0; i <= 1; i++) {
1418 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1419 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1420 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1421 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1422
1423 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1424 }
1425 return a;
1426}
1427
1428void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1429 void *va, uint32_t desc)
1430{
1431 int opr_sz = simd_oprsz(desc);
1432 int rot = simd_data(desc);
1433 int sel_a = rot & 1;
1434 int sel_b = sel_a ^ 1;
1435 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1436 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1437
1438 for (int e = 0; e < opr_sz / 4; e++) {
1439 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1440 }
1441}
1442
1443void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1444 void *va, uint32_t desc)
1445{
1446 int opr_sz = simd_oprsz(desc);
1447 int rot = simd_data(desc);
1448 int sel_a = rot & 1;
1449 int sel_b = sel_a ^ 1;
1450 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1451 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1452
1453 for (int e = 0; e < opr_sz / 8; e++) {
1454 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1455 }
1456}
1457
1458void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1459 void *va, uint32_t desc)
1460{
1461 int opr_sz = simd_oprsz(desc);
1462 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1464 int sel_a = rot & 1;
1465 int sel_b = sel_a ^ 1;
1466 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1468
1469 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1470 uint32_t seg_m = m[seg + idx];
1471 for (int e = 0; e < 4; e++) {
1472 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1473 sel_a, sel_b, sub_i);
1474 }
1475 }
1476}
1477
1478void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1479 void *va, uint32_t desc)
1480{
1481 int seg, opr_sz = simd_oprsz(desc);
1482 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1483 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1484 int sel_a = rot & 1;
1485 int sel_b = sel_a ^ 1;
1486 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1487 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1488
1489 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1490 uint64_t seg_m = m[seg + idx];
1491 for (int e = 0; e < 2; e++) {
1492 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1493 sel_a, sel_b, sub_i);
1494 }
1495 }
1496}
1497
1498#define DO_ZZXZ(NAME, TYPE, H, OP) \
1499void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1500{ \
1501 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1502 intptr_t i, j, idx = simd_data(desc); \
1503 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1504 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1505 TYPE mm = m[i]; \
1506 for (j = 0; j < segment; j++) { \
1507 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1508 } \
1509 } \
1510}
1511
1512#define DO_SQRDMLAH_H(N, M, A) \
1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1514#define DO_SQRDMLAH_S(N, M, A) \
1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1516#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1517
1518DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1519DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1520DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1521
1522#define DO_SQRDMLSH_H(N, M, A) \
1523 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1524#define DO_SQRDMLSH_S(N, M, A) \
1525 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1526#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1527
1528DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1529DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1530DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1531
1532#undef DO_ZZXZ
1533
1534#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1535void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1536{ \
1537 intptr_t i, j, oprsz = simd_oprsz(desc); \
1538 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1539 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1540 for (i = 0; i < oprsz; i += 16) { \
1541 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1542 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1543 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1544 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1545 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1546 } \
1547 } \
1548}
1549
1550#define DO_MLA(N, M, A) (A + N * M)
1551
1552DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1553DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1554DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1555DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1556
1557#define DO_MLS(N, M, A) (A - N * M)
1558
1559DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1560DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1561DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1562DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1563
1564#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1565#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1566
1567DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1568DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1569
1570#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1571#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1572
1573DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1574DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1575
1576#undef DO_MLA
1577#undef DO_MLS
1578#undef DO_ZZXW
1579
1580#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1581void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1582{ \
1583 intptr_t i, j, oprsz = simd_oprsz(desc); \
1584 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1585 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1586 for (i = 0; i < oprsz; i += 16) { \
1587 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1588 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1589 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1590 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1591 } \
1592 } \
1593}
1594
1595DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1596DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1597
1598DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1599DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1600
1601DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1602DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1603
1604#undef DO_ZZX
1605
1606#define DO_BITPERM(NAME, TYPE, OP) \
1607void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1608{ \
1609 intptr_t i, opr_sz = simd_oprsz(desc); \
1610 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1611 TYPE nn = *(TYPE *)(vn + i); \
1612 TYPE mm = *(TYPE *)(vm + i); \
1613 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1614 } \
1615}
1616
1617static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1618{
1619 uint64_t res = 0;
1620 int db, rb = 0;
1621
1622 for (db = 0; db < n; ++db) {
1623 if ((mask >> db) & 1) {
1624 res |= ((data >> db) & 1) << rb;
1625 ++rb;
1626 }
1627 }
1628 return res;
1629}
1630
1631DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1632DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1633DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1634DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1635
1636static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1637{
1638 uint64_t res = 0;
1639 int rb, db = 0;
1640
1641 for (rb = 0; rb < n; ++rb) {
1642 if ((mask >> rb) & 1) {
1643 res |= ((data >> db) & 1) << rb;
1644 ++db;
1645 }
1646 }
1647 return res;
1648}
1649
1650DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1651DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1652DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1653DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1654
1655static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1656{
1657 uint64_t resm = 0, resu = 0;
1658 int db, rbm = 0, rbu = 0;
1659
1660 for (db = 0; db < n; ++db) {
1661 uint64_t val = (data >> db) & 1;
1662 if ((mask >> db) & 1) {
1663 resm |= val << rbm++;
1664 } else {
1665 resu |= val << rbu++;
1666 }
1667 }
1668
1669 return resm | (resu << rbm);
1670}
1671
1672DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1673DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1674DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1675DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1676
1677#undef DO_BITPERM
1678
1679#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1680void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1681{ \
1682 intptr_t i, opr_sz = simd_oprsz(desc); \
1683 int sub_r = simd_data(desc); \
1684 if (sub_r) { \
1685 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1686 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1687 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1688 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1689 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1690 acc_r = ADD_OP(acc_r, el2_i); \
1691 acc_i = SUB_OP(acc_i, el2_r); \
1692 *(TYPE *)(vd + H(i)) = acc_r; \
1693 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1694 } \
1695 } else { \
1696 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1697 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1698 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1699 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1700 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1701 acc_r = SUB_OP(acc_r, el2_i); \
1702 acc_i = ADD_OP(acc_i, el2_r); \
1703 *(TYPE *)(vd + H(i)) = acc_r; \
1704 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1705 } \
1706 } \
1707}
1708
1709DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1710DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1711DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1712DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1713
1714DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1715DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1716DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1717DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1718
1719#undef DO_CADD
1720
1721#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1722void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1723{ \
1724 intptr_t i, opr_sz = simd_oprsz(desc); \
1725 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1726 int shift = simd_data(desc) >> 1; \
1727 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1728 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1729 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1730 } \
1731}
1732
1733DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1734DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1735DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1736
1737DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1738DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1739DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1740
1741#undef DO_ZZI_SHLL
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1753uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1754{ \
1755 intptr_t i, opr_sz = simd_oprsz(desc); \
1756 TYPERED ret = INIT; \
1757 for (i = 0; i < opr_sz; ) { \
1758 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1759 do { \
1760 if (pg & 1) { \
1761 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1762 ret = OP(ret, nn); \
1763 } \
1764 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1765 } while (i & 15); \
1766 } \
1767 return (TYPERET)ret; \
1768}
1769
1770#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1771uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1772{ \
1773 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1774 TYPEE *n = vn; \
1775 uint8_t *pg = vg; \
1776 TYPER ret = INIT; \
1777 for (i = 0; i < opr_sz; i += 1) { \
1778 if (pg[H1(i)] & 1) { \
1779 TYPEE nn = n[i]; \
1780 ret = OP(ret, nn); \
1781 } \
1782 } \
1783 return ret; \
1784}
1785
1786DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1787DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1788DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1789DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1790
1791DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1792DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1793DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1794DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1795
1796DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1797DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1798DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1799DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1800
1801DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1802DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1803DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1804
1805DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1806DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1807DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1808DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1809
1810DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1811DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1812DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1813DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1814
1815DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1816DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1817DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1818DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1819
1820DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1821DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1822DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1823DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1824
1825DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1826DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1827DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1828DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1829
1830#undef DO_VPZ
1831#undef DO_VPZ_D
1832
1833
1834#define DO_ZZI(NAME, TYPE, OP) \
1835void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1836{ \
1837 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1838 TYPE s = s64, *d = vd, *n = vn; \
1839 for (i = 0; i < opr_sz; ++i) { \
1840 d[i] = OP(n[i], s); \
1841 } \
1842}
1843
1844#define DO_SUBR(X, Y) (Y - X)
1845
1846DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1847DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1848DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1849DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1850
1851DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1852DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1853DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1854DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1855
1856DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1857DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1858DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1859DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1860
1861DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1862DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1863DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1864DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1865
1866DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1867DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1868DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1869DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1870
1871#undef DO_ZZI
1872
1873#undef DO_AND
1874#undef DO_ORR
1875#undef DO_EOR
1876#undef DO_BIC
1877#undef DO_ADD
1878#undef DO_SUB
1879#undef DO_MAX
1880#undef DO_MIN
1881#undef DO_ABD
1882#undef DO_MUL
1883#undef DO_DIV
1884#undef DO_ASR
1885#undef DO_LSR
1886#undef DO_LSL
1887#undef DO_SUBR
1888
1889
1890
1891
1892static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1893{
1894 uint64_t mask = pred_esz_masks[esz];
1895 intptr_t i = words;
1896
1897 do {
1898 uint64_t this_g = g[--i] & mask;
1899 if (this_g) {
1900 return i * 64 + (63 - clz64(this_g));
1901 }
1902 } while (i > 0);
1903 return (intptr_t)-1 << esz;
1904}
1905
/* PFIRST: set the first active element of D (per governing predicate G)
 * and compute the NZCV flags for the result.  Flag bit 2 ("C set") in
 * the running value doubles as the "first active bit already seen"
 * marker, matching iter_predtest_fwd.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
1929
/* PNEXT: advance D to hold only the next active element of G beyond
 * the last element currently set in D, then recompute the flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Candidate start: one element past the last active element of D.
     * last_active_element returns -(1 << esz) when D is empty, so this
     * correctly yields bit index 0 in that case.
     */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Scan forward from NEXT for the next bit set in G (restricted to
     * bits valid for this element size).
     */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            /* Mask off bits below NEXT within its first word.  */
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write back D with (at most) the single bit found, folding the
     * predicate test flags as we go.  If the scan ran off the end,
     * NEXT >= words * 64 and no bit is written, leaving D all zero.
     */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
1973
1974
1975
1976
1977
1978void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1979{
1980 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1981 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1982 uint64_t *d = vd, *n = vn;
1983 uint8_t *pg = vg;
1984
1985 for (i = 0; i < opr_sz; i += 1) {
1986 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1987 }
1988}
1989
/* As sve_movz_b, for halfword elements.  */
void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    /* inv == all-ones when descriptor bit 0 set: invert the selection.  */
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
    }
}
2001
/* As sve_movz_b, for word elements.  */
void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    /* inv == all-ones when descriptor bit 0 set: invert the selection.  */
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
    }
}
2013
2014void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2015{
2016 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2017 uint64_t *d = vd, *n = vn;
2018 uint8_t *pg = vg;
2019 uint8_t inv = simd_data(desc);
2020
2021 for (i = 0; i < opr_sz; i += 1) {
2022 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2023 }
2024}
2025
2026
2027
/* Two-operand predicated expander, vector vs immediate: for each
 * active element, D = OP(N, imm).  Inactive elements are unchanged.
 * The predicate is consumed 16 bytes of vector (one uint16_t of
 * predicate bits) at a time.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
2044
2045
/* As DO_ZPZI, specialized for 64-bit elements: each element is
 * governed by bit 0 of the corresponding predicate byte.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}
2060
#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds the quotient toward
 * zero: for negative N, a bias of (1 << M) - 1 is added before shifting.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2068
/* Unsigned rounding shift right: shift X right by SH and round to
 * nearest by adding back the last bit shifted out.  For sh == 64 only
 * the rounding bit (bit 63) survives; for sh > 64 the result is 0.
 * NOTE(review): sh == 0 would shift by (unsigned)-1 below — callers are
 * assumed to pass sh >= 1.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        return x >> 63;
    } else {
        return 0;
    }
}
2079
/* Signed rounding shift right, as do_urshr.  For sh >= 64, rounding
 * the replicated sign bit always produces zero.  Same sh >= 1
 * assumption as do_urshr.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
}
2089
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 predicated saturating/rounding shifts by immediate.  */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/* Unsigned-saturating left shift of a signed value (SQSHLU): wrappers
 * discarding the saturation flag from the shared do_suqrshl_* helpers.
 */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2148
/* Narrowing shift to the bottom (even) elements.  The (TYPEN) cast
 * truncates the shifted result, and storing it back as TYPEW zeroes
 * the top (odd) half of each wide element.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)             \
{                                                                \
    intptr_t i, opr_sz = simd_oprsz(desc);                       \
    int shift = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                \
        TYPEW nn = *(TYPEW *)(vn + i);                           \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);               \
    }                                                            \
}
2159
/* Narrowing shift to the top (odd) elements: the narrow result is
 * stored into the high half of each wide element, leaving the bottom
 * elements of Zd untouched.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
2170
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/* Signed shift right, saturating to an unsigned narrow range.
 * For the _D form the shift may be 64, which C cannot express on
 * int64_t, hence the clamp to 63 (same result, as only the sign
 * propagates).
 */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* Signed rounding shift right, saturating to unsigned.  */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/* Signed shift right, saturating to signed.  */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* Signed rounding shift right, saturating to signed.  */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)

/* Unsigned shift right, saturating to unsigned: a simple clamp.  */
#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)

/* Unsigned rounding shift right, saturating to unsigned.  */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
2262
/* Wide binary operation, narrowing to the bottom elements; as with
 * DO_SHRNB, the TYPEN cast plus TYPEW store zeroes the top half.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
{                                                                \
    intptr_t i, opr_sz = simd_oprsz(desc);                       \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                \
        TYPEW nn = *(TYPEW *)(vn + i);                           \
        TYPEW mm = *(TYPEW *)(vm + i);                           \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);           \
    }                                                            \
}
2273
/* Wide binary operation, narrowing to the top elements; the bottom
 * elements of Zd are left untouched.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)         \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)    \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
    }                                                             \
}
2284
2285#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2286#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2287#define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2288#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2289
2290DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2291DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2292DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2293
2294DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2295DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2296DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2297
2298DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2299DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2300DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2301
2302DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2303DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2304DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2305
2306DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2307DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2308DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2309
2310DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2311DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2312DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2313
2314DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2315DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2316DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2317
2318DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2319DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2320DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2321
2322#undef DO_RSUBHN
2323#undef DO_SUBHN
2324#undef DO_RADDHN
2325#undef DO_ADDHN
2326
2327#undef DO_BINOPNB
2328
2329
2330
/* Fully general three-operand predicated expander: for each active
 * element, D = OP(A, N, M); inactive elements are unchanged.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
2349
2350
/* As DO_ZPZZZ, specialized for 64-bit elements (predicate bit 0 of
 * each predicate byte governs the element).
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}
2365
/* Predicated multiply-accumulate and multiply-subtract.  */
#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
2385
2386void HELPER(sve_index_b)(void *vd, uint32_t start,
2387 uint32_t incr, uint32_t desc)
2388{
2389 intptr_t i, opr_sz = simd_oprsz(desc);
2390 uint8_t *d = vd;
2391 for (i = 0; i < opr_sz; i += 1) {
2392 d[H1(i)] = start + i * incr;
2393 }
2394}
2395
/* INDEX: d[i] = start + i * incr, for halfword elements.  */
void HELPER(sve_index_h)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H2(i)] = start + i * incr;
    }
}
2405
/* INDEX: d[i] = start + i * incr, for word elements.  */
void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}
2415
2416void HELPER(sve_index_d)(void *vd, uint64_t start,
2417 uint64_t incr, uint32_t desc)
2418{
2419 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2420 uint64_t *d = vd;
2421 for (i = 0; i < opr_sz; i += 1) {
2422 d[i] = start + i * incr;
2423 }
2424}
2425
/* ADR, 32-bit elements: d[i] = n[i] + (m[i] << sh).  */
void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}
2435
/* ADR, 64-bit elements: d[i] = n[i] + (m[i] << sh).  */
void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}
2445
/* ADR with sign-extended 32-bit index:
 * d[i] = n[i] + (sext32(m[i]) << sh), on 64-bit elements.
 */
void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}
2455
2456void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2457{
2458 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2459 uint64_t sh = simd_data(desc);
2460 uint64_t *d = vd, *n = vn, *m = vm;
2461 for (i = 0; i < opr_sz; i += 1) {
2462 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2463 }
2464}
2465
/* FEXPA, half-precision: look up the fraction bits from the low 5 bits
 * of the input and splice in the next 5 bits as the float16 exponent.
 * The coefficient table presumably matches the Arm FExpA pseudocode
 * constants — confirm against the Arm ARM.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        /* Reassemble: fraction from the table, exponent at bit 10.  */
        d[i] = coeff[idx] | (exp << 10);
    }
}
2485
/* FEXPA, single-precision: fraction from the low 6 bits via the table,
 * exponent from the next 8 bits placed at bit 23.
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}
2517
/* FEXPA, double-precision: fraction from the low 6 bits via the table,
 * exponent from the next 11 bits placed at bit 52.
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
2555
2556void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2557{
2558 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2559 uint16_t *d = vd, *n = vn, *m = vm;
2560 for (i = 0; i < opr_sz; i += 1) {
2561 uint16_t nn = n[i];
2562 uint16_t mm = m[i];
2563 if (mm & 1) {
2564 nn = float16_one;
2565 }
2566 d[i] = nn ^ (mm & 2) << 14;
2567 }
2568}
2569
/* FTSSEL, single-precision: bit 0 of M selects between N and 1.0;
 * bit 1 of M is xor'd into the sign bit of the result.
 */
void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}
2583
/* FTSSEL, double-precision: bit 0 of M selects between N and 1.0;
 * bit 1 of M is xor'd into the sign bit of the result.
 */
void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}
2597
2598
2599
2600
2601
/* Signed saturating addition of the scalar B to each int8 element,
 * via DO_SQADD_B (saturating-add helper declared outside this view).
 */
void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
    }
}
2610
/* As sve_sqaddi_b, for int16 elements.  */
void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
    }
}
2619
/* As sve_sqaddi_b, for int32 elements; B is widened to 64 bits.  */
void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
    }
}
2628
/* As sve_sqaddi_b, for int64 elements via the do_sqadd_d helper.  */
void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
    }
}
2637
2638
2639
2640
2641
/* Unsigned saturating addition of scalar B to each uint8 element.
 * B is signed, so this also implements saturating subtract.
 */
void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
    }
}
2650
/* As sve_uqaddi_b, for uint16 elements.  */
void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
    }
}
2659
/* As sve_uqaddi_b, for uint32 elements; B is widened to 64 bits.  */
void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
    }
}
2668
/* Unsigned saturating add of (unsigned) B to each uint64 element.  */
void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
    }
}
2677
/* Unsigned saturating subtract of B from each uint64 element.  */
void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
    }
}
2686
2687
2688
2689
2690void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2691 uint64_t mm, uint32_t desc)
2692{
2693 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2694 uint64_t *d = vd, *n = vn;
2695 uint8_t *pg = vg;
2696
2697 mm = dup_const(MO_8, mm);
2698 for (i = 0; i < opr_sz; i += 1) {
2699 uint64_t nn = n[i];
2700 uint64_t pp = expand_pred_b(pg[H1(i)]);
2701 d[i] = (mm & pp) | (nn & ~pp);
2702 }
2703}
2704
/* As sve_cpy_m_b, for halfword elements.  */
void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
2719
/* As sve_cpy_m_b, for word elements.  */
void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
2734
/* As sve_cpy_m_b, for doubleword elements; no replication needed.  */
void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}
2747
/* Zeroing predicated copy: active byte elements of D are set to the
 * replicated immediate VAL; inactive elements are zeroed.
 */
void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}
2759
/* As sve_cpy_z_b, for halfword elements.  */
void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}
2771
/* As sve_cpy_z_b, for word elements.  */
void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}
2783
2784void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2785{
2786 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2787 uint64_t *d = vd;
2788 uint8_t *pg = vg;
2789
2790 for (i = 0; i < opr_sz; i += 1) {
2791 d[i] = (pg[H1(i)] & 1 ? val : 0);
2792 }
2793}
2794
2795
2796
2797
/* Copy N bytes from VS to VD within the big-endian-frobbed vector
 * layout.  Big-endian hosts store elements with the H*() byte swizzle,
 * so an unaligned copy must be done in units matching the common
 * alignment of source, destination and length; 8-byte alignment needs
 * no frobbing.  Little-endian hosts always take the plain memmove path.
 * Overlapping regions are handled by choosing the copy direction.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            /* No overlap hazard in this direction: copy forward.  */
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            /* Destination overlaps source from above: copy backward.  */
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
2854
2855
/* Zero N bytes at VD, honouring the same big-endian byte swizzle as
 * swap_memmove.  The common case of n == 0 returns immediately.
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2895
/* EXT: concatenate N (from byte offset n_ofs) with the first n_ofs
 * bytes of M.  The three cases order the copies so that in-place
 * operation (vd aliasing vn and/or vm) does not clobber未read data.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        /* vd may alias vn: read vn first.  */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* vd == vm: move vm's prefix into place before overwriting.  */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm: need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
2916
/* INSR: shift the whole vector up by one element and insert VAL at
 * element 0.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc)  \
{                                                                   \
    intptr_t opr_sz = simd_oprsz(desc);                             \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));     \
    *(TYPE *)(vd + H(0)) = val;                                     \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, H1_8)

#undef DO_INSR
2931
/* REV, byte elements: swap 64-bit words from both ends of the vector,
 * byte-reversing each word to complete the element reversal.
 */
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}
2942
/* REV, halfword elements: as sve_rev_b, using a halfword swap within
 * each 64-bit word.
 */
void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}
2953
2954void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2955{
2956 intptr_t i, j, opr_sz = simd_oprsz(desc);
2957 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2958 uint64_t f = *(uint64_t *)(vn + i);
2959 uint64_t b = *(uint64_t *)(vn + j);
2960 *(uint64_t *)(vd + i) = rol64(b, 32);
2961 *(uint64_t *)(vd + j) = rol64(f, 32);
2962 }
2963}
2964
2965void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2966{
2967 intptr_t i, j, opr_sz = simd_oprsz(desc);
2968 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2969 uint64_t f = *(uint64_t *)(vn + i);
2970 uint64_t b = *(uint64_t *)(vn + j);
2971 *(uint64_t *)(vd + i) = b;
2972 *(uint64_t *)(vd + j) = f;
2973 }
2974}
2975
/* Common signature for the per-element-size TBL/TBX cores below:
   (vd, table0, table1-or-NULL, indexes, oprsz, is_tbx). */
typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2977
2978static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2979 bool is_tbx, tb_impl_fn *fn)
2980{
2981 ARMVectorReg scratch;
2982 uintptr_t oprsz = simd_oprsz(desc);
2983
2984 if (unlikely(vd == vn)) {
2985 vn = memcpy(&scratch, vn, oprsz);
2986 }
2987
2988 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2989}
2990
2991static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2992 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2993{
2994 ARMVectorReg scratch;
2995 uintptr_t oprsz = simd_oprsz(desc);
2996
2997 if (unlikely(vd == vn0)) {
2998 vn0 = memcpy(&scratch, vn0, oprsz);
2999 if (vd == vn1) {
3000 vn1 = vn0;
3001 }
3002 } else if (unlikely(vd == vn1)) {
3003 vn1 = memcpy(&scratch, vn1, oprsz);
3004 }
3005
3006 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3007}
3008
3009#define DO_TB(SUFF, TYPE, H) \
3010static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3011 void *vm, uintptr_t oprsz, bool is_tbx) \
3012{ \
3013 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3014 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3015 for (i = 0; i < nelem; ++i) { \
3016 TYPE index = indexes[H1(i)], val = 0; \
3017 if (index < nelem) { \
3018 val = tbl0[H(index)]; \
3019 } else { \
3020 index -= nelem; \
3021 if (tbl1 && index < nelem) { \
3022 val = tbl1[H(index)]; \
3023 } else if (is_tbx) { \
3024 continue; \
3025 } \
3026 } \
3027 d[H(i)] = val; \
3028 } \
3029} \
3030void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3031{ \
3032 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3033} \
3034void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3035 void *vm, uint32_t desc) \
3036{ \
3037 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3038} \
3039void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3040{ \
3041 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3042}
3043
3044DO_TB(b, uint8_t, H1)
3045DO_TB(h, uint16_t, H2)
3046DO_TB(s, uint32_t, H4)
3047DO_TB(d, uint64_t, H8)
3048
3049#undef DO_TB
3050
/*
 * SUNPK/UUNPK: widen each element of the low half of VN into a
 * double-width element of VD (sign- or zero-extension follows from the
 * TYPED/TYPES pair).  Output is produced faster than input is consumed,
 * so a scratch copy is made when VN lies within the region VD is about
 * to overwrite.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)

#undef DO_UNPK
3075
3076
3077
3078
3079
/*
 * Mask of the low half of each 2**(i+1)-bit field, for i = 0..4:
 * alternating bits, pairs, nibbles, bytes, halfwords.  Shared by the
 * predicate bit expand/compress/reverse helpers below.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
3087
3088
3089
3090
3091
3092
/*
 * Expand units of 2**n bits in the low 32 bits of X into the low half
 * of successive 2**(n+1)-bit fields, leaving the high halves zero.
 * Each pass (from i = 4 down to n) halves the density by OR-ing a
 * shifted copy and masking to the even-position fields.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
3104
3105
3106
3107
3108
3109
/*
 * Inverse of expand_bits: gather the low half of each 2**(n+1)-bit
 * field of X down into a compact run of 2**n-bit units, iterating from
 * the finest granularity up; the result fits in the low 32 bits.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
3121
/*
 * ZIP for predicates: interleave the element flags of VN and VM at
 * 2**esz-bit granularity, taking the low (DATA == 0) or high half of
 * each input.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Whole predicate fits in one word: select the half (4 bits
           per byte of predicate), expand each input, and interleave. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            /* Start reading at the upper half (byte offset). */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* 32 bits of input produce 64 bits of output per step. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd predicate sizes: 8 bits in, 16 bits out per step. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
3185
/*
 * UZP for predicates: concatenate the even (DATA == 0) or odd element
 * flags of VN with those of VM, at 2**esz-bit granularity.  Each pair
 * of input words compresses into one output word.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Each input supplies half of the single output word. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* VM is consumed after VD has been half-written; copy if they
           overlap in that direction. */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* Compress VN into the low half of VD. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For predicate sizes that are not a multiple of 16 bytes, the
         * results from M do not align to uint64_t boundaries in D.
         * Build the aligned results in TMP_M and copy into place after.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            /* Final partial word from VN. */
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            /* Compress VM into TMP_M, including its partial word. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned case: VM's results land directly in VD's top half. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3254
/*
 * TRN for predicates: place the even (DATA == 0) or odd element flags
 * of VN into even positions and of VM into odd positions, at 2**esz-bit
 * granularity.
 */
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        /* Select odd elements: shift VN's down instead of VM's up. */
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        /* nn and mm occupy disjoint bit positions, so + acts as OR. */
        d[i] = nn + mm;
    }
}
3280
3281
3282static uint64_t reverse_bits_64(uint64_t x, int n)
3283{
3284 int i, sh;
3285
3286 x = bswap64(x);
3287 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3288 uint64_t mask = even_bit_esz_masks[i];
3289 x = ((x & mask) << sh) | ((x >> sh) & mask);
3290 }
3291 return x;
3292}
3293
/* Reverse a byte in units of 2**n bits by exchanging progressively
   smaller halves: nibbles, then pairs, then single bits. */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int step, shift;

    for (step = 2, shift = 4; step >= n; step--, shift >>= 1) {
        uint8_t m = mask[step];
        x = ((x & m) << shift) | ((x >> shift) & m);
    }
    return x;
}
3304
/*
 * REV for predicates: reverse the order of the 2**esz-bit element
 * flags.  Three strategies: one shifted 64-bit reversal for small
 * predicates, paired word reversals when the size is a multiple of 16,
 * otherwise a byte-at-a-time exchange.
 */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Shift the significant bits to the top so the 64-bit reversal
           leaves the result at the bottom. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
3334
/*
 * PUNPK: widen each predicate flag of the selected half of VN (low when
 * DATA == 0, high otherwise) into two bits, i.e. expand at single-bit
 * granularity.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            /* Start reading at the upper half (byte offset). */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* 32 bits of input produce 64 bits of output per step. */
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd predicate sizes: 8 bits in, 16 bits out per step. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
3380
3381#define DO_ZIP(NAME, TYPE, H) \
3382void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3383{ \
3384 intptr_t oprsz = simd_oprsz(desc); \
3385 intptr_t i, oprsz_2 = oprsz / 2; \
3386 ARMVectorReg tmp_n, tmp_m; \
3387
3388 \
3389 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3390 vn = memcpy(&tmp_n, vn, oprsz_2); \
3391 } \
3392 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3393 vm = memcpy(&tmp_m, vm, oprsz_2); \
3394 } \
3395 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3396 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3397 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3398 } \
3399 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3400 memset(vd + oprsz - 16, 0, 16); \
3401 } \
3402}
3403
3404DO_ZIP(sve_zip_b, uint8_t, H1)
3405DO_ZIP(sve_zip_h, uint16_t, H1_2)
3406DO_ZIP(sve_zip_s, uint32_t, H1_4)
3407DO_ZIP(sve_zip_d, uint64_t, H1_8)
3408DO_ZIP(sve2_zip_q, Int128, )
3409
/*
 * UZP: concatenate the even (odd_ofs == 0) or odd elements of VN with
 * those of VM.  The read pointer p never trails the write pointer i, so
 * VN cannot be clobbered before it is read; only VM may need a scratch
 * copy when it overlaps VD.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i, p;                                                     \
    ARMVectorReg tmp_m;                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
        vm = memcpy(&tmp_m, vm, oprsz);                                \
    }                                                                  \
    i = 0, p = odd_ofs;                                                \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    p -= oprsz;                                                        \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    tcg_debug_assert(i == oprsz);                                      \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3438
/*
 * TRN: for each pair of elements, take the even (odd_ofs == 0) or odd
 * member from VN and VM respectively.  Both reads of a pair complete
 * before its writes, so operand overlap needs no scratch copy.  The
 * Int128 variant zeroes a trailing odd 16-byte granule.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
    }                                                                  \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
        memset(vd + oprsz - 16, 0, 16);                                \
    }                                                                  \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, H1_8)
DO_TRN(sve2_trn_q, Int128, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
3465
/*
 * COMPACT (32-bit): move the active elements of VN to the low end of VD
 * and zero the remainder.  Each predicate byte covers two 4-byte
 * elements: bit 0 governs the even element, bit 4 the odd one.
 */
void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    /* Zero-fill everything past the last compacted element. */
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}
3482
3483void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3484{
3485 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3486 uint64_t *d = vd, *n = vn;
3487 uint8_t *pg = vg;
3488
3489 for (i = j = 0; i < opr_sz; i++) {
3490 if (pg[H1(i)] & 1) {
3491 d[j] = n[i];
3492 j++;
3493 }
3494 }
3495 for (; j < opr_sz; j++) {
3496 d[j] = 0;
3497 }
3498}
3499
3500
3501
3502
3503
/*
 * Thin wrapper: unpack the predicate descriptor (word count and element
 * size) and delegate to last_active_element() (defined elsewhere in
 * this file).
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}
3511
/*
 * SPLICE: copy the bytes of VN between the first and last active
 * elements of VG (inclusive) to the low end of VD, then fill the
 * remainder of VD with the leading bytes of VM.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                /* Scanning downward, the first non-zero word is last. */
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word byte offsets + bit positions to byte offsets. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3548
3549void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3550 void *vg, uint32_t desc)
3551{
3552 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3553 uint64_t *d = vd, *n = vn, *m = vm;
3554 uint8_t *pg = vg;
3555
3556 for (i = 0; i < opr_sz; i += 1) {
3557 uint64_t nn = n[i], mm = m[i];
3558 uint64_t pp = expand_pred_b(pg[H1(i)]);
3559 d[i] = (nn & pp) | (mm & ~pp);
3560 }
3561}
3562
3563void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3564 void *vg, uint32_t desc)
3565{
3566 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3567 uint64_t *d = vd, *n = vn, *m = vm;
3568 uint8_t *pg = vg;
3569
3570 for (i = 0; i < opr_sz; i += 1) {
3571 uint64_t nn = n[i], mm = m[i];
3572 uint64_t pp = expand_pred_h(pg[H1(i)]);
3573 d[i] = (nn & pp) | (mm & ~pp);
3574 }
3575}
3576
3577void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3578 void *vg, uint32_t desc)
3579{
3580 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3581 uint64_t *d = vd, *n = vn, *m = vm;
3582 uint8_t *pg = vg;
3583
3584 for (i = 0; i < opr_sz; i += 1) {
3585 uint64_t nn = n[i], mm = m[i];
3586 uint64_t pp = expand_pred_s(pg[H1(i)]);
3587 d[i] = (nn & pp) | (mm & ~pp);
3588 }
3589}
3590
3591void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3592 void *vg, uint32_t desc)
3593{
3594 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3595 uint64_t *d = vd, *n = vn, *m = vm;
3596 uint8_t *pg = vg;
3597
3598 for (i = 0; i < opr_sz; i += 1) {
3599 uint64_t nn = n[i], mm = m[i];
3600 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3601 }
3602}
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
/*
 * Two-operand comparisons controlled by a predicate, producing a
 * predicate.  Works backward, one 64-bit predicate word (= 64 vector
 * bytes) at a time: each element contributes its comparison result at
 * the lowest bit of its position, the word is masked by the governing
 * predicate and MASK (which selects the bit that begins each element),
 * and NZCV flags accumulate via iter_predtest_bwd.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

/* Per-size wrappers: MASK marks the first predicate bit of each element. */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3691
3692
/*
 * As DO_CMP_PPZZ, but the second operand is "wide": each 64-bit element
 * of VM is compared against every narrow element of VN within the same
 * 8-byte group (the inner loop reloads mm once per group).
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3768
3769
/*
 * As DO_CMP_PPZZ, but comparing against an immediate carried in
 * simd_data(desc) (truncated/sign-adjusted by the TYPE conversion).
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3856
3857
/* Return the value of the VD bit corresponding to the last governing
   bit of VG, scanning from the highest predicate word downward. */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off;

    for (off = QEMU_ALIGN_UP(oprsz, 8) - 8; off >= 0; off -= 8) {
        uint64_t g = *(uint64_t *)(vg + off);
        if (g != 0) {
            uint64_t last = pow2floor(g);
            return (last & *(uint64_t *)(vd + off)) != 0;
        }
    }
    return false;
}
3870
3871
3872
3873
3874
/*
 * Compute one 64-bit word of a BRKA/BRKB result.
 * N is the input predicate word, G the governing predicate word, BRK
 * the break state carried in from lower words; AFTER selects break-
 * after (the triggering element stays true) vs break-before semantics.
 * Stores the result mask in *RETB and returns the new break state.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t active = g & n;
    uint64_t first;

    if (brk) {
        /* Already broken: nothing in this word survives. */
        *retb = 0;
        return true;
    }
    if (active == 0) {
        /* No governed true bit: all governed positions pass through. */
        *retb = g;
        return false;
    }
    /* Isolate the lowest governed true bit and build the mask of
       positions at (after) or strictly below (before) it. */
    first = active & -active;
    *retb = after ? (first | (first - 1)) : (first - 1);
    return true;
}
3900
3901
/* BRKA/BRKB with zeroing predication: ungoverned bits become zero. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, words = DIV_ROUND_UP(oprsz, 8);
    bool broken = false;

    for (w = 0; w < words; ++w) {
        uint64_t b;

        broken = compute_brk(&b, n[w], g[w], broken, after);
        d[w] = b & g[w];
    }
}
3915
3916
3917static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3918 intptr_t oprsz, bool after)
3919{
3920 uint32_t flags = PREDTEST_INIT;
3921 bool brk = false;
3922 intptr_t i;
3923
3924 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3925 uint64_t this_b, this_d, this_g = g[i];
3926
3927 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3928 d[i] = this_d = this_b & this_g;
3929 flags = iter_predtest_fwd(this_d, this_g, flags);
3930 }
3931 return flags;
3932}
3933
3934
/* BRKA/BRKB with merging predication: ungoverned bits keep their old
   destination values. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, words = DIV_ROUND_UP(oprsz, 8);
    bool broken = false;

    for (w = 0; w < words; ++w) {
        uint64_t b;

        broken = compute_brk(&b, n[w], g[w], broken, after);
        d[w] = (b & g[w]) | (d[w] & ~g[w]);
    }
}
3948
3949
3950static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3951 intptr_t oprsz, bool after)
3952{
3953 uint32_t flags = PREDTEST_INIT;
3954 bool brk = false;
3955 intptr_t i;
3956
3957 for (i = 0; i < oprsz / 8; ++i) {
3958 uint64_t this_b, this_d = d[i], this_g = g[i];
3959
3960 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3961 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3962 flags = iter_predtest_fwd(this_d, this_g, flags);
3963 }
3964 return flags;
3965}
3966
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /*
     * Zero the whole predicate register rather than looping on OPRSZ —
     * a fixed-size memset is cheaper, and OPRSZ is kept only for
     * signature uniformity with the callers.  Return the NZCV flags
     * for an all-false predicate.
     */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
3975
3976void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3977 uint32_t pred_desc)
3978{
3979 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3980 if (last_active_pred(vn, vg, oprsz)) {
3981 compute_brk_z(vd, vm, vg, oprsz, true);
3982 } else {
3983 do_zero(vd, oprsz);
3984 }
3985}
3986
3987uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3988 uint32_t pred_desc)
3989{
3990 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3991 if (last_active_pred(vn, vg, oprsz)) {
3992 return compute_brks_z(vd, vm, vg, oprsz, true);
3993 } else {
3994 return do_zero(vd, oprsz);
3995 }
3996}
3997
3998void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3999 uint32_t pred_desc)
4000{
4001 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4002 if (last_active_pred(vn, vg, oprsz)) {
4003 compute_brk_z(vd, vm, vg, oprsz, false);
4004 } else {
4005 do_zero(vd, oprsz);
4006 }
4007}
4008
4009uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4010 uint32_t pred_desc)
4011{
4012 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4013 if (last_active_pred(vn, vg, oprsz)) {
4014 return compute_brks_z(vd, vm, vg, oprsz, false);
4015 } else {
4016 return do_zero(vd, oprsz);
4017 }
4018}
4019
4020void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4021{
4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4023 compute_brk_z(vd, vn, vg, oprsz, true);
4024}
4025
4026uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4027{
4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4029 return compute_brks_z(vd, vn, vg, oprsz, true);
4030}
4031
4032void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4033{
4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4035 compute_brk_z(vd, vn, vg, oprsz, false);
4036}
4037
4038uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4039{
4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041 return compute_brks_z(vd, vn, vg, oprsz, false);
4042}
4043
4044void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4045{
4046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4047 compute_brk_m(vd, vn, vg, oprsz, true);
4048}
4049
4050uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4051{
4052 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4053 return compute_brks_m(vd, vn, vg, oprsz, true);
4054}
4055
4056void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4057{
4058 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4059 compute_brk_m(vd, vn, vg, oprsz, false);
4060}
4061
4062uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4063{
4064 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4065 return compute_brks_m(vd, vn, vg, oprsz, false);
4066}
4067
4068void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4069{
4070 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4071 if (!last_active_pred(vn, vg, oprsz)) {
4072 do_zero(vd, oprsz);
4073 }
4074}
4075
4076
4077static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4078 uint64_t esz_mask)
4079{
4080 uint32_t flags = PREDTEST_INIT;
4081 intptr_t i;
4082
4083 for (i = 0; i < oprsz / 8; i++) {
4084 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4085 }
4086 if (oprsz & 7) {
4087 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4088 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4089 }
4090 return flags;
4091}
4092
4093uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4094{
4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4096 if (last_active_pred(vn, vg, oprsz)) {
4097 return predtest_ones(vd, oprsz, -1);
4098 } else {
4099 return do_zero(vd, oprsz);
4100 }
4101}
4102
4103uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4104{
4105 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4106 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4107 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4108 intptr_t i;
4109
4110 for (i = 0; i < words; ++i) {
4111 uint64_t t = n[i] & g[i] & mask;
4112 sum += ctpop64(t);
4113 }
4114 return sum;
4115}
4116
/*
 * WHILE (lower): set the first COUNT predicate bits of VD, subject to
 * the element-size mask, and return the NZCV flags from a PTEST of
 * the result.
 */
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits: full words, then a partial word. */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4142
/*
 * WHILE (greater): set the last COUNT predicate bits of VD, subject
 * to the element-size mask, and return the NZCV flags from a PTEST
 * of the result.
 */
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    /* Mask valid for the highest (possibly partial) predicate word. */
    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    /*
     * INVCOUNT is the number of low bits that remain clear.
     * Fill whole words from the top of the register downward.
     */
    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    /* Boundary word: keep only the bits at and above INVCOUNT%64. */
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    /* Clear all remaining lower words. */
    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4178
4179
4180
4181
4182
4183
4184
4185
4186
/*
 * Predicated FP horizontal reduction.
 * Inactive elements are replaced by IDENT in a temporary buffer that is
 * padded with IDENT up to MAXSZ (the full power-of-two vector size, from
 * simd_data), then reduced pairwise so the operation order matches a
 * balanced tree regardless of predication.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4216
/* FADDV: sum; zero is the identity for add. */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* FMINNMV/FMAXNMV: the identity is a NaN encoding, which *num ignores. */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* FMINV: +Inf is the identity for min; FMAXV: -Inf for max. */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4239
4240uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4241 void *status, uint32_t desc)
4242{
4243 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4244 float16 result = nn;
4245
4246 do {
4247 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4248 do {
4249 if (pg & 1) {
4250 float16 mm = *(float16 *)(vm + H1_2(i));
4251 result = float16_add(result, mm, status);
4252 }
4253 i += sizeof(float16), pg >>= sizeof(float16);
4254 } while (i & 15);
4255 } while (i < opr_sz);
4256
4257 return result;
4258}
4259
4260uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4261 void *status, uint32_t desc)
4262{
4263 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4264 float32 result = nn;
4265
4266 do {
4267 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4268 do {
4269 if (pg & 1) {
4270 float32 mm = *(float32 *)(vm + H1_2(i));
4271 result = float32_add(result, mm, status);
4272 }
4273 i += sizeof(float32), pg >>= sizeof(float32);
4274 } while (i & 15);
4275 } while (i < opr_sz);
4276
4277 return result;
4278}
4279
4280uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4281 void *status, uint32_t desc)
4282{
4283 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4284 uint64_t *m = vm;
4285 uint8_t *pg = vg;
4286
4287 for (i = 0; i < opr_sz; i++) {
4288 if (pg[H1(i)] & 1) {
4289 nn = float64_add(nn, m[i], status);
4290 }
4291 }
4292
4293 return nn;
4294}
4295
4296
4297
4298
/*
 * Predicated FP binary operation, vector x vector.
 * Walks the vector from the top down, consuming one 64-bit predicate
 * word per outer iteration; inactive elements are left unchanged.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4317
/* Predicated FP add/sub/mul/div/min/max/minnum/maxnum, all element sizes. */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4349
4350static inline float16 abd_h(float16 a, float16 b, float_status *s)
4351{
4352 return float16_abs(float16_sub(a, b, s));
4353}
4354
4355static inline float32 abd_s(float32 a, float32 b, float_status *s)
4356{
4357 return float32_abs(float32_sub(a, b, s));
4358}
4359
4360static inline float64 abd_d(float64 a, float64 b, float_status *s)
4361{
4362 return float64_abs(float64_sub(a, b, s));
4363}
4364
/* FABD: predicated absolute difference. */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4368
4369static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4370{
4371 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4372 return float64_scalbn(a, b_int, s);
4373}
4374
/* FSCALE: scale by a signed integer; the double variant clamps int64. */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* FMULX, via the shared AdvSIMD/VFP helpers. */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4384
4385
4386
4387
/*
 * Predicated FP binary operation, vector x scalar (immediate).
 * Same top-down, predicate-word-at-a-time walk as DO_ZPZZ_FP;
 * inactive elements are left unchanged.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    TYPE mm = scalar;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4406
/* Predicated FP add/sub/mul with an immediate operand. */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4418
4419static inline float16 subr_h(float16 a, float16 b, float_status *s)
4420{
4421 return float16_sub(b, a, s);
4422}
4423
4424static inline float32 subr_s(float32 a, float32 b, float_status *s)
4425{
4426 return float32_sub(b, a, s);
4427}
4428
4429static inline float64 subr_d(float64 a, float64 b, float_status *s)
4430{
4431 return float64_sub(b, a, s);
4432}
4433
/* FSUBR and min/max variants with an immediate operand. */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4453
4454
4455
4456
/*
 * Predicated FP unary operation.
 * Same top-down, predicate-word-at-a-time walk as DO_ZPZZ_FP;
 * inactive elements are left unchanged.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4473
4474
4475
4476
4477
4478static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4479{
4480 bool save = get_flush_inputs_to_zero(fpst);
4481 float32 ret;
4482
4483 set_flush_inputs_to_zero(false, fpst);
4484 ret = float16_to_float32(f, true, fpst);
4485 set_flush_inputs_to_zero(save, fpst);
4486 return ret;
4487}
4488
4489static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4490{
4491 bool save = get_flush_inputs_to_zero(fpst);
4492 float64 ret;
4493
4494 set_flush_inputs_to_zero(false, fpst);
4495 ret = float16_to_float64(f, true, fpst);
4496 set_flush_inputs_to_zero(save, fpst);
4497 return ret;
4498}
4499
4500static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4501{
4502 bool save = get_flush_to_zero(fpst);
4503 float16 ret;
4504
4505 set_flush_to_zero(false, fpst);
4506 ret = float32_to_float16(f, true, fpst);
4507 set_flush_to_zero(save, fpst);
4508 return ret;
4509}
4510
4511static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4512{
4513 bool save = get_flush_to_zero(fpst);
4514 float16 ret;
4515
4516 set_flush_to_zero(false, fpst);
4517 ret = float64_to_float16(f, true, fpst);
4518 set_flush_to_zero(save, fpst);
4519 return ret;
4520}
4521
4522static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4523{
4524 if (float16_is_any_nan(f)) {
4525 float_raise(float_flag_invalid, s);
4526 return 0;
4527 }
4528 return float16_to_int16_round_to_zero(f, s);
4529}
4530
4531static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4532{
4533 if (float16_is_any_nan(f)) {
4534 float_raise(float_flag_invalid, s);
4535 return 0;
4536 }
4537 return float16_to_int64_round_to_zero(f, s);
4538}
4539
4540static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4541{
4542 if (float32_is_any_nan(f)) {
4543 float_raise(float_flag_invalid, s);
4544 return 0;
4545 }
4546 return float32_to_int64_round_to_zero(f, s);
4547}
4548
4549static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4550{
4551 if (float64_is_any_nan(f)) {
4552 float_raise(float_flag_invalid, s);
4553 return 0;
4554 }
4555 return float64_to_int64_round_to_zero(f, s);
4556}
4557
4558static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4559{
4560 if (float16_is_any_nan(f)) {
4561 float_raise(float_flag_invalid, s);
4562 return 0;
4563 }
4564 return float16_to_uint16_round_to_zero(f, s);
4565}
4566
4567static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4568{
4569 if (float16_is_any_nan(f)) {
4570 float_raise(float_flag_invalid, s);
4571 return 0;
4572 }
4573 return float16_to_uint64_round_to_zero(f, s);
4574}
4575
4576static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4577{
4578 if (float32_is_any_nan(f)) {
4579 float_raise(float_flag_invalid, s);
4580 return 0;
4581 }
4582 return float32_to_uint64_round_to_zero(f, s);
4583}
4584
4585static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4586{
4587 if (float64_is_any_nan(f)) {
4588 float_raise(float_flag_invalid, s);
4589 return 0;
4590 }
4591 return float64_to_uint64_round_to_zero(f, s);
4592}
4593
/* FP-to-FP conversions (including bfloat16); see flush helpers above. */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FP to signed integer, round toward zero; NaN -> 0 with Invalid. */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FP to unsigned integer, round toward zero; NaN -> 0 with Invalid. */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* Round to integral, in the FP format. */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX and FSQRT. */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* Signed and unsigned integer to FP conversions. */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4649
/*
 * FLOGB for float16: the exponent of |a| as a signed integer.
 * Infinity -> INT16_MAX; zero and NaN -> INT16_MIN with Invalid raised.
 */
static int16_t do_float16_logb_as_int(float16 a, float_status *s)
{
    /* Move the 10-bit fraction to the top of the 32-bit word. */
    uint32_t frac = (uint32_t)a << (16 + 6);
    int16_t exp = extract32(a, 10, 5);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: compute the exponent from the fraction */
                return -15 - clz32(frac);
            }
            /* flush the denormal input to zero and fall through */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x1f)) {
        if (frac == 0) {
            /* infinity */
            return INT16_MAX;
        }
    } else {
        /* normal: remove the exponent bias */
        return exp - 15;
    }
    /* zero or nan */
    float_raise(float_flag_invalid, s);
    return INT16_MIN;
}
4677
/*
 * FLOGB for float32: the exponent of |a| as a signed integer.
 * Infinity -> INT32_MAX; zero and NaN -> INT32_MIN with Invalid raised.
 */
static int32_t do_float32_logb_as_int(float32 a, float_status *s)
{
    /* Move the 23-bit fraction to the top of the word. */
    uint32_t frac = a << 9;
    int32_t exp = extract32(a, 23, 8);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: compute the exponent from the fraction */
                return -127 - clz32(frac);
            }
            /* flush the denormal input to zero and fall through */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0xff)) {
        if (frac == 0) {
            /* infinity */
            return INT32_MAX;
        }
    } else {
        /* normal: remove the exponent bias */
        return exp - 127;
    }
    /* zero or nan */
    float_raise(float_flag_invalid, s);
    return INT32_MIN;
}
4705
/*
 * FLOGB for float64: the exponent of |a| as a signed integer.
 * Infinity -> INT64_MAX; zero and NaN -> INT64_MIN with Invalid raised.
 */
static int64_t do_float64_logb_as_int(float64 a, float_status *s)
{
    /* Move the 52-bit fraction to the top of the word. */
    uint64_t frac = a << 12;
    int64_t exp = extract64(a, 52, 11);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: compute the exponent from the fraction */
                return -1023 - clz64(frac);
            }
            /* flush the denormal input to zero and fall through */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x7ff)) {
        if (frac == 0) {
            /* infinity */
            return INT64_MAX;
        }
    } else {
        /* normal: remove the exponent bias */
        return exp - 1023;
    }
    /* zero or nan */
    float_raise(float_flag_invalid, s);
    return INT64_MIN;
}
4733
/* FLOGB: integer result stored back in the element's width. */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

#undef DO_ZPZ_FP
4739
/*
 * Predicated FP fused multiply-add, half precision:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * NEG1/NEG3 are either 0 or the sign bit, so one routine serves
 * fmla/fmls/fnmla/fnmls.  Inactive elements are unchanged.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        /* One 64-bit predicate word per outer iteration. */
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4763
/* FMLA: d = n * m + a. */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate n, so d = a - n * m. */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: negate n and a, so d = -(n * m + a). */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: negate a, so d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
4787
/*
 * Predicated FP fused multiply-add, single precision:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * NEG1/NEG3 are either 0 or the sign bit.  Inactive elements unchanged.
 */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        /* One 64-bit predicate word per outer iteration. */
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4811
/* FMLA: d = n * m + a. */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate n, so d = a - n * m. */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: negate n and a, so d = -(n * m + a). */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: negate a, so d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
4835
/*
 * Predicated FP fused multiply-add, double precision:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * NEG1/NEG3 are either 0 or the sign bit.  Inactive elements unchanged.
 */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        /* One 64-bit predicate word per outer iteration. */
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4859
/* FMLA: d = n * m + a.  (INT64_MIN is the float64 sign bit.) */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate n, so d = a - n * m. */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: negate n and a, so d = -(n * m + a). */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: negate a, so d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
4883
4884
4885
4886
4887
4888
/*
 * Predicated FP compare, two vectors -> predicate.
 * Walks from the high elements down, assembling one 64-bit predicate
 * word of results at a time; inactive elements produce 0.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4908
/* Per-element-size and all-sizes expansion helpers. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * GE/GT/LE/LT and the absolute-compare FACGE/FACGT use the signaling
 * compare; EQ/NE/UO use the quiet compare.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4947
4948
4949
4950
/*
 * Predicated FP compare against zero -> predicate.
 * Same top-down walk as DO_FPCMP_PPZZ, with the second operand fixed
 * at (TYPE)0; inactive elements produce 0.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg,                         \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if ((pg >> (i & 63)) & 1) {                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                out |= OP(TYPE, nn, 0, status);                         \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4969
/* Per-element-size and all-sizes expansion helpers. */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* Compare-against-zero forms, reusing the DO_FCM* operators above. */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4988
4989
4990
4991void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4992{
4993 static const float16 coeff[16] = {
4994 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4995 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4996 };
4997 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4998 intptr_t x = simd_data(desc);
4999 float16 *d = vd, *n = vn, *m = vm;
5000 for (i = 0; i < opr_sz; i++) {
5001 float16 mm = m[i];
5002 intptr_t xx = x;
5003 if (float16_is_neg(mm)) {
5004 mm = float16_abs(mm);
5005 xx += 8;
5006 }
5007 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5008 }
5009}
5010
5011void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5012{
5013 static const float32 coeff[16] = {
5014 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5015 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5016 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5017 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5018 };
5019 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5020 intptr_t x = simd_data(desc);
5021 float32 *d = vd, *n = vn, *m = vm;
5022 for (i = 0; i < opr_sz; i++) {
5023 float32 mm = m[i];
5024 intptr_t xx = x;
5025 if (float32_is_neg(mm)) {
5026 mm = float32_abs(mm);
5027 xx += 8;
5028 }
5029 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5030 }
5031}
5032
5033void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5034{
5035 static const float64 coeff[16] = {
5036 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5037 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5038 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5039 0x3de5d8408868552full, 0x0000000000000000ull,
5040 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5041 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5042 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5043 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5044 };
5045 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5046 intptr_t x = simd_data(desc);
5047 float64 *d = vd, *n = vn, *m = vm;
5048 for (i = 0; i < opr_sz; i++) {
5049 float64 mm = m[i];
5050 intptr_t xx = x;
5051 if (float64_is_neg(mm)) {
5052 mm = float64_abs(mm);
5053 xx += 8;
5054 }
5055 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5056 }
5057}
5058
5059
5060
5061
5062
/*
 * FCADD, half precision: complex add with rotate.  Even elements hold
 * the real parts, odd elements the imaginary parts; simd_data(desc) is
 * one bit selecting the rotation, which determines whether the real or
 * the imaginary addend is negated.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    /* Exactly one of neg_imag / neg_real carries the sign bit. */
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index, J the imaginary index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            /* Each half of the complex pair is predicated separately. */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5094
5095void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5096 void *vs, uint32_t desc)
5097{
5098 intptr_t j, i = simd_oprsz(desc);
5099 uint64_t *g = vg;
5100 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5101 float32 neg_real = float32_chs(neg_imag);
5102
5103 do {
5104 uint64_t pg = g[(i - 1) >> 6];
5105 do {
5106 float32 e0, e1, e2, e3;
5107
5108
5109 j = i - sizeof(float32);
5110 i -= 2 * sizeof(float32);
5111
5112 e0 = *(float32 *)(vn + H1_2(i));
5113 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5114 e2 = *(float32 *)(vn + H1_2(j));
5115 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5116
5117 if (likely((pg >> (i & 63)) & 1)) {
5118 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5119 }
5120 if (likely((pg >> (j & 63)) & 1)) {
5121 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5122 }
5123 } while (i & 63);
5124 } while (i != 0);
5125}
5126
5127void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5128 void *vs, uint32_t desc)
5129{
5130 intptr_t j, i = simd_oprsz(desc);
5131 uint64_t *g = vg;
5132 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5133 float64 neg_real = float64_chs(neg_imag);
5134
5135 do {
5136 uint64_t pg = g[(i - 1) >> 6];
5137 do {
5138 float64 e0, e1, e2, e3;
5139
5140
5141 j = i - sizeof(float64);
5142 i -= 2 * sizeof(float64);
5143
5144 e0 = *(float64 *)(vn + H1_2(i));
5145 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5146 e2 = *(float64 *)(vn + H1_2(j));
5147 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5148
5149 if (likely((pg >> (i & 63)) & 1)) {
5150 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5151 }
5152 if (likely((pg >> (j & 63)) & 1)) {
5153 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5154 }
5155 } while (i & 63);
5156 } while (i != 0);
5157}
5158
5159
5160
5161
5162
5163void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5164 void *vg, void *status, uint32_t desc)
5165{
5166 intptr_t j, i = simd_oprsz(desc);
5167 unsigned rot = simd_data(desc);
5168 bool flip = rot & 1;
5169 float16 neg_imag, neg_real;
5170 uint64_t *g = vg;
5171
5172 neg_imag = float16_set_sign(0, (rot & 2) != 0);
5173 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5174
5175 do {
5176 uint64_t pg = g[(i - 1) >> 6];
5177 do {
5178 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5179
5180
5181 j = i - sizeof(float16);
5182 i -= 2 * sizeof(float16);
5183
5184 nr = *(float16 *)(vn + H1_2(i));
5185 ni = *(float16 *)(vn + H1_2(j));
5186 mr = *(float16 *)(vm + H1_2(i));
5187 mi = *(float16 *)(vm + H1_2(j));
5188
5189 e2 = (flip ? ni : nr);
5190 e1 = (flip ? mi : mr) ^ neg_real;
5191 e4 = e2;
5192 e3 = (flip ? mr : mi) ^ neg_imag;
5193
5194 if (likely((pg >> (i & 63)) & 1)) {
5195 d = *(float16 *)(va + H1_2(i));
5196 d = float16_muladd(e2, e1, d, 0, status);
5197 *(float16 *)(vd + H1_2(i)) = d;
5198 }
5199 if (likely((pg >> (j & 63)) & 1)) {
5200 d = *(float16 *)(va + H1_2(j));
5201 d = float16_muladd(e4, e3, d, 0, status);
5202 *(float16 *)(vd + H1_2(j)) = d;
5203 }
5204 } while (i & 63);
5205 } while (i != 0);
5206}
5207
5208void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5209 void *vg, void *status, uint32_t desc)
5210{
5211 intptr_t j, i = simd_oprsz(desc);
5212 unsigned rot = simd_data(desc);
5213 bool flip = rot & 1;
5214 float32 neg_imag, neg_real;
5215 uint64_t *g = vg;
5216
5217 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5218 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5219
5220 do {
5221 uint64_t pg = g[(i - 1) >> 6];
5222 do {
5223 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5224
5225
5226 j = i - sizeof(float32);
5227 i -= 2 * sizeof(float32);
5228
5229 nr = *(float32 *)(vn + H1_2(i));
5230 ni = *(float32 *)(vn + H1_2(j));
5231 mr = *(float32 *)(vm + H1_2(i));
5232 mi = *(float32 *)(vm + H1_2(j));
5233
5234 e2 = (flip ? ni : nr);
5235 e1 = (flip ? mi : mr) ^ neg_real;
5236 e4 = e2;
5237 e3 = (flip ? mr : mi) ^ neg_imag;
5238
5239 if (likely((pg >> (i & 63)) & 1)) {
5240 d = *(float32 *)(va + H1_2(i));
5241 d = float32_muladd(e2, e1, d, 0, status);
5242 *(float32 *)(vd + H1_2(i)) = d;
5243 }
5244 if (likely((pg >> (j & 63)) & 1)) {
5245 d = *(float32 *)(va + H1_2(j));
5246 d = float32_muladd(e4, e3, d, 0, status);
5247 *(float32 *)(vd + H1_2(j)) = d;
5248 }
5249 } while (i & 63);
5250 } while (i != 0);
5251}
5252
5253void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5254 void *vg, void *status, uint32_t desc)
5255{
5256 intptr_t j, i = simd_oprsz(desc);
5257 unsigned rot = simd_data(desc);
5258 bool flip = rot & 1;
5259 float64 neg_imag, neg_real;
5260 uint64_t *g = vg;
5261
5262 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5263 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5264
5265 do {
5266 uint64_t pg = g[(i - 1) >> 6];
5267 do {
5268 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5269
5270
5271 j = i - sizeof(float64);
5272 i -= 2 * sizeof(float64);
5273
5274 nr = *(float64 *)(vn + H1_2(i));
5275 ni = *(float64 *)(vn + H1_2(j));
5276 mr = *(float64 *)(vm + H1_2(i));
5277 mi = *(float64 *)(vm + H1_2(j));
5278
5279 e2 = (flip ? ni : nr);
5280 e1 = (flip ? mi : mr) ^ neg_real;
5281 e4 = e2;
5282 e3 = (flip ? mr : mi) ^ neg_imag;
5283
5284 if (likely((pg >> (i & 63)) & 1)) {
5285 d = *(float64 *)(va + H1_2(i));
5286 d = float64_muladd(e2, e1, d, 0, status);
5287 *(float64 *)(vd + H1_2(i)) = d;
5288 }
5289 if (likely((pg >> (j & 63)) & 1)) {
5290 d = *(float64 *)(va + H1_2(j));
5291 d = float64_muladd(e4, e3, d, 0, status);
5292 *(float64 *)(vd + H1_2(j)) = d;
5293 }
5294 } while (i & 63);
5295 } while (i != 0);
5296}
5297
5298
5299
5300
5301
5302
5303
5304
5305
/*
 * Move a single element between the vector register (@vd + @reg_off)
 * and host memory, where @host already points at this element's bytes.
 * Used for both loads and stores (see DO_LD_HOST / DO_ST_HOST below).
 */
typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);

/*
 * Move a single element between the vector register (@vd + @reg_off)
 * and the guest virtual address @vaddr, via the full cpu_ld*/st*_data_ra
 * slow path.  @retaddr is the host return address for exception unwinding
 * if the access faults.
 */
typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                              target_ulong vaddr, uintptr_t retaddr);
5314
5315
5316
5317
5318
/*
 * Generate a host-memory load accessor: read one TYPEM value via the
 * host primitive HOST and widen it into the TYPEE element at
 * vd + H(reg_off), where H is the host-endian offset adjustment.
 */
#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
{                                                                     \
    TYPEM val = HOST(host);                                           \
    *(TYPEE *)(vd + H(reg_off)) = val;                                \
}

/* As above, but storing a (possibly narrowed) element to host memory. */
#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }

/*
 * Generate a slow-path load accessor: go through the cpu_ld*_data_ra
 * TLB path, which may raise a guest exception unwinding via @ra.
 */
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)              \
{                                                                          \
    *(TYPEE *)(vd + H(reg_off)) =                                          \
        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                     \
}

/* As above, for stores. */
#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)              \
{                                                                          \
    TLB(env, useronly_clean_ptr(addr),                                     \
        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                           \
}

/* One-byte memory accesses have no endian variants. */
#define DO_LD_PRIM_1(NAME, H, TE, TM) \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)

/* Load byte into byte/half/word/dword element, zero- or sign-extended. */
DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
DO_LD_PRIM_1(ld1bdu, H1_8, uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds, H1_8, uint64_t, int8_t)

#define DO_ST_PRIM_1(NAME, H, TE, TM) \
    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)

/* Store the low byte of a byte/half/word/dword element. */
DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
DO_ST_PRIM_1(bd, H1_8, uint64_t, uint8_t)

/* Multi-byte accesses get both big- and little-endian variants. */
#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)

#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)

/* 16-bit memory, extended to half/word/dword elements. */
DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
DO_LD_PRIM_2(hdu, H1_8, uint64_t, uint16_t, lduw)
DO_LD_PRIM_2(hds, H1_8, uint64_t, int16_t, lduw)

DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
DO_ST_PRIM_2(hd, H1_8, uint64_t, uint16_t, stw)

/* 32-bit memory, extended to word/dword elements. */
DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
DO_LD_PRIM_2(sdu, H1_8, uint64_t, uint32_t, ldl)
DO_LD_PRIM_2(sds, H1_8, uint64_t, int32_t, ldl)

DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)

/* 64-bit memory and elements. */
DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)

#undef DO_LD_TLB
#undef DO_ST_TLB
#undef DO_LD_HOST
#undef DO_LD_PRIM_1
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
5406
5407
5408
5409
5410
5411
5412static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5413 intptr_t reg_max, int esz)
5414{
5415 uint64_t pg_mask = pred_esz_masks[esz];
5416 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5417
5418
5419 if (likely(pg & 1)) {
5420 return reg_off;
5421 }
5422
5423 if (pg == 0) {
5424 reg_off &= -64;
5425 do {
5426 reg_off += 64;
5427 if (unlikely(reg_off >= reg_max)) {
5428
5429 return reg_max;
5430 }
5431 pg = vg[reg_off >> 6] & pg_mask;
5432 } while (pg == 0);
5433 }
5434 reg_off += ctz64(pg);
5435
5436
5437 tcg_debug_assert(reg_off < reg_max);
5438 return reg_off;
5439}
5440
5441
5442
5443
5444
5445
5446
/* Resolved TLB data for one guest page of a contiguous access. */
typedef struct {
    void *host;        /* host base address for the page, or NULL for MMIO */
    int flags;         /* TLB_* flags from probe_access_flags */
    MemTxAttrs attrs;  /* transaction attributes for the mapping */
} SVEHostPage;
5452
/*
 * Probe the page at @addr + @mem_off for the given access, filling in
 * @info.  Returns false only when @nofault is set and the page is
 * invalid (the caller must then skip the access); otherwise a bad page
 * raises the usual exception via probe_access_flags.
 */
static bool sve_probe_page(SVEHostPage *info, bool nofault,
                           CPUARMState *env, target_ulong addr,
                           int mem_off, MMUAccessType access_type,
                           int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * Clean the top byte of the address before the lookup.  Usually this
     * is done during translation, but that cannot cover every addressing
     * mode, so do it unconditionally here.  (No-op unless the build's
     * useronly_clean_ptr actually strips tag bits.)
     */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* probe_access_flags only returns with INVALID when nofault. */
        g_assert(nofault);
        return false;
    }

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
#else
    /*
     * Find the iotlb entry for this address to recover the transaction
     * attributes.  The entry must be present because the probe above
     * just succeeded for this mmu_idx.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
5511
5512
5513
5514
5515
5516
/* Fault-handling mode for a contiguous predicated load/store. */
typedef enum {
    FAULT_NO,     /* non-faulting: record failures in FFR, never trap */
    FAULT_FIRST,  /* first-faulting: only the first active element may trap */
    FAULT_ALL,    /* normal: every element may trap */
} SVEContFault;

typedef struct {
    /*
     * First and last element wholly contained within the two pages.
     * mem_off_first[0] and reg_off_first[0] are always set >= 0.
     * reg_off_last[0] may be < 0 if the first element crosses pages.
     * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
     * are set >= 0 only when there are complete elements on the
     * second page.
     *
     * The reg_off_* offsets are relative to the vector register; the
     * mem_off_* offsets are relative to the base memory address, and
     * the two differ when elements are extended/truncated or for
     * multi-register operations.
     */
    int16_t mem_off_first[2];
    int16_t reg_off_first[2];
    int16_t reg_off_last[2];

    /*
     * The single active element, if any, that is misaligned and spans
     * both pages; -1 when no such element exists.
     */
    int16_t mem_off_split;
    int16_t reg_off_split;

    /*
     * The memory offset at which the access crosses into the second
     * page; < 0 when the whole access fits on one page.
     */
    int16_t page_split;

    /* TLB data for the two pages. */
    SVEHostPage page[2];
} SVEContLdSt;
5556
5557
5558
5559
5560
5561
/*
 * Find first active element on each page, and a loose bound for the
 * final element on each page.  Identify any single element that spans
 * the page boundary.  Return true if there are any active elements.
 */
static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
                                   uint64_t *vg, intptr_t reg_max,
                                   int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the TLB data to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* Bytes remaining on the first page, from addr to the page end. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no full element, i.e. the first
     * active element is the one that's split, this value remains -1.
     * It is useful as iteration bounds.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page crossing element is last. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * this may affect the address reported in an exception.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
5648
5649
5650
5651
5652
5653
/*
 * Resolve the guest virtual address(es) into info->page[].
 * Control the generation of page faults with @fault.  Return false if
 * there is no work to do, which can only happen with @fault == FAULT_NO.
 */
static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                                CPUARMState *env, target_ulong addr,
                                MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the first active element is not the split one, the first
         * element already succeeded on page 0, so faults on the second
         * page are suppressed.
         *
         * NOTE(review): this assigns the enum constant FAULT_FIRST (== 1)
         * to a bool, i.e. nofault becomes unconditionally true here, even
         * for FAULT_ALL -- confirm intended vs "fault != FAULT_ALL".
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
5711
/*
 * Check all active elements against any watchpoints on the touched
 * pages, then clear TLB_WATCHPOINT from the page flags so that the
 * callers can use the fast host path afterwards.  No-op for user-only.
 */
static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                                      uint64_t *vg, target_ulong addr,
                                      int esize, int msize, int wp_access,
                                      uintptr_t retaddr)
{
#ifndef CONFIG_USER_ONLY
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        /* Walk the active elements wholly on the first page. */
        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* The split element, if any, touches both pages; check it too. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        /* Walk the active elements wholly on the second page. */
        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
#endif
}
5775
5776static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5777 uint64_t *vg, target_ulong addr, int esize,
5778 int msize, uint32_t mtedesc, uintptr_t ra)
5779{
5780 intptr_t mem_off, reg_off, reg_last;
5781
5782
5783 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5784 mem_off = info->mem_off_first[0];
5785 reg_off = info->reg_off_first[0];
5786 reg_last = info->reg_off_split;
5787 if (reg_last < 0) {
5788 reg_last = info->reg_off_last[0];
5789 }
5790
5791 do {
5792 uint64_t pg = vg[reg_off >> 6];
5793 do {
5794 if ((pg >> (reg_off & 63)) & 1) {
5795 mte_check(env, mtedesc, addr, ra);
5796 }
5797 reg_off += esize;
5798 mem_off += msize;
5799 } while (reg_off <= reg_last && (reg_off & 63));
5800 } while (reg_off <= reg_last);
5801 }
5802
5803 mem_off = info->mem_off_first[1];
5804 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5805 reg_off = info->reg_off_first[1];
5806 reg_last = info->reg_off_last[1];
5807
5808 do {
5809 uint64_t pg = vg[reg_off >> 6];
5810 do {
5811 if ((pg >> (reg_off & 63)) & 1) {
5812 mte_check(env, mtedesc, addr, ra);
5813 }
5814 reg_off += esize;
5815 mem_off += msize;
5816 } while (reg_off & 63);
5817 } while (reg_off <= reg_last);
5818 }
5819}
5820
5821
5822
5823
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stack loads.
 * Loads N registers (esz element size, msz memory size) under predicate
 * @vg from @addr.  @mtedesc == 0 disables MTE checking.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end,
         * so that a partial load does not corrupt the destination.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /*
     * The entire operation is in RAM, on valid pages.
     * Predicated-inactive elements read as zero, so clear first.
     */
    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /* Elements wholly within the first page, via the host fast path. */
    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements wholly within the second page, via the host fast path. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5966
5967static inline QEMU_ALWAYS_INLINE
5968void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5969 uint32_t desc, const uintptr_t ra,
5970 const int esz, const int msz, const int N,
5971 sve_ldst1_host_fn *host_fn,
5972 sve_ldst1_tlb_fn *tlb_fn)
5973{
5974 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5975 int bit55 = extract64(addr, 55, 1);
5976
5977
5978 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5979
5980
5981 if (!tbi_check(desc, bit55) ||
5982 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5983 mtedesc = 0;
5984 }
5985
5986 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5987}
5988
/*
 * Instantiate the single-register contiguous-load helpers, with and
 * without MTE, for a byte-sized memory access (no endian variants).
 */
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
                            target_ulong addr, uint32_t desc)  \
{                                                              \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,   \
              sve_##NAME##_host, sve_##NAME##_tlb);            \
}                                                              \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,    \
                                target_ulong addr, uint32_t desc) \
{                                                              \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,  \
                  sve_##NAME##_host, sve_##NAME##_tlb);        \
}

/*
 * As above, for multi-byte memory accesses: little- and big-endian
 * variants, each with and without MTE.
 */
#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);     \
}                                                                 \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);     \
}

/* Byte memory, into byte/half/word/dword elements. */
DO_LD1_1(ld1bb, MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

/* Halfword memory, into half/word/dword elements. */
DO_LD1_2(ld1hh, MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

/* Word memory, into word/dword elements. */
DO_LD1_2(ld1ss, MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

/* Dword memory and elements. */
DO_LD1_2(ld1dd, MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
6051
/* Instantiate the N-register (ld2/ld3/ld4) byte-load helpers. */
#define DO_LDN_1(N) \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,         \
                             target_ulong addr, uint32_t desc)   \
{                                                                \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,    \
              sve_ld1bb_host, sve_ld1bb_tlb);                    \
}                                                                \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,     \
                                 target_ulong addr, uint32_t desc) \
{                                                                \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,   \
                  sve_ld1bb_host, sve_ld1bb_tlb);                \
}

/*
 * Instantiate the N-register multi-byte load helpers, little- and
 * big-endian, with and without MTE.  Memory and element size match.
 */
#define DO_LDN_2(N, SUFF, ESZ) \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);   \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);   \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6130{
6131 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6132
6133 if (i & 63) {
6134 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6135 i = ROUND_UP(i, 64);
6136 }
6137 for (; i < oprsz; i += 64) {
6138 ffr[i / 64] = 0;
6139 }
6140}
6141
6142
6143
6144
6145static inline QEMU_ALWAYS_INLINE
6146void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6147 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6148 const int esz, const int msz, const SVEContFault fault,
6149 sve_ldst1_host_fn *host_fn,
6150 sve_ldst1_tlb_fn *tlb_fn)
6151{
6152 const unsigned rd = simd_data(desc);
6153 void *vd = &env->vfp.zregs[rd];
6154 const intptr_t reg_max = simd_oprsz(desc);
6155 intptr_t reg_off, mem_off, reg_last;
6156 SVEContLdSt info;
6157 int flags;
6158 void *host;
6159
6160
6161 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6162
6163 memset(vd, 0, reg_max);
6164 return;
6165 }
6166 reg_off = info.reg_off_first[0];
6167
6168
6169 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6170
6171 tcg_debug_assert(fault == FAULT_NO);
6172 memset(vd, 0, reg_max);
6173 goto do_fault;
6174 }
6175
6176 mem_off = info.mem_off_first[0];
6177 flags = info.page[0].flags;
6178
6179
6180
6181
6182
6183 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
6184 mtedesc = 0;
6185 }
6186
6187 if (fault == FAULT_FIRST) {
6188
6189 if (mtedesc) {
6190 mte_check(env, mtedesc, addr + mem_off, retaddr);
6191 }
6192
6193
6194
6195
6196
6197 bool is_split = mem_off == info.mem_off_split;
6198 if (unlikely(flags != 0) || unlikely(is_split)) {
6199
6200
6201
6202
6203 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6204
6205
6206 swap_memzero(vd, reg_off);
6207 reg_off += 1 << esz;
6208 mem_off += 1 << msz;
6209 swap_memzero(vd + reg_off, reg_max - reg_off);
6210
6211 if (is_split) {
6212 goto second_page;
6213 }
6214 } else {
6215 memset(vd, 0, reg_max);
6216 }
6217 } else {
6218 memset(vd, 0, reg_max);
6219 if (unlikely(mem_off == info.mem_off_split)) {
6220
6221 flags |= info.page[1].flags;
6222 if (unlikely(flags & TLB_MMIO)) {
6223
6224 goto do_fault;
6225 }
6226 if (unlikely(flags & TLB_WATCHPOINT) &&
6227 (cpu_watchpoint_address_matches
6228 (env_cpu(env), addr + mem_off, 1 << msz)
6229 & BP_MEM_READ)) {
6230
6231 goto do_fault;
6232 }
6233 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6234 goto do_fault;
6235 }
6236
6237
6238
6239
6240 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6241 goto second_page;
6242 }
6243 }
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266 if (unlikely(flags & TLB_MMIO)) {
6267 goto do_fault;
6268 }
6269
6270 reg_last = info.reg_off_last[0];
6271 host = info.page[0].host;
6272
6273 do {
6274 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6275 do {
6276 if ((pg >> (reg_off & 63)) & 1) {
6277 if (unlikely(flags & TLB_WATCHPOINT) &&
6278 (cpu_watchpoint_address_matches
6279 (env_cpu(env), addr + mem_off, 1 << msz)
6280 & BP_MEM_READ)) {
6281 goto do_fault;
6282 }
6283 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6284 goto do_fault;
6285 }
6286 host_fn(vd, reg_off, host + mem_off);
6287 }
6288 reg_off += 1 << esz;
6289 mem_off += 1 << msz;
6290 } while (reg_off <= reg_last && (reg_off & 63));
6291 } while (reg_off <= reg_last);
6292
6293
6294
6295
6296
6297
6298
6299 reg_off = info.reg_off_split;
6300 if (reg_off >= 0) {
6301 goto do_fault;
6302 }
6303
6304 second_page:
6305 reg_off = info.reg_off_first[1];
6306 if (likely(reg_off < 0)) {
6307
6308 return;
6309 }
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319 do_fault:
6320 record_fault(env, reg_off, reg_max);
6321}
6322
6323static inline QEMU_ALWAYS_INLINE
6324void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6325 uint32_t desc, const uintptr_t retaddr,
6326 const int esz, const int msz, const SVEContFault fault,
6327 sve_ldst1_host_fn *host_fn,
6328 sve_ldst1_tlb_fn *tlb_fn)
6329{
6330 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6331 int bit55 = extract64(addr, 55, 1);
6332
6333
6334 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6335
6336
6337 if (!tbi_check(desc, bit55) ||
6338 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6339 mtedesc = 0;
6340 }
6341
6342 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6343 esz, msz, fault, host_fn, tlb_fn);
6344}
6345
6346#define DO_LDFF1_LDNF1_1(PART, ESZ) \
6347void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6348 target_ulong addr, uint32_t desc) \
6349{ \
6350 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6351 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6352} \
6353void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6354 target_ulong addr, uint32_t desc) \
6355{ \
6356 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6357 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6358} \
6359void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6360 target_ulong addr, uint32_t desc) \
6361{ \
6362 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6363 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6364} \
6365void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6366 target_ulong addr, uint32_t desc) \
6367{ \
6368 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6369 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6370}
6371
/*
 * As DO_LDFF1_LDNF1_1, but for memory elements larger than one byte
 * (log2 size MSZ): little- and big-endian accessors are expanded
 * separately.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}
6421
/* One-byte memory element, extended to element size ESZ.  */
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

/* Halfword memory element, both endiannesses.  */
DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

/* Word memory element.  */
DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

/* Doubleword memory element.  */
DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6444
6445
6446
6447
6448
/*
 * Common helper for all contiguous predicated stores of N interleaved
 * registers (ST1..ST4).  All pages touched by the store are probed
 * first, so that recognizable faults (and watchpoints / MTE failures)
 * are raised before any memory is written.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s); exit with exception for any invalid page.  */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements.  */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle MTE checks for all active elements.
     * mtedesc == 0 means MTE checking is disabled for this access.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page has TLB flags set (e.g. MMIO), so there is
         * no usable host address.  Store every active element through
         * the slow tlb function; a bus error mid-way cannot be avoided
         * and leaves the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: store elements on the first page via host addresses.  */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misaligned element,
     * if any.  Both pages were probed above, so this cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Store any remaining elements on the second page.  */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
6576
6577static inline QEMU_ALWAYS_INLINE
6578void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6579 uint32_t desc, const uintptr_t ra,
6580 const int esz, const int msz, const int N,
6581 sve_ldst1_host_fn *host_fn,
6582 sve_ldst1_tlb_fn *tlb_fn)
6583{
6584 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6585 int bit55 = extract64(addr, 55, 1);
6586
6587
6588 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6589
6590
6591 if (!tbi_check(desc, bit55) ||
6592 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6593 mtedesc = 0;
6594 }
6595
6596 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6597}
6598
/*
 * Expand contiguous store helpers, plain and MTE, for N interleaved
 * registers with a one-byte memory element (truncating from element
 * size ESZ as needed).
 */
#define DO_STN_1(N, NAME, ESZ) \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
} \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
}
6612
/* As DO_STN_1, but multi-byte memory elements in both endiannesses.  */
#define DO_STN_2(N, NAME, ESZ, MSZ) \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
} \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
}
6638
/* Instantiate ST1..ST4 contiguous store helpers.  */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
6667
6668
6669
6670
6671
6672
6673
6674
/*
 * Gather/scatter addressing: extract one offset element from the
 * offset vector at byte position reg_ofs.  zsu = zero-extended 32-bit,
 * zss = sign-extended 32-bit, zd = full 64-bit; the _s/_d suffix is
 * the size of the vector container holding the offset.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

/* 32-bit container, zero-extended 32-bit offset.  */
static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

/* 32-bit container, sign-extended 32-bit offset.  */
static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

/* 64-bit container, low 32 bits zero-extended as the offset.  */
static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

/* 64-bit container, low 32 bits sign-extended as the offset.  */
static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

/* 64-bit container, full 64-bit offset.  */
static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}
6701
/*
 * Common helper for all gather loads.  Loads into a scratch register,
 * so that the architectural destination is only written back once no
 * further exception can be raised.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /* Inactive elements yield zero in the result.  */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Bytes remaining in the page containing addr.  */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    host_fn(&scratch, reg_off, info.host);
                } else {
                    /*
                     * Element crosses the page boundary: probe the
                     * second page too, then use the slow tlb path.
                     */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            /* One predicate bit per vector byte; advance by esize.  */
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, reg_max);
}
6761
6762static inline QEMU_ALWAYS_INLINE
6763void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6764 target_ulong base, uint32_t desc, uintptr_t retaddr,
6765 int esize, int msize, zreg_off_fn *off_fn,
6766 sve_ldst1_host_fn *host_fn,
6767 sve_ldst1_tlb_fn *tlb_fn)
6768{
6769 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6770
6771 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6772
6773
6774
6775
6776
6777
6778
6779 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6780 esize, msize, off_fn, host_fn, tlb_fn);
6781}
6782
/*
 * Expand gather-load helpers, plain and MTE.  MEM names the memory
 * element and extension, OFS the offset-extraction function, MSZ the
 * log2 memory element size.  _S uses 4-byte, _D 8-byte vector elements.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6810
/* Instantiate gather loads by memory type, extension and offset form.  */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6877
6878
6879
6880
6881
6882
6883
/*
 * Common helper for all gather first-fault loads.  Only the first
 * active element may fault architecturally; any later element that
 * would fault instead terminates the load and is recorded via
 * record_fault (which updates FFR).
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults: any fault here is
     * delivered normally.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements.  */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Loop over the remaining elements, probing without raising faults;
     * any condition that would fault (bad page, MMIO, read watchpoint,
     * MTE mismatch) ends the load early via record_fault.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Treat page-crossing elements as faulting.  */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
6964
6965static inline QEMU_ALWAYS_INLINE
6966void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6967 target_ulong base, uint32_t desc, uintptr_t retaddr,
6968 const int esz, const int msz,
6969 zreg_off_fn *off_fn,
6970 sve_ldst1_host_fn *host_fn,
6971 sve_ldst1_tlb_fn *tlb_fn)
6972{
6973 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6974
6975 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6976
6977
6978
6979
6980
6981
6982
6983 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6984 esz, msz, off_fn, host_fn, tlb_fn);
6985}
6986
/*
 * Expand gather first-fault load helpers, plain and MTE, for 32-bit
 * (_S) and 64-bit (_D) vector elements.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
7018
/* Instantiate gather first-fault loads.  */
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7082
7083
7084
/*
 * Common helper for all scatter stores.  Every active element is
 * probed in a first pass so that all recognizable exceptions are
 * raised before memory is modified; the stores happen in a second
 * pass.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Pass 1: probe all active elements for host addresses and flags.
     * host[i] stays NULL for inactive and page-crossing elements.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.  Probe both
                     * pages, but do not record the host address, so
                     * that pass 2 uses the slow tlb function.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Pass 2: perform the stores.  Elements with a recorded host
     * address use the fast host function; active elements without one
     * (e.g. page-crossing) take the slow tlb path, which may still
     * raise exceptions we could not recognize above.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
7165
7166static inline QEMU_ALWAYS_INLINE
7167void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7168 target_ulong base, uint32_t desc, uintptr_t retaddr,
7169 int esize, int msize, zreg_off_fn *off_fn,
7170 sve_ldst1_host_fn *host_fn,
7171 sve_ldst1_tlb_fn *tlb_fn)
7172{
7173 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7174
7175 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7176
7177
7178
7179
7180
7181
7182
7183 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7184 esize, msize, off_fn, host_fn, tlb_fn);
7185}
7186
/*
 * Expand scatter-store helpers, plain and MTE, for 32-bit (_S) and
 * 64-bit (_D) vector elements.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7214
/* Instantiate scatter stores.  */
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7253
7254void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7255{
7256 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7257 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7258
7259 for (i = 0; i < opr_sz; ++i) {
7260 d[i] = n[i] ^ m[i] ^ k[i];
7261 }
7262}
7263
7264void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7265{
7266 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7267 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7268
7269 for (i = 0; i < opr_sz; ++i) {
7270 d[i] = n[i] ^ (m[i] & ~k[i]);
7271 }
7272}
7273
7274void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7275{
7276 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7277 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7278
7279 for (i = 0; i < opr_sz; ++i) {
7280 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7281 }
7282}
7283
7284void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7285{
7286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7287 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7288
7289 for (i = 0; i < opr_sz; ++i) {
7290 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7291 }
7292}
7293
7294void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7295{
7296 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7297 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7298
7299 for (i = 0; i < opr_sz; ++i) {
7300 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7301 }
7302}
7303
7304
7305
7306
7307
7308
/*
 * Return true when element N (in the low 8 << esz bits) occurs in
 * either of the vectors M0, M1, compared lane-wise at element size
 * 1 << esz bytes.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    /* XOR with the broadcast of n: a lane becomes zero iff it matched.  */
    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    /*
     * Classic zero-lane detection: (x - ones) & ~x sets a lane's sign
     * bit iff that lane is zero, i.e. iff it matched n.
     */
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
7323
/*
 * Expand SVE2 MATCH/NMATCH: for each active element of vn, set the
 * destination predicate bit when the element does (nmatch=false) or
 * does not (nmatch=true) occur within the corresponding 16-byte
 * segment of vm.  Returns NZCV flags over the result predicate.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        /* One 16-byte segment of vm, plus its 16 predicate bits.  */
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            /* Step over the elements within this 8-byte chunk of vn.  */
            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
7353
/* Expand MATCH (INV=false) and NMATCH (INV=true) at byte/halfword.  */
#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7367
/*
 * HISTCNT (32-bit): for each active element i, d[i] = number of
 * active elements j <= i with m[j] == n[i]; inactive elements
 * produce 0.
 */
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* Copy inputs overlapping d, since d is written before all of
       n and m have been read.  */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    /* i and j are byte offsets; predicate has one bit per byte.  */
    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
7404
/*
 * HISTCNT (64-bit): for each active element i, d[i] = number of
 * active elements j <= i with m[j] == n[i]; inactive elements
 * produce 0.
 */
void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* Copy inputs overlapping d, since d is written before all of
       n and m have been read.  */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    /* Element i is governed by bit 0 of predicate byte i.  */
    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
7436
7437
7438
7439
7440
7441
/*
 * Count how many bytes of M0 and M1 (16 bytes total) equal N, and
 * return those per-byte counts spread so that the low byte of the
 * result holds the total (callers accumulate the return shifted into
 * the byte position of N).
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    /* XOR with the broadcast of n: a byte becomes zero iff it matched.  */
    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * Per byte b of cmpX:
     *   (b & 0x7f) + 0x7f  carries into the msb iff the low 7 bits != 0;
     *   | b                folds in b's own msb;
     *   | 0x7f             sets all the low bits.
     * This yields 0xff for b != 0 and 0x7f for b == 0; inverting leaves
     * 0x80 exactly in the bytes that matched n, 0x00 elsewhere.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Shift cmp1's match flags down to bit 6 so the two flag sets do
     * not overlap, then count all matches with a single popcount.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
7473
/*
 * HISTSEG: within each 16-byte segment, replace every byte of vn with
 * the count of bytes in the corresponding segment of vm equal to it.
 */
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        /* Process one byte of n0 and n1 per iteration.  */
        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
7498
7499void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7500{
7501 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7502 int shr = simd_data(desc);
7503 int shl = 8 - shr;
7504 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7505 uint64_t *d = vd, *n = vn, *m = vm;
7506
7507 for (i = 0; i < opr_sz; ++i) {
7508 uint64_t t = n[i] ^ m[i];
7509 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7510 }
7511}
7512
7513void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7514{
7515 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7516 int shr = simd_data(desc);
7517 int shl = 16 - shr;
7518 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7519 uint64_t *d = vd, *n = vn, *m = vm;
7520
7521 for (i = 0; i < opr_sz; ++i) {
7522 uint64_t t = n[i] ^ m[i];
7523 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7524 }
7525}
7526
7527void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7528{
7529 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7530 int shr = simd_data(desc);
7531 uint32_t *d = vd, *n = vn, *m = vm;
7532
7533 for (i = 0; i < opr_sz; ++i) {
7534 d[i] = ror32(n[i] ^ m[i], shr);
7535 }
7536}
7537
7538void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7539 void *status, uint32_t desc)
7540{
7541 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7542
7543 for (s = 0; s < opr_sz; ++s) {
7544 float32 *n = vn + s * sizeof(float32) * 4;
7545 float32 *m = vm + s * sizeof(float32) * 4;
7546 float32 *a = va + s * sizeof(float32) * 4;
7547 float32 *d = vd + s * sizeof(float32) * 4;
7548 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7549 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7550 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7551 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7552 float32 p0, p1;
7553
7554
7555 p0 = float32_mul(n00, m00, status);
7556 p1 = float32_mul(n01, m01, status);
7557 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7558
7559
7560 p0 = float32_mul(n00, m10, status);
7561 p1 = float32_mul(n01, m11, status);
7562 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7563
7564
7565 p0 = float32_mul(n10, m00, status);
7566 p1 = float32_mul(n11, m01, status);
7567 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7568
7569
7570 p0 = float32_mul(n10, m10, status);
7571 p1 = float32_mul(n11, m11, status);
7572 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7573 }
7574}
7575
7576void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7577 void *status, uint32_t desc)
7578{
7579 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7580
7581 for (s = 0; s < opr_sz; ++s) {
7582 float64 *n = vn + s * sizeof(float64) * 4;
7583 float64 *m = vm + s * sizeof(float64) * 4;
7584 float64 *a = va + s * sizeof(float64) * 4;
7585 float64 *d = vd + s * sizeof(float64) * 4;
7586 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7587 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7588 float64 p0, p1;
7589
7590
7591 p0 = float64_mul(n00, m00, status);
7592 p1 = float64_mul(n01, m01, status);
7593 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7594
7595
7596 p0 = float64_mul(n00, m10, status);
7597 p1 = float64_mul(n01, m11, status);
7598 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7599
7600
7601 p0 = float64_mul(n10, m00, status);
7602 p1 = float64_mul(n11, m01, status);
7603 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7604
7605
7606 p0 = float64_mul(n10, m10, status);
7607 p1 = float64_mul(n11, m11, status);
7608 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7609 }
7610}
7611
/*
 * Predicated float convert-narrow "to top": convert each active wide
 * element of VN and store the narrow result into the upper (odd) half
 * of the corresponding wide slot of VD, leaving the lower half alone.
 *
 * The vector is walked backwards one 64-byte chunk at a time so that a
 * single predicate word (one bit per vector byte) is loaded per chunk;
 * the bit tested for each wide element is the one for its least
 * significant byte.  OP is the softfloat conversion routine and takes
 * the float_status pointer for rounding mode and exception flags.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
 intptr_t i = simd_oprsz(desc); \
 uint64_t *g = vg; \
 do { \
 uint64_t pg = g[(i - 1) >> 6]; \
 do { \
 i -= sizeof(TYPEW); \
 if (likely((pg >> (i & 63)) & 1)) { \
 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
 } \
 } while (i & 63); \
 } while (i != 0); \
}

/* BFCVTNT, FCVTNT (f32->f16) and FCVTNT (f64->f32). */
DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7632
/*
 * Predicated float convert-long "from top": read the narrow element in
 * the upper (odd) half of each active wide slot of VN, widen it, and
 * store the result as the full wide element of VD.
 *
 * Same backwards traversal as DO_FCVTNT: one 64-bit predicate word
 * (one bit per vector byte) is loaded per 64-byte chunk, and the bit
 * tested for each wide element is the one for its least significant
 * byte.  OP is the softfloat conversion routine and takes the
 * float_status pointer for rounding mode and exception flags.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
 intptr_t i = simd_oprsz(desc); \
 uint64_t *g = vg; \
 do { \
 uint64_t pg = g[(i - 1) >> 6]; \
 do { \
 i -= sizeof(TYPEW); \
 if (likely((pg >> (i & 63)) & 1)) { \
 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
 } \
 } while (i & 63); \
 } while (i != 0); \
}

/* FCVTLT (f16->f32) and FCVTLT (f32->f64). */
DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
7655