1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27#include "tcg/tcg.h"
28#include "vec_internal.h"
29#include "sve_ldst_internal.h"
30#include "hw/core/tcg-cpu-ops.h"
31
32
33
34
35
36
37
38
39
40
/* Initial NZCV for a predicate test: C set, N/Z/V clear (empty predicate). */
#define PREDTEST_INIT 1
42
43
44
45
/*
 * One step of a PTEST-style NZCV computation, scanning a predicate from
 * the lowest-numbered 64-bit chunk upward.  D is one chunk of the tested
 * predicate, G the matching chunk of the governing predicate; FLAGS
 * accumulates the result with N in bit 31, Z in bit 1, C in bit 0.
 * Bit 2 of FLAGS is used privately to mark "first active chunk seen".
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from the first D & G (g & -g is the lowest set bit).
           Use bit 2 to signal that the first G bit has been seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }
        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;
        /* Compute C from the last !(D & G) (pow2floor(g) is the highest
           set bit).  Replace the previous value.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
64
65
66
67
/*
 * The same as iter_predtest_fwd, but scanning the predicate from the
 * highest-numbered chunk downward, so "first" and "last" swap roles.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from the first (i.e. last-scanned) !(D & G).
           Use bit 2 to signal that the first G bit has been seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1;   /* set bit 2, clear the initial C */
            flags |= (d & pow2floor(g)) == 0;
        }
        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;
        /* Compute N from the last (i.e. first-scanned) D & G.
           Replace the previous value.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
86
87
/* Predicate test for a single 64-bit word.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
92
93
/* Predicate test over WORDS 64-bit chunks; the do/while assumes
 * WORDS >= 1, which the translator guarantees.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
106
107
/*
 * Expand the significant predicate bits for 32-bit elements (bits 0
 * and 4 of BYTE) into full 32-bit lane masks within a 64-bit word.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
117
/* Predicate logical operations operate on whole 64-bit words at a time;
 * the governing predicate G is folded directly into each FUNC so that
 * inactive bits are cleared in the result.  */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
/* SEL ignores G-masking: it selects N where G is set, M where clear. */
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
156
157
158
159
160
161
162
163
164
/* Fully general three-operand expander, controlled by a predicate.
 * The predicate is consumed 16 bits at a time; within each 16-byte
 * column one predicate bit governs each element, so PG is shifted by
 * sizeof(TYPE) per element.  H is the host-endian index adjustment
 * macro matching sizeof(TYPE) (H1 for 1, H1_2 for 2, H1_4 for 4).
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands: one predicate byte
 * governs each 64-bit element, so no index adjustment is needed.  */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)

/* Division by zero yields 0 per the SVE spec; the M == -1 special case
 * avoids the INT_MIN / -1 overflow trap in signed division.  */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
218
/* Predicated integer arithmetic/logical helpers, per element size. */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signedness of min/max/abd comes from the element TYPE. */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
278
279
280
/* Because the computation type is at least twice as large as required,
 * these serve both signed and unsigned mul-high: callers pass operands
 * already sign- or zero-extended as appropriate.
 * NOTE(review): do_mulh_h/do_mulh_s can overflow the signed computation
 * type for large unsigned inputs; this relies on wrap-around semantics
 * (QEMU builds with -fwrapv) — confirm build flags if reusing.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return prod >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return prod >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;
    return prod >> 32;
}
295
/* 64-bit mul-high via the 128-bit product helpers from host-utils:
 * muls64/mulu64 produce the full signed/unsigned 128-bit product in
 * (hi, lo); only the high half is wanted.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
309
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* SVE division exists only for 32- and 64-bit elements. */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift amount are significant, not modulo
 * the element size: ASR saturates the count at width-1 (sign fill),
 * while LSR/LSL produce 0 for counts >= the element width.  These
 * guards also keep the C shift itself within defined range.  */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
336
337DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
338DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
339DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
340
341DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
342DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
343DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
344
345DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
346DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
347DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
348
349DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
350DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
351DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
352
/* Signed add-and-accumulate-long-pairwise: sum the two sign-extended
 * narrow halves of N into the accumulator M.  */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    return m + (int8_t)n + (int8_t)(n >> 8);
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    return m + (int16_t)n + (int16_t)(n >> 16);
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    return m + (int32_t)n + (int32_t)(n >> 32);
}
370
/* SVE2 SADALP: destination element type is wide, source is the packed pair. */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
374
/* Unsigned add-and-accumulate-long-pairwise: sum the two zero-extended
 * narrow halves of N into the accumulator M.  */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    return m + (uint8_t)n + (uint8_t)(n >> 8);
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    return m + (uint16_t)n + (uint16_t)(n >> 16);
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    return m + (uint32_t)n + (uint32_t)(n >> 32);
}
392
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

/* Rounding shifts: do_[su]qrshl_* with sat pointer NULL means
 * "round but do not saturate" (see vec_internal.h helpers).  */
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/* The shift count is signed; cast narrows it back to element width
 * so negative (rightward) counts are seen as negative.  */
#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
416
417
418
419
420
421
422
423#define do_sqshl_b(n, m) \
424 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
425#define do_sqshl_h(n, m) \
426 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
427#define do_sqshl_s(n, m) \
428 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
429#define do_sqshl_d(n, m) \
430 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
431
432DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
433DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
434DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
435DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
436
437#define do_uqshl_b(n, m) \
438 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
439#define do_uqshl_h(n, m) \
440 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
441#define do_uqshl_s(n, m) \
442 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
443#define do_uqshl_d(n, m) \
444 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
445
446DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
447DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
448DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
449DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
450
451#define do_sqrshl_b(n, m) \
452 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
453#define do_sqrshl_h(n, m) \
454 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
455#define do_sqrshl_s(n, m) \
456 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
457#define do_sqrshl_d(n, m) \
458 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
459
460DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
461DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
462DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
463DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
464
465#undef do_sqrshl_d
466
467#define do_uqrshl_b(n, m) \
468 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
469#define do_uqrshl_h(n, m) \
470 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
471#define do_uqrshl_s(n, m) \
472 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
473#define do_uqrshl_d(n, m) \
474 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
475
476DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
477DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
478DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
479DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
480
481#undef do_uqrshl_d
482
/* Halving add: for b/h/s the sum fits in int64_t, so compute wide and
 * shift; for d, split the shift to avoid intermediate overflow.  */
#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

/* Rounding halving add: the +1 (or the OR of the low bits for d)
 * implements round-to-nearest-up on the discarded bit.  */
#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/* Halving subtract; the borrow term handles the discarded low bits. */
#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
521
/* Clamp VAL to [MIN, MAX]; used for 8/16/32-bit saturation where the
 * wide int64_t computation cannot itself overflow.  */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
526
/* Signed saturating add for narrow elements: compute wide, then clamp. */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
530
/* 64-bit signed saturating add.
 * The wrapped sum is computed in unsigned arithmetic so the helper is
 * well-defined even without -fwrapv (signed overflow is UB in ISO C).
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);

    /* Overflow iff the operands share a sign and the result differs. */
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* A wrapped-negative result means positive overflow, and
         * vice versa: saturate toward the operands' sign.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
540
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

/* Unsigned saturating add for narrow elements. */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
549
/* 64-bit unsigned saturating add: wraparound implies carry-out. */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    if (sum < n) {
        return UINT64_MAX;
    }
    return sum;
}
555
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

/* Signed saturating subtract for narrow elements. */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
564
/* 64-bit signed saturating subtract.
 * As with do_sqadd_d, compute the wrapped difference in unsigned
 * arithmetic to avoid signed-overflow UB without -fwrapv.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);

    /* Overflow iff the operands differ in sign and the result's sign
     * differs from the minuend's.  */
    if (((r ^ n) & (n ^ m)) < 0) {
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
574
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

/* Unsigned saturating subtract for narrow elements: clamp at zero. */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
583
/* 64-bit unsigned saturating subtract: clamp at zero on underflow. */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
588
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

/* SUQADD: signed N plus unsigned M, saturated to the signed range.
 * N arrives via an unsigned element type, hence the sign-extend cast. */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
600
/* 64-bit SUQADD: signed N plus unsigned M, saturated to [INT64_MIN,
 * INT64_MAX].  Only positive overflow is possible since M >= 0.  */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative; the uint64_t bit pattern already
             * reinterprets as the correct int64_t value.  */
        }
    } else {
        /* Both inputs are positive: check for overflow.
         * r < m detects unsigned wraparound; r > INT64_MAX detects
         * overflow of the signed range.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
623
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

/* USQADD: unsigned N plus signed M, saturated to the unsigned range.
 * M arrives via an unsigned element type, hence the sign-extend cast. */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
635
/* 64-bit USQADD: unsigned N plus signed M, saturated to [0, UINT64_MAX]. */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t sum = n + m;

    if (m < 0) {
        /* Negative addend: saturate to zero on underflow. */
        return n < -m ? 0 : sum;
    }
    /* Non-negative addend: saturate on carry-out. */
    return sum < n ? UINT64_MAX : sum;
}
645
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
653
654
655
656
657
658
659
/* Three-operand pairwise expander, controlled by a predicate: element
 * pairs are drawn from VN for the even destination elements and from
 * VM for the odd ones.  Both pair members are loaded before either
 * store so that VD may alias VN or VM.  */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
682
683
/* Similarly, specialized for 64-bit operands; elements are processed
 * two at a time, again loading both inputs before storing.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
            d[i] = OP(n0, n1); \
        } \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
        } \
    } \
}
701
/* SVE2 integer pairwise arithmetic. */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
729
/* As DO_ZPZZ_PAIR, but for floating-point operations which take an
 * extra float_status pointer (passed opaquely as void *).  */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
753
/* SVE2 floating-point pairwise arithmetic, via softfloat primitives. */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP
775
776
777
778
779
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide": one 64-bit element of VM is shared by all
 * the TYPE elements within the same 64-bit column.  */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

/* Wide-operand shifts exist only for 8/16/32-bit elements. */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
810
811
812
/* Fully general two-operand (unary) expander, controlled by a predicate. */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}
843
/* Count leading sign bits: clrsb32 counts from bit 31, so subtract
 * the width difference for the narrower element types.  */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, adjusted for element width as above. */
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element. */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical invert: 1 if the element is zero, else 0. */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* FP absolute value: clear the sign bit of the raw representation. */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* FP negate: toggle the sign bit of the raw representation. */
#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension of the low part of each element. */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Byte/halfword/word reversal within each element. */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
934
/* SME REVD: reverse the two 64-bit doublewords within each 128-bit
 * element.  The predicate is consulted per 128-bit element: only the
 * byte governing the low doubleword (pg[H1(i)]) is tested, matching
 * the instruction's quadword granularity.  */
void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}
950
/* Bit reversal within each element. */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* Saturating absolute value: abs(MIN) would overflow, so it saturates
 * to MAX (-min_ - 1).  min_ is the most negative value of the type. */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/* Saturating negate: likewise -MIN saturates to MAX. */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* URECPE/URSQRTE share the AdvSIMD estimate helpers. */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976
977
978
/* Three-operand expander, unpredicated, in which the third operand
 * is "wide": one 64-bit element of VM governs all TYPE elements in
 * the same 64-bit column.  */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1018
1019
1020
1021
1022
/* Three-operand widening expander, unpredicated: each narrow input is
 * selected from the top or bottom half of its wide column, per the
 * two selector bits in the descriptor (sel * sizeof(TYPEN) offsets
 * into the wide element).  */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}
1035
1036DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1037DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1038DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1039
1040DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1041DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1042DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1043
1044DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1045DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1046DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1047
1048DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1049DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1050DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1051
1052DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1053DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1054DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1055
1056DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1057DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1058DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1059
1060DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1061DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1062DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1063
1064DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1065DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1066DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067
1068
/*
 * Signed saturating doubling multiply long.  The operands arrive
 * already widened from the narrow source type, so the single-width
 * product cannot overflow; doubling is done via a saturating
 * self-add so that e.g. INT_MIN/2 * -2 saturates instead of wrapping.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
1092
/*
 * Wide + narrow ops (SADDW/USUBW etc.): ZN is already wide; only ZM
 * supplies narrow elements, taken from the bottom or top half as
 * selected by desc data bit 0.
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)      \
{                                                                   \
    intptr_t i, opr_sz = simd_oprsz(desc);                          \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                   \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                          \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                   \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                        \
    }                                                               \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB
1122
/*
 * Narrowing interleaved ops (EORBT/EORTB): operate on same-width
 * elements, but step by pairs; sel1/sel2 choose the even or odd
 * element of each pair for source and destination.  Note the result
 * is written back at offset sel1, i.e. into the half selected for ZN.
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
1142
/*
 * Widening multiply/abs-diff with accumulate (SABAL, SMLAL, UMLSL,
 * ...): both narrow sources use the same bottom/top half (desc data),
 * and the wide product is added to the accumulator ZA.
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP)                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);                    \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;                       \
    }                                                                   \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Negated product: turns the accumulate into a multiply-subtract. */
#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
1183
/*
 * Saturating extract-narrow.  XTNB writes the saturated narrow value
 * into the low half of each wide element (high half cleared by the
 * mask); XTNT writes it into the high (odd) narrow slot, leaving the
 * even slots of vd untouched.
 */
#define DO_XTNB(NAME, TYPE, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)             \
{                                                                \
    intptr_t i, opr_sz = simd_oprsz(desc);                       \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                 \
        TYPE nn = *(TYPE *)(vn + i);                             \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);      \
        *(TYPE *)(vd + i) = nn;                                  \
    }                                                            \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                     \
{                                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));       \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                         \
        TYPE nn = *(TYPE *)(vn + i);                                     \
        *(TYPEN *)(vd + i + odd) = OP(nn);                               \
    }                                                                    \
}

/* Saturate a wide value into the signed narrow range. */
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

/* Saturate a wide value into the unsigned narrow range. */
#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

/* SQXTUN*: signed input saturated to the unsigned narrow range. */
DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
1239
/*
 * 32-bit add-with-carry long.  Each 64-bit lane of ZM supplies the
 * incoming carry in bit 32; one 32-bit addend comes from the even or
 * odd element of ZN (sel), optionally inverted (inv = all-ones,
 * presumably the subtract form -- confirm against the decoder), plus
 * the low 32-bit element of ZA.
 */
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));       /* even/odd element of ZN */
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); /* 0 or ~0 */
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);    /* carry-in from bit 32 */

        d[i] = c + e1 + e2;
    }
}
1256
/*
 * 64-bit add-with-carry long.  Works on pairs of 64-bit lanes with a
 * 128-bit intermediate sum; the carry-in is bit 0 of the odd lane of
 * ZM and the full 128-bit result is written back across both lanes.
 */
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);            /* even/odd source lane */
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
1273
/*
 * Saturating doubling multiply-add/sub long: the doubled product of
 * the selected narrow halves is combined with the wide accumulator
 * using a saturating add (SQDMLAL) or subtract (SQDMLSL).
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP)         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
    }                                                                   \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
1303
/*
 * Complex integer multiply-add, rotated by rot * 90 degrees.
 * Elements are processed as (real, imag) pairs; sel_a picks the real
 * or imaginary source element and sub_r/sub_i choose add vs subtract
 * for each half of the pair, per the four rotation cases.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);               \
    int rot = simd_data(desc);                                          \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                             \
    bool sub_r = rot == 1 || rot == 2;                                  \
    bool sub_i = rot >= 2;                                              \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                            \
    for (i = 0; i < opr_sz; i += 2) {                                   \
        TYPE elt1_a = n[H(i + sel_a)];                                  \
        TYPE elt2_a = m[H(i + sel_a)];                                  \
        TYPE elt2_b = m[H(i + sel_b)];                                  \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);                   \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);           \
    }                                                                   \
}

/* Plain (non-saturating) complex multiply-add step. */
#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

/* Saturating rounding doubling variants; saturation flag discarded. */
#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342
/*
 * Indexed complex multiply-add: the ZM operand is a single complex
 * element per 128-bit segment, selected by idx; rot selects the
 * rotation exactly as in DO_CMLA_FUNC above.
 *
 * Fix: the destination was indexed with a hard-coded H2() while the
 * sources use the macro's H parameter.  For the 32-bit instantiations
 * (H == H4) the two differ on big-endian hosts, so d[] addressed a
 * different element than n[]/a[].  Use H for every access.
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                      \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;              \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                             \
    bool sub_r = rot == 1 || rot == 2;                                  \
    bool sub_i = rot >= 2;                                              \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                            \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {     \
        TYPE elt2_a = m[H(i + idx + sel_a)];                            \
        TYPE elt2_b = m[H(i + idx + sel_b)];                            \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                    \
            TYPE elt1_a = n[H(i + j + sel_a)];                          \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);       \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
        }                                                               \
    }                                                                   \
}
1363
/* Indexed CMLA / SQRDCMLAH exist only for 16- and 32-bit elements. */
DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
1377
1378
/*
 * One 32-bit lane of a complex dot product: accumulate two complex
 * byte pairs from n against the (possibly swapped) byte pair of m.
 * sel_a/sel_b choose real-vs-imaginary from m; sub_i is +1 or -1
 * and applies the rotation sign to the imaginary contribution.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; ++pair) {
        int sh = 16 * pair;
        int32_t re = (int8_t)(n >> sh);
        int32_t im = (int8_t)(n >> (sh + 8));
        int32_t ma = (int8_t)(m >> (sh + 8 * sel_a));
        int32_t mb = (int8_t)(m >> (sh + 8 * sel_b));

        a += re * ma + im * mb * sub_i;
    }
    return a;
}
1392
/*
 * One 64-bit lane of a complex dot product: as do_cdot_s, but with
 * two complex int16 pairs accumulated into a 64-bit total.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; ++pair) {
        int sh = 32 * pair;
        int64_t re = (int16_t)(n >> sh);
        int64_t im = (int16_t)(n >> (sh + 16));
        int64_t ma = (int16_t)(m >> (sh + 16 * sel_a));
        int64_t mb = (int16_t)(m >> (sh + 16 * sel_b));

        a += re * ma + im * mb * sub_i;
    }
    return a;
}
1406
/*
 * CDOT (vectors), 8-bit sources into 32-bit accumulators.
 * rot encodes the rotation: sel_a/sel_b choose the real/imaginary
 * byte of ZM and sub_i the sign of the imaginary term.
 */
void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}
1421
/*
 * CDOT (vectors), 16-bit sources into 64-bit accumulators.
 * Same rotation encoding as the 32-bit form above.
 */
void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}
1436
/*
 * CDOT (indexed), 8-bit sources: one 32-bit complex element of ZM per
 * 128-bit (4-element) segment, selected by idx.  H4() adjusts the
 * index for host byte order within the 64-bit storage units.
 */
void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];          /* shared across the segment */
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
1456
/*
 * CDOT (indexed), 16-bit sources: one 64-bit complex element of ZM
 * per 128-bit (2-element) segment.  No H adjustment needed for
 * 64-bit elements.
 */
void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];          /* shared across the segment */
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
1476
/*
 * Indexed multiply-accumulate: within each 128-bit segment, every
 * element of ZN is combined with the single ZM element at 'idx'.
 */
#define DO_ZZXZ(NAME, TYPE, H, OP)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
    intptr_t i, j, idx = simd_data(desc);                               \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[i];                                                 \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
        }                                                               \
    }                                                                   \
}

/* SQRDMLAH (indexed); the saturation flag output is discarded. */
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/* SQRDMLSH (indexed): the subtracting variant. */
#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
1512
/*
 * Indexed widening multiply-accumulate: the narrow ZM element at
 * 'idx' within each 128-byte... 128-bit segment multiplies the
 * bottom/top (sel) narrow elements of ZN, accumulating into the
 * wide ZA elements.
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP)                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                   \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                        \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                       \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                \
            TYPEW aa = *(TYPEW *)(va + HW(i + j));                      \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                \
        }                                                               \
    }                                                                   \
}

#define DO_MLA(N, M, A) (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A) (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Saturating doubling multiply, then saturating accumulate. */
#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1558
/*
 * Indexed widening multiply (no accumulator): same element selection
 * scheme as DO_ZZXW above.
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                   \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                        \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                       \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                    \
        }                                                               \
    }                                                                   \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
1584
/*
 * Bit permute ops (BEXT/BDEP/BGRP): apply OP element-wise, passing
 * the element width in bits as the third argument.
 */
#define DO_BITPERM(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                \
        TYPE nn = *(TYPE *)(vn + i);                            \
        TYPE mm = *(TYPE *)(vm + i);                            \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);       \
    }                                                           \
}
1595
/*
 * BEXT: gather the data bits selected by mask into the contiguous
 * low bits of the result (bit i of the result is the i-th masked
 * data bit, scanning from bit 0).  n is the element width in bits.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int src, dst = 0;

    for (src = 0; src < n; ++src) {
        if (mask & (1ull << src)) {
            out |= ((data >> src) & 1) << dst;
            dst++;
        }
    }
    return out;
}
1609
/* BEXT for each element size. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614
/*
 * BDEP: scatter the low bits of data into the result positions
 * selected by mask (the i-th low data bit lands in the i-th set
 * mask position, scanning from bit 0).  Inverse of bitextract.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int dst, src = 0;

    for (dst = 0; dst < n; ++dst) {
        if (mask & (1ull << dst)) {
            out |= ((data >> src) & 1) << dst;
            src++;
        }
    }
    return out;
}
1628
/* BDEP for each element size. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633
/*
 * BGRP: partition the data bits by mask -- bits in masked positions
 * are packed into the low end of the result, the remaining bits are
 * packed above them, each group preserving its original order.
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t grp = 0, rest = 0;
    int src, ngrp = 0, nrest = 0;

    for (src = 0; src < n; ++src) {
        uint64_t bit = (data >> src) & 1;
        if ((mask >> src) & 1) {
            grp |= bit << ngrp;
            ngrp++;
        } else {
            rest |= bit << nrest;
            nrest++;
        }
    }

    /* Grouped bits go low; the remainder is packed just above them. */
    return grp | (rest << ngrp);
}
1650
/* BGRP for each element size. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
1657
/*
 * Complex add with rotation (CADD/SQCADD): elements are (real, imag)
 * pairs.  For rotate-270 (sub_r set): re' = re + im2, im' = im - re2;
 * for rotate-90: re' = re - im2, im' = im + re2.  The two loop bodies
 * differ only in which half uses ADD_OP vs SUB_OP.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}

DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

/* Saturating variants (SQCADD). */
DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

#undef DO_CADD
1699
/*
 * Shift-left long (SSHLL/USHLL): widen the bottom or top narrow
 * elements (desc data bit 0) and shift left by the immediate held in
 * the remaining desc data bits.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN)                 \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)            \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);       \
    int shift = simd_data(desc) >> 1;                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));                \
        *(TYPEW *)(vd + HW(i)) = nn << shift;                   \
    }                                                           \
}

DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

#undef DO_ZZI_SHLL
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
/*
 * Predicated horizontal reduction for elements up to 32 bits.
 * The predicate is read 16 bits at a time (one bit per byte of
 * vector), and pg is shifted by the element size so bit 0 always
 * tests the current element; the inner loop covers one 16-byte
 * granule (i & 15).
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP)            \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)                \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    TYPERED ret = INIT;                                                 \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));                   \
                ret = OP(ret, nn);                                      \
            }                                                           \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);               \
        } while (i & 15);                                               \
    }                                                                   \
    return (TYPERET)ret;                                                \
}
1748
/*
 * Predicated horizontal reduction for 64-bit elements: one predicate
 * byte governs each element, so the predicate can be walked bytewise.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)                  \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)        \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPEE *n = vn;                                              \
    uint8_t *pg = vg;                                           \
    TYPER ret = INIT;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPEE nn = n[i];                                    \
            ret = OP(ret, nn);                                  \
        }                                                       \
    }                                                           \
    return ret;                                                 \
}
1764
/*
 * Reductions: INIT is the identity for each operation (0 for or/add,
 * -1 for and/umin, type min/max for smax/smin).  The saddv reductions
 * sign-extend into a 64-bit sum; there is no saddv_d because at 64
 * bits it would be identical to uaddv_d.
 */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1811
1812
/*
 * Unpredicated vector-with-scalar ops: the scalar s64 is truncated
 * to the element type and applied element-wise.
 */
#define DO_ZZI(NAME, TYPE, OP)                                          \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)      \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);               \
    TYPE s = s64, *d = vd, *n = vn;                                     \
    for (i = 0; i < opr_sz; ++i) {                                      \
        d[i] = OP(n[i], s);                                             \
    }                                                                   \
}

/* Reversed subtract: immediate minus vector element. */
#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* The elementary operation macros are no longer needed below. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1867
1868
1869
1870
/*
 * Return the bit index of the last active predicate element for the
 * given element size (after masking with pred_esz_masks[esz]), or
 * -(1 << esz) if no element is active -- chosen so that adding the
 * element size in bits yields 0, i.e. "next" starts at the beginning.
 */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
1884
/*
 * PFIRST: set the first active element of PD (first set bit of PG),
 * then return the NZCV predicate-test flags for the updated value.
 * Bit 2 of flags is set by iter_predtest_fwd once the first active
 * element has been seen, so the d[] update happens at most once.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
1908
/*
 * PNEXT: find the next active element of PG after the last active
 * element of PD, write a predicate with only that element set (or
 * all-zero if none), and return the predicate-test flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Start the search one element past the last active element of D.  */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Scan G for the next active element at or after 'next'.
     * If none is found, next ends >= words * 64, which writes all-zero below.
     */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);    /* ignore bits below next */
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write out the single found element (if any) and compute flags.  */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
1952
1953
1954
1955
1956
1957void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958{
1959 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1960 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1961 uint64_t *d = vd, *n = vn;
1962 uint8_t *pg = vg;
1963
1964 for (i = 0; i < opr_sz; i += 1) {
1965 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1966 }
1967}
1968
1969void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970{
1971 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973 uint64_t *d = vd, *n = vn;
1974 uint8_t *pg = vg;
1975
1976 for (i = 0; i < opr_sz; i += 1) {
1977 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1978 }
1979}
1980
1981void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982{
1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985 uint64_t *d = vd, *n = vn;
1986 uint8_t *pg = vg;
1987
1988 for (i = 0; i < opr_sz; i += 1) {
1989 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1990 }
1991}
1992
1993void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994{
1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996 uint64_t *d = vd, *n = vn;
1997 uint8_t *pg = vg;
1998 uint8_t inv = simd_data(desc);
1999
2000 for (i = 0; i < opr_sz; i += 1) {
2001 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2002 }
2003}
2004
2005
2006
/*
 * Expand a predicated two-operand (vector, immediate) operation for
 * sub-64-bit element sizes.  The vector is walked in 16-byte chunks;
 * one predicate bit per byte governs each element, consumed at
 * sizeof(TYPE) granularity.  The immediate arrives in simd_data.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
2023
2024
/*
 * As DO_ZPZI, but for 64-bit elements: one predicate byte per element,
 * with only its low bit significant; no host-endian swizzle needed on
 * the data.
 */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}
2039
/* Plain shifts; the operand type determines logical vs arithmetic.  */
#define DO_SHR(N, M) (N >> M)
#define DO_SHL(N, M) (N << M)

/*
 * Arithmetic shift right with round-toward-zero: for negative N,
 * bias by (2^M - 1) before shifting, giving signed division by 2^M.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2047
/*
 * Unsigned rounding shift right: shift X right by SH, adding back the
 * last bit shifted out.  A shift of exactly 64 leaves only the rounding
 * bit (bit 63); larger shifts produce zero.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh >= 64) {
        return sh == 64 ? x >> 63 : 0;
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}
2058
/*
 * Signed rounding shift right: shift X right by SH, adding back the
 * last bit shifted out.  For shifts of 64 or more the rounding bit
 * exactly cancels the shifted-in sign, so the result is always zero.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh >= 64) {
        return 0;
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}
2068
/* Predicated shift-by-immediate for each element size.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* ASRD: arithmetic shift right for divide (round toward zero).  */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 saturating shift-left by immediate.  */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

/* SVE2 rounding shift-right by immediate.  */
DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2109
/*
 * SQSHLU: signed input, unsigned saturating result.  The saturation
 * flag from do_suqrshl_* is discarded here.  Note that the last macro
 * shares the name of the four-argument function do_suqrshl_d it
 * expands to; function-like macros do not recurse, so the call inside
 * the expansion resolves to the function.
 */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2127
/*
 * Shift-right-narrow, bottom: narrow each wide element and store it in
 * the low half of the wide slot.  Writing the full TYPEW with the
 * narrowed value zero-extends, clearing the odd (top) narrow elements.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + i); \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
    } \
}

/*
 * Shift-right-narrow, top: narrow each wide element into the odd
 * narrow slots of D (offset by sizeof(TYPEN)), leaving the even slots
 * untouched.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
    } \
}
2149
/* Plain and rounding shift-right-narrow.  */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/*
 * Signed shift-right, saturate to unsigned narrow (SQSHRUN).  The
 * 64-bit variant clamps the shift at 63 so a shift of 64 still
 * preserves the sign for saturation.
 */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* Signed rounding shift-right, saturate to unsigned narrow.  */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/* Signed shift-right, saturate to signed narrow.  */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* Signed rounding shift-right, saturate to signed narrow.  */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)

/* Unsigned shift-right, saturate to unsigned narrow.  */
#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)

/* Unsigned rounding shift-right, saturate to unsigned narrow.  */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
2241
/*
 * Binary narrowing operation, bottom: combine a wide element from N
 * and M, narrow, and store into the low half of the wide slot.
 * The full-width TYPEW store zero-extends, clearing the top half.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + i); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
    } \
}

/*
 * Binary narrowing operation, top: as above but store into the odd
 * narrow slots of D, leaving the even slots untouched.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        TYPEW mm = *(TYPEW *)(vm + HW(i)); \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
    } \
}
2263
/* High-half narrowing add/sub, with and without rounding bias.  */
#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH) ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)

DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)

DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)

DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)

DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300
#undef DO_RSUBHN
#undef DO_SUBHN
#undef DO_RADDHN
#undef DO_ADDHN

/*
 * Also undefine DO_BINOPNT, for consistency: every other expansion
 * macro in this file is undefined once its instantiations are done,
 * and DO_BINOPNT (defined alongside DO_BINOPNB) was previously left
 * defined past this point.
 */
#undef DO_BINOPNB
#undef DO_BINOPNT
2307
2308
2309
/*
 * Fully general predicated three-operand (accumulator, two vectors)
 * operation for sub-64-bit elements, walking 16-byte chunks with one
 * predicate bit per byte, consumed at element granularity.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                TYPE aa = *(TYPE *)(va + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* As DO_ZPZZZ, but for 64-bit elements with one predicate byte each.  */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *a = va, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE aa = a[i], nn = n[i], mm = m[i]; \
            d[i] = OP(aa, nn, mm); \
        } \
    } \
}
2344
/* Predicated multiply-accumulate and multiply-subtract.  */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
2364
2365void HELPER(sve_index_b)(void *vd, uint32_t start,
2366 uint32_t incr, uint32_t desc)
2367{
2368 intptr_t i, opr_sz = simd_oprsz(desc);
2369 uint8_t *d = vd;
2370 for (i = 0; i < opr_sz; i += 1) {
2371 d[H1(i)] = start + i * incr;
2372 }
2373}
2374
2375void HELPER(sve_index_h)(void *vd, uint32_t start,
2376 uint32_t incr, uint32_t desc)
2377{
2378 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2379 uint16_t *d = vd;
2380 for (i = 0; i < opr_sz; i += 1) {
2381 d[H2(i)] = start + i * incr;
2382 }
2383}
2384
2385void HELPER(sve_index_s)(void *vd, uint32_t start,
2386 uint32_t incr, uint32_t desc)
2387{
2388 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2389 uint32_t *d = vd;
2390 for (i = 0; i < opr_sz; i += 1) {
2391 d[H4(i)] = start + i * incr;
2392 }
2393}
2394
2395void HELPER(sve_index_d)(void *vd, uint64_t start,
2396 uint64_t incr, uint32_t desc)
2397{
2398 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2399 uint64_t *d = vd;
2400 for (i = 0; i < opr_sz; i += 1) {
2401 d[i] = start + i * incr;
2402 }
2403}
2404
2405void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406{
2407 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2408 uint32_t sh = simd_data(desc);
2409 uint32_t *d = vd, *n = vn, *m = vm;
2410 for (i = 0; i < opr_sz; i += 1) {
2411 d[i] = n[i] + (m[i] << sh);
2412 }
2413}
2414
2415void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416{
2417 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2418 uint64_t sh = simd_data(desc);
2419 uint64_t *d = vd, *n = vn, *m = vm;
2420 for (i = 0; i < opr_sz; i += 1) {
2421 d[i] = n[i] + (m[i] << sh);
2422 }
2423}
2424
2425void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426{
2427 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428 uint64_t sh = simd_data(desc);
2429 uint64_t *d = vd, *n = vn, *m = vm;
2430 for (i = 0; i < opr_sz; i += 1) {
2431 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2432 }
2433}
2434
2435void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436{
2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438 uint64_t sh = simd_data(desc);
2439 uint64_t *d = vd, *n = vn, *m = vm;
2440 for (i = 0; i < opr_sz; i += 1) {
2441 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2442 }
2443}
2444
/*
 * FEXPA, half precision: the low 5 bits of each element index the
 * coefficient table (fraction bits of the result); the next 5 bits
 * are copied into the float16 exponent field (bits 10..14).
 * NOTE(review): per the Arm ARM the table entries appear to be the
 * fraction of 2^(i/32) — confirm against the FEXPA pseudocode.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-down versions of the double-precision table.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}
2464
/*
 * FEXPA, single precision: the low 6 bits index the coefficient table
 * (fraction bits of the result); the next 8 bits are copied into the
 * float32 exponent field (bits 23..30).
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-down versions of the double-precision table.  */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}
2496
/*
 * FEXPA, double precision: the low 6 bits index the coefficient table
 * (fraction bits of the result); the next 11 bits are copied into the
 * float64 exponent field (bits 52..62).
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are copied from the FEXPA coefficient table.  */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
2534
2535void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536{
2537 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2538 uint16_t *d = vd, *n = vn, *m = vm;
2539 for (i = 0; i < opr_sz; i += 1) {
2540 uint16_t nn = n[i];
2541 uint16_t mm = m[i];
2542 if (mm & 1) {
2543 nn = float16_one;
2544 }
2545 d[i] = nn ^ (mm & 2) << 14;
2546 }
2547}
2548
2549void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550{
2551 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2552 uint32_t *d = vd, *n = vn, *m = vm;
2553 for (i = 0; i < opr_sz; i += 1) {
2554 uint32_t nn = n[i];
2555 uint32_t mm = m[i];
2556 if (mm & 1) {
2557 nn = float32_one;
2558 }
2559 d[i] = nn ^ (mm & 2) << 30;
2560 }
2561}
2562
2563void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564{
2565 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566 uint64_t *d = vd, *n = vn, *m = vm;
2567 for (i = 0; i < opr_sz; i += 1) {
2568 uint64_t nn = n[i];
2569 uint64_t mm = m[i];
2570 if (mm & 1) {
2571 nn = float64_one;
2572 }
2573 d[i] = nn ^ (mm & 2) << 62;
2574 }
2575}
2576
2577
2578
2579
2580
2581void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582{
2583 intptr_t i, oprsz = simd_oprsz(desc);
2584
2585 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2586 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2587 }
2588}
2589
2590void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591{
2592 intptr_t i, oprsz = simd_oprsz(desc);
2593
2594 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2595 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2596 }
2597}
2598
2599void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600{
2601 intptr_t i, oprsz = simd_oprsz(desc);
2602
2603 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2604 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2605 }
2606}
2607
2608void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609{
2610 intptr_t i, oprsz = simd_oprsz(desc);
2611
2612 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2613 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2614 }
2615}
2616
2617
2618
2619
2620
2621void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622{
2623 intptr_t i, oprsz = simd_oprsz(desc);
2624
2625 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2626 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2627 }
2628}
2629
2630void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631{
2632 intptr_t i, oprsz = simd_oprsz(desc);
2633
2634 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2635 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2636 }
2637}
2638
2639void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640{
2641 intptr_t i, oprsz = simd_oprsz(desc);
2642
2643 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2644 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2645 }
2646}
2647
2648void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649{
2650 intptr_t i, oprsz = simd_oprsz(desc);
2651
2652 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2653 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2654 }
2655}
2656
2657void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658{
2659 intptr_t i, oprsz = simd_oprsz(desc);
2660
2661 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2662 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2663 }
2664}
2665
2666
2667
2668
2669void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2670 uint64_t mm, uint32_t desc)
2671{
2672 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2673 uint64_t *d = vd, *n = vn;
2674 uint8_t *pg = vg;
2675
2676 mm = dup_const(MO_8, mm);
2677 for (i = 0; i < opr_sz; i += 1) {
2678 uint64_t nn = n[i];
2679 uint64_t pp = expand_pred_b(pg[H1(i)]);
2680 d[i] = (mm & pp) | (nn & ~pp);
2681 }
2682}
2683
2684void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2685 uint64_t mm, uint32_t desc)
2686{
2687 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2688 uint64_t *d = vd, *n = vn;
2689 uint8_t *pg = vg;
2690
2691 mm = dup_const(MO_16, mm);
2692 for (i = 0; i < opr_sz; i += 1) {
2693 uint64_t nn = n[i];
2694 uint64_t pp = expand_pred_h(pg[H1(i)]);
2695 d[i] = (mm & pp) | (nn & ~pp);
2696 }
2697}
2698
2699void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2700 uint64_t mm, uint32_t desc)
2701{
2702 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703 uint64_t *d = vd, *n = vn;
2704 uint8_t *pg = vg;
2705
2706 mm = dup_const(MO_32, mm);
2707 for (i = 0; i < opr_sz; i += 1) {
2708 uint64_t nn = n[i];
2709 uint64_t pp = expand_pred_s(pg[H1(i)]);
2710 d[i] = (mm & pp) | (nn & ~pp);
2711 }
2712}
2713
2714void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2715 uint64_t mm, uint32_t desc)
2716{
2717 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2718 uint64_t *d = vd, *n = vn;
2719 uint8_t *pg = vg;
2720
2721 for (i = 0; i < opr_sz; i += 1) {
2722 uint64_t nn = n[i];
2723 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2724 }
2725}
2726
2727void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728{
2729 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730 uint64_t *d = vd;
2731 uint8_t *pg = vg;
2732
2733 val = dup_const(MO_8, val);
2734 for (i = 0; i < opr_sz; i += 1) {
2735 d[i] = val & expand_pred_b(pg[H1(i)]);
2736 }
2737}
2738
2739void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740{
2741 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742 uint64_t *d = vd;
2743 uint8_t *pg = vg;
2744
2745 val = dup_const(MO_16, val);
2746 for (i = 0; i < opr_sz; i += 1) {
2747 d[i] = val & expand_pred_h(pg[H1(i)]);
2748 }
2749}
2750
2751void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752{
2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754 uint64_t *d = vd;
2755 uint8_t *pg = vg;
2756
2757 val = dup_const(MO_32, val);
2758 for (i = 0; i < opr_sz; i += 1) {
2759 d[i] = val & expand_pred_s(pg[H1(i)]);
2760 }
2761}
2762
2763void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764{
2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766 uint64_t *d = vd;
2767 uint8_t *pg = vg;
2768
2769 for (i = 0; i < opr_sz; i += 1) {
2770 d[i] = (pg[H1(i)] & 1 ? val : 0);
2771 }
2772}
2773
2774
2775
2776
/*
 * memmove over a vector register whose bytes may be stored in
 * host-swizzled (big-endian) order.  O is the coarsest alignment of
 * the pointers and length; on little-endian hosts the layout is flat,
 * so a plain memmove suffices.  For each granule size the copy
 * direction is chosen by the overlap of D and S, exactly as memmove
 * requires.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#if !HOST_BIG_ENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* 4-byte aligned: copy words, forward or backward per overlap.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 2-byte aligned: copy halfwords.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Unaligned: copy bytes.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
2833
2834
/*
 * memset(..., 0, ...) over a possibly host-swizzled vector register;
 * see swap_memmove above for the layout reasoning.
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2874
/*
 * EXT: concatenate the high (opr_sz - n_ofs) bytes of N with the low
 * n_ofs bytes of M.  The three branches order the moves so that each
 * possible aliasing of D with N and/or M reads data before it is
 * overwritten.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
2895
/*
 * INSR: shift the vector up by one element and insert VAL as the new
 * first element.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
    *(TYPE *)(vd + H(0)) = val; \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, H1_8)

#undef DO_INSR
2910
2911void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912{
2913 intptr_t i, j, opr_sz = simd_oprsz(desc);
2914 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2915 uint64_t f = *(uint64_t *)(vn + i);
2916 uint64_t b = *(uint64_t *)(vn + j);
2917 *(uint64_t *)(vd + i) = bswap64(b);
2918 *(uint64_t *)(vd + j) = bswap64(f);
2919 }
2920}
2921
2922void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923{
2924 intptr_t i, j, opr_sz = simd_oprsz(desc);
2925 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2926 uint64_t f = *(uint64_t *)(vn + i);
2927 uint64_t b = *(uint64_t *)(vn + j);
2928 *(uint64_t *)(vd + i) = hswap64(b);
2929 *(uint64_t *)(vd + j) = hswap64(f);
2930 }
2931}
2932
2933void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934{
2935 intptr_t i, j, opr_sz = simd_oprsz(desc);
2936 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2937 uint64_t f = *(uint64_t *)(vn + i);
2938 uint64_t b = *(uint64_t *)(vn + j);
2939 *(uint64_t *)(vd + i) = rol64(b, 32);
2940 *(uint64_t *)(vd + j) = rol64(f, 32);
2941 }
2942}
2943
2944void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945{
2946 intptr_t i, j, opr_sz = simd_oprsz(desc);
2947 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948 uint64_t f = *(uint64_t *)(vn + i);
2949 uint64_t b = *(uint64_t *)(vn + j);
2950 *(uint64_t *)(vd + i) = b;
2951 *(uint64_t *)(vd + j) = f;
2952 }
2953}
2954
2955typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956
2957static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2958 bool is_tbx, tb_impl_fn *fn)
2959{
2960 ARMVectorReg scratch;
2961 uintptr_t oprsz = simd_oprsz(desc);
2962
2963 if (unlikely(vd == vn)) {
2964 vn = memcpy(&scratch, vn, oprsz);
2965 }
2966
2967 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2968}
2969
/*
 * Two-table form of do_tbl1.  At most one table can need copying out
 * of the way of the destination: if both table halves alias vd, they
 * alias each other too, so a single scratch register suffices.
 */
static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
                           uint32_t desc, bool is_tbx, tb_impl_fn *fn)
{
    ARMVectorReg scratch;
    uintptr_t oprsz = simd_oprsz(desc);

    if (unlikely(vd == vn0)) {
        vn0 = memcpy(&scratch, vn0, oprsz);
        if (vd == vn1) {
            /* Both tables alias vd; share the one copy. */
            vn1 = vn0;
        }
    } else if (unlikely(vd == vn1)) {
        vn1 = memcpy(&scratch, vn1, oprsz);
    }

    fn(vd, vn0, vn1, vm, oprsz, is_tbx);
}
2987
2988#define DO_TB(SUFF, TYPE, H) \
2989static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
2990 void *vm, uintptr_t oprsz, bool is_tbx) \
2991{ \
2992 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
2993 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
2994 for (i = 0; i < nelem; ++i) { \
2995 TYPE index = indexes[H1(i)], val = 0; \
2996 if (index < nelem) { \
2997 val = tbl0[H(index)]; \
2998 } else { \
2999 index -= nelem; \
3000 if (tbl1 && index < nelem) { \
3001 val = tbl1[H(index)]; \
3002 } else if (is_tbx) { \
3003 continue; \
3004 } \
3005 } \
3006 d[H(i)] = val; \
3007 } \
3008} \
3009void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3010{ \
3011 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3012} \
3013void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3014 void *vm, uint32_t desc) \
3015{ \
3016 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3017} \
3018void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3019{ \
3020 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3021}
3022
3023DO_TB(b, uint8_t, H1)
3024DO_TB(h, uint16_t, H2)
3025DO_TB(s, uint32_t, H4)
3026DO_TB(d, uint64_t, H8)
3027
3028#undef DO_TB
3029
3030#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3031void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3032{ \
3033 intptr_t i, opr_sz = simd_oprsz(desc); \
3034 TYPED *d = vd; \
3035 TYPES *n = vn; \
3036 ARMVectorReg tmp; \
3037 if (unlikely(vn - vd < opr_sz)) { \
3038 n = memcpy(&tmp, n, opr_sz / 2); \
3039 } \
3040 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3041 d[HD(i)] = n[HS(i)]; \
3042 } \
3043}
3044
3045DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3046DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3047DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3048
3049DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3050DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3051DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3052
3053#undef DO_UNPK
3054
3055
3056
3057
3058
/* Mask of the low bit of each 2**(i+1)-bit group, for i = 0..4.
   Shared by the predicate bit expand/compress/transpose helpers. */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};

/*
 * Spread the low 32 bits of X so that each group of (1 << N) bits is
 * followed by an equal-sized group of zeros (the classic interleave-
 * with-zero construction, stopped early at level N).
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int lvl;

    x &= 0xffffffffu;
    for (lvl = 4; lvl >= n; lvl--) {
        int shift = 1 << lvl;
        x = (x | (x << shift)) & even_bit_esz_masks[lvl];
    }
    return x;
}

/*
 * Inverse of expand_bits: gather every other group of (1 << N) bits
 * of X back into the low 32 bits, discarding the zero groups.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int lvl;

    for (lvl = n; lvl <= 4; lvl++) {
        int shift = 1 << lvl;
        x &= even_bit_esz_masks[lvl];
        x |= x >> shift;
    }
    return x & 0xffffffffu;
}
3100
/*
 * Predicate ZIP1/ZIP2: interleave the element bits of the low (DATA=0)
 * or high (DATA=1) halves of VN and VM into VD, by expanding each
 * source half with zero gaps and merging M into the gaps.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The whole result fits in one 64-bit word. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;   /* half the predicate, in bits */

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* The output grows faster than the input is consumed, so
           operands aliasing vd must be copied to scratch first. */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        /* For ZIP2, start reading at the upper half (byte offset). */
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Multiple of 8 bytes: consume 32 source bits per
               64-bit output word. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;   /* convert byte offset to uint32_t index */

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd size: work a byte of input at a time, producing
               16 bits of output per step. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
3164
/*
 * Predicate UZP1/UZP2: concatenate the even (DATA=0) or odd (DATA=1)
 * element bits of VN then VM into VD, compressing out the gaps.
 * compress_bits yields at most 32 bits per 64-bit input word, so each
 * output word is assembled from two input words.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Single-word result: low half from n, high half from m. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* The m half of the output lands in the upper half of vd;
           copy m to scratch if it aliases the destination. */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* The n operand fills the lower half of vd, one output word
           per two input words. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /* When oprsz is not a multiple of 16 bytes, the final input
           pair contributes fewer than 64 output bits; the two halves
           are joined at final_shift instead of 32, and the m result
           is staged in tmp_m and moved into place so it can start at
           an unaligned byte (oprsz / 2). */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            /* Same again for m, into the scratch register. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned case: write the m half directly after n's. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3233
3234void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235{
3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239 uint64_t *d = vd, *n = vn, *m = vm;
3240 uint64_t mask;
3241 int shr, shl;
3242 intptr_t i;
3243
3244 shl = 1 << esz;
3245 shr = 0;
3246 mask = even_bit_esz_masks[esz];
3247 if (odd) {
3248 mask <<= shl;
3249 shr = shl;
3250 shl = 0;
3251 }
3252
3253 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3254 uint64_t nn = (n[i] & mask) >> shr;
3255 uint64_t mm = (m[i] & mask) << shl;
3256 d[i] = nn + mm;
3257 }
3258}
3259
3260
3261static uint64_t reverse_bits_64(uint64_t x, int n)
3262{
3263 int i, sh;
3264
3265 x = bswap64(x);
3266 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3267 uint64_t mask = even_bit_esz_masks[i];
3268 x = ((x & mask) << sh) | ((x >> sh) & mask);
3269 }
3270 return x;
3271}
3272
/*
 * Byte-sized counterpart of reverse_bits_64: reverse the order of
 * the (1 << N)-bit groups within a single byte by swapping nibbles,
 * then pairs, then single bits, stopping at level N.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int lvl, sh;

    for (lvl = 2, sh = 4; lvl >= n; lvl--, sh >>= 1) {
        x = ((x & mask[lvl]) << sh) | ((x >> sh) & mask[lvl]);
    }
    return x;
}
3283
/* Predicate REV: reverse the element bits of VN into VD. */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Single word: shift the live bits up to the msb, then a full
           64-bit group reversal lands them reversed at the bottom. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Multiple of 16 bytes: exchange mirrored 64-bit words,
           reversing the groups within each.  Loads precede stores,
           so vd == vn is safe. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Otherwise, work byte by byte from both ends. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
3313
/*
 * Predicate PUNPKLO/PUNPKHI: widen the low (DATA=0) or high (DATA=1)
 * half of VN, inserting a zero bit after every source bit.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Result fits in a single 64-bit word. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;   /* half the predicate, in bits */

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* The output grows faster than the input is consumed;
           copy vn to scratch if it lies within the destination. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        /* For the high form, start reading at the upper half. */
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Multiple of 8 bytes: 32 input bits per output word. */
            uint32_t *n = vn;
            high >>= 2;   /* byte offset to uint32_t index */

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd size: one input byte yields 16 output bits. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
3359
3360#define DO_ZIP(NAME, TYPE, H) \
3361void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3362{ \
3363 intptr_t oprsz = simd_oprsz(desc); \
3364 intptr_t odd_ofs = simd_data(desc); \
3365 intptr_t i, oprsz_2 = oprsz / 2; \
3366 ARMVectorReg tmp_n, tmp_m; \
3367
3368 \
3369 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3370 vn = memcpy(&tmp_n, vn, oprsz); \
3371 } \
3372 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3373 vm = memcpy(&tmp_m, vm, oprsz); \
3374 } \
3375 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3376 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3377 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3378 *(TYPE *)(vm + odd_ofs + H(i)); \
3379 } \
3380 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3381 memset(vd + oprsz - 16, 0, 16); \
3382 } \
3383}
3384
3385DO_ZIP(sve_zip_b, uint8_t, H1)
3386DO_ZIP(sve_zip_h, uint16_t, H1_2)
3387DO_ZIP(sve_zip_s, uint32_t, H1_4)
3388DO_ZIP(sve_zip_d, uint64_t, H1_8)
3389DO_ZIP(sve2_zip_q, Int128, )
3390
/*
 * Vector UZP1/UZP2: concatenate the even (odd_ofs == 0) or odd
 * (odd_ofs == sizeof(TYPE)) elements of VN then VM into VD.
 * The read index p advances twice as fast as the write index i, so
 * reads from VN stay ahead of writes to VD and VN needs no scratch
 * copy; VM is copied if it overlaps the destination.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i, p;                                                     \
    ARMVectorReg tmp_m;                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
        vm = memcpy(&tmp_m, vm, oprsz);                                \
    }                                                                  \
    i = 0, p = odd_ofs;                                                \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    p -= oprsz;                                                        \
    do {                                                               \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
    } while (p < oprsz);                                               \
    tcg_debug_assert(i == oprsz);                                      \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3419
/*
 * Vector TRN1/TRN2: for each pair of element slots, take element
 * i + odd_ofs from VN (even output slot) and from VM (odd output
 * slot).  Both sources are read before either result is stored, so
 * overlap with VD is safe.  For 128-bit elements, an odd multiple of
 * 16 bytes leaves a final unpaired 16-byte chunk, which is zeroed.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
    }                                                                  \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
        memset(vd + oprsz - 16, 0, 16);                                \
    }                                                                  \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, H1_8)
DO_TRN(sve2_trn_q, Int128, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
3446
3447void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448{
3449 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3450 uint32_t *d = vd, *n = vn;
3451 uint8_t *pg = vg;
3452
3453 for (i = j = 0; i < opr_sz; i++) {
3454 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3455 d[H4(j)] = n[H4(i)];
3456 j++;
3457 }
3458 }
3459 for (; j < opr_sz; j++) {
3460 d[H4(j)] = 0;
3461 }
3462}
3463
3464void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465{
3466 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3467 uint64_t *d = vd, *n = vn;
3468 uint8_t *pg = vg;
3469
3470 for (i = j = 0; i < opr_sz; i++) {
3471 if (pg[H1(i)] & 1) {
3472 d[j] = n[i];
3473 j++;
3474 }
3475 }
3476 for (; j < opr_sz; j++) {
3477 d[j] = 0;
3478 }
3479}
3480
3481
3482
3483
3484
/*
 * Return the index of the last active element of VG, or -1 if none
 * (delegates to last_active_element with the word count and esz
 * decoded from pred_desc).
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}
3492
/*
 * SPLICE: copy the elements of VN between the first and last active
 * elements of VG to the bottom of VD, then fill the remainder from VM.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Scan backwards for the first/last predicate words with any
       canonical element bit set. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word index + bit position into byte offsets within
           the vector (one predicate bit per vector byte), and compute
           the span length including the final element. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    /* Append from vm; with no active elements this copies all of vm. */
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3529
/*
 * SEL (byte elements): d = p ? n : m, per element.
 * expand_pred_b presumably maps the 8 element bits of a predicate
 * byte to a 64-bit byte-lane mask (cf. expand_pred_s above) — it is
 * defined outside this file.
 */
void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3543
/*
 * SEL (halfword elements): as sve_sel_zpzz_b, with expand_pred_h
 * (defined outside this file) producing the 16-bit-lane mask.
 */
void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3557
/*
 * SEL (word elements): expand_pred_s (above) maps predicate bits 0
 * and 4 of each byte to 32-bit-lane masks; merge n over m.
 */
void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3571
3572void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3573 void *vg, uint32_t desc)
3574{
3575 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3576 uint64_t *d = vd, *n = vn, *m = vm;
3577 uint8_t *pg = vg;
3578
3579 for (i = 0; i < opr_sz; i += 1) {
3580 uint64_t nn = n[i], mm = m[i];
3581 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3582 }
3583}
3584
3585void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3586 void *vg, uint32_t desc)
3587{
3588 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3589 Int128 *d = vd, *n = vn, *m = vm;
3590 uint16_t *pg = vg;
3591
3592 for (i = 0; i < opr_sz; i += 1) {
3593 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3594 }
3595}
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
/*
 * Predicated integer comparison, Zn OP Zm, producing a predicate
 * result in VD and returning NZCV flags.
 *
 * The vector is walked backwards, one 64-bit predicate word (i.e. 512
 * bits of vector input) at a time: the inner loop packs one boolean
 * per element into OUT, shifting previously gathered bits up by
 * sizeof(TYPE) so each result lands on that element's predicate bit;
 * the outer loop masks OUT by the governing predicate, stores the
 * word, and folds it into the flags with iter_predtest_bwd.
 * MASK restricts the guard to the canonical bit of each element.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

/* Per-element-size wrappers supplying the byte-order macro and the
   canonical predicate bit mask. */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

/* Signed comparisons. */
DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

/* Unsigned comparisons. */
DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3684
3685
/*
 * As DO_CMP_PPZZ, but the second operand has wide (64-bit, TYPEW)
 * elements: each 64-bit element of VM is loaded once and compared
 * against the eight bytes' worth of narrow VN elements it spans.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                          \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3761
3762
/*
 * As DO_CMP_PPZZ, but the second operand is an immediate carried in
 * simd_data(desc), truncated/sign-extended to TYPE.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    intptr_t opr_sz = simd_oprsz(desc);                              \
    uint32_t flags = PREDTEST_INIT;                                  \
    TYPE mm = simd_data(desc);                                       \
    intptr_t i = opr_sz;                                             \
    do {                                                             \
        uint64_t out = 0, pg;                                        \
        do {                                                         \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
            TYPE nn = *(TYPE *)(vn + H(i));                          \
            out |= nn OP mm;                                         \
        } while (i & 63);                                            \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
        out &= pg;                                                   \
        *(uint64_t *)(vd + (i >> 3)) = out;                          \
        flags = iter_predtest_bwd(out, pg, flags);                   \
    } while (i > 0);                                                 \
    return flags;                                                    \
}

#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3849
3850
/*
 * Return true if the last active element (highest set guard bit)
 * of VD is set, false if no guard bit is set at all.
 * Fix: return the bool literal false rather than 0.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    /* Scan backwards for the highest word with any guard bit set. */
    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);
        if (pg) {
            /* pow2floor isolates the most significant guard bit. */
            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
        }
    }
    return false;
}
3863
3864
3865
3866
3867
/*
 * Compute one 64-bit word of a break result.
 * N is the condition predicate, G the guard, BRK whether a break has
 * already occurred in an earlier word.  Writes the resulting element
 * mask to *RETB and returns the updated break state.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t mask;

    if (brk) {
        /* Already broken: everything from here on is inactive. */
        mask = 0;
    } else if (!(g & n)) {
        /* No active true condition bit: the whole guard passes. */
        mask = g;
    } else {
        /* Break at the first active true bit: keep everything below
           it, and the bit itself for the break-after form. */
        uint64_t lsb = g & n;
        lsb &= -lsb;
        mask = after ? (lsb | (lsb - 1)) : (lsb - 1);
        brk = true;
    }

    *retb = mask;
    return brk;
}
3893
3894
/* Break-zeroing form: inactive elements become zero. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_b & this_g;
    }
}
3908
3909
/* As compute_brk_z, but also accumulate and return NZCV predtest
   flags over the result. */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}
3926
3927
/* Break-merging form: inactive elements keep their old value in D. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = (this_b & this_g) | (d[i] & ~this_g);
    }
}
3941
3942
3943static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944 intptr_t oprsz, bool after)
3945{
3946 uint32_t flags = PREDTEST_INIT;
3947 bool brk = false;
3948 intptr_t i;
3949
3950 for (i = 0; i < oprsz / 8; ++i) {
3951 uint64_t this_b, this_d = d[i], this_g = g[i];
3952
3953 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955 flags = iter_predtest_fwd(this_d, this_g, flags);
3956 }
3957 return flags;
3958}
3959
/*
 * Zero the entire predicate register (sizeof(ARMPredicateReg), not
 * just oprsz bytes; the oprsz argument is deliberately unused) and
 * return the predtest flags for an all-false result.
 */
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
3968
/* BRKPA: if the last active element of vn is true, compute the
   break-after of vm (zeroing form); otherwise all-false. */
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, true);
    } else {
        do_zero(vd, oprsz);
    }
}

/* As sve_brkpa, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, true);
    } else {
        return do_zero(vd, oprsz);
    }
}

/* BRKPB: as BRKPA, but break before the first true element. */
void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, false);
    } else {
        do_zero(vd, oprsz);
    }
}

/* As sve_brkpb, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, false);
    } else {
        return do_zero(vd, oprsz);
    }
}
4012
/* BRKA, zeroing: break vn after the first active true element. */
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

/* As sve_brka_z, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

/* BRKB, zeroing: break vn before the first active true element. */
void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

/* As sve_brkb_z, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}
4036
/* BRKA, merging: as the zeroing form, but inactive elements keep
   their previous value in vd. */
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

/* As sve_brka_m, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

/* BRKB, merging. */
void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

/* As sve_brkb_m, but also return the NZCV predtest flags. */
uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}
4060
/* BRKN: if the last active element of vn is false, zero vd entirely;
   otherwise vd is left unchanged. */
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}
4068
4069
/*
 * Compute NZCV predtest flags for predicate D against an all-true
 * governing mask of the given element size, including a trailing
 * partial word when oprsz is not a multiple of 8 bytes.
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        /* Restrict the guard mask to the live bytes of the last word. */
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
4085
4086uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087{
4088 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089 if (last_active_pred(vn, vg, oprsz)) {
4090 return predtest_ones(vd, oprsz, -1);
4091 } else {
4092 return do_zero(vd, oprsz);
4093 }
4094}
4095
4096uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097{
4098 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101 intptr_t i;
4102
4103 for (i = 0; i < words; ++i) {
4104 uint64_t t = n[i] & g[i] & mask;
4105 sum += ctpop64(t);
4106 }
4107 return sum;
4108}
4109
4110uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4111{
4112 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4113 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4114 uint64_t esz_mask = pred_esz_masks[esz];
4115 ARMPredicateReg *d = vd;
4116 uint32_t flags;
4117 intptr_t i;
4118
4119
4120 flags = do_zero(d, oprsz);
4121 if (count == 0) {
4122 return flags;
4123 }
4124
4125
4126 for (i = 0; i < count / 64; ++i) {
4127 d->p[i] = esz_mask;
4128 }
4129 if (count & 63) {
4130 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4131 }
4132
4133 return predtest_ones(d, oprsz, esz_mask);
4134}
4135
/*
 * WHILE (decrementing forms): set the *last* COUNT predicate bits
 * (masked per element size), clear the leading bits, and return the
 * NZCV flags of the resulting predicate.
 */
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    /*
     * Bits for the topmost predicate word: when the predicate does not
     * fill that word, trim the element mask to the bits in use.
     */
    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    /*
     * Fill from the top word down to (but not including) the word that
     * contains the first active bit.  After the first iteration all
     * lower words use the full element mask.
     */
    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    /* Boundary word: clear the low (invcount % 64) inactive bits. */
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    /* Words entirely below the active region are all-zero. */
    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4171
4172
4173
4174
4175
4176
4177
4178
4179
/*
 * Predicated horizontal floating-point reduction of a vector of TYPE
 * with FUNC (e.g. float16_add), returning the scalar result.
 *
 * Inactive elements are replaced by the identity value IDENT, and the
 * buffer is padded with IDENT out to maxsz (the maximum vector length,
 * passed via simd_data), so that NAME##_reduce can combine the
 * elements in a fixed power-of-2 tree -- presumably so the rounding
 * order does not depend on the current vector length or predicate.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        /* 16 predicate bits cover 16 bytes of elements.  */          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    /* Pad out to the maximum vector length with the identity.  */    \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
4209
/* FADDV: the identity for addition is +0. */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/*
 * FMINNMV/FMAXNMV: the identity is a quiet-NaN bit pattern, which the
 * minnum/maxnum operations resolve in favour of the numeric operand.
 */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* FMINV/FMAXV: identity is +Inf for min and -Inf for max. */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4232
/*
 * FADDA (fp16): strictly-ordered accumulation of the active elements
 * of VM into scalar NN, lowest element first.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        /* 16 predicate bits cover 16 bytes of elements. */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            /* Each element owns sizeof(element) predicate bits. */
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
4252
4253uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254 void *status, uint32_t desc)
4255{
4256 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257 float32 result = nn;
4258
4259 do {
4260 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261 do {
4262 if (pg & 1) {
4263 float32 mm = *(float32 *)(vm + H1_2(i));
4264 result = float32_add(result, mm, status);
4265 }
4266 i += sizeof(float32), pg >>= sizeof(float32);
4267 } while (i & 15);
4268 } while (i < opr_sz);
4269
4270 return result;
4271}
4272
4273uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274 void *status, uint32_t desc)
4275{
4276 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277 uint64_t *m = vm;
4278 uint8_t *pg = vg;
4279
4280 for (i = 0; i < opr_sz; i++) {
4281 if (pg[H1(i)] & 1) {
4282 nn = float64_add(nn, m[i], status);
4283 }
4284 }
4285
4286 return nn;
4287}
4288
4289
4290
4291
/*
 * Predicated two-operand floating-point expander: D = OP(N, M) for
 * active elements; inactive elements of VD are left unchanged
 * (merging).  Walks from the top element downward, consuming one
 * 64-bit predicate word per outer iteration; the predicate bit for
 * element offset i lives at bit (i & 63) of that word.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4310
/* Predicated FADD/FSUB/FMUL/FDIV, fp16/fp32/fp64. */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

/* Predicated FMIN/FMAX and the NaN-propagation-suppressing
 * FMINNM/FMAXNM variants. */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342
4343static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344{
4345 return float16_abs(float16_sub(a, b, s));
4346}
4347
4348static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349{
4350 return float32_abs(float32_sub(a, b, s));
4351}
4352
4353static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354{
4355 return float64_abs(float64_sub(a, b, s));
4356}
4357
/* Predicated FABD: floating-point absolute difference. */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361
4362static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363{
4364 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365 return float64_scalbn(a, b_int, s);
4366}
4367
/* FSCALE: element-wise a * 2^b with a signed integer second operand. */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* FMULX: multiply-extended, via the shared AdvSIMD/VFP mulx helpers. */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4377
4378
4379
4380
/*
 * Predicated vector-and-scalar expander: D = OP(N, SCALAR) for active
 * elements, merging for inactive ones.  Same top-down, one-predicate-
 * word-at-a-time walk as DO_ZPZZ_FP above.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    TYPE mm = scalar;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4399
/* Predicated FADD/FSUB/FMUL with an immediate-selected scalar operand. */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411
/* FSUBR (fp16): reversed subtract, b - a. */
static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}
4416
/* FSUBR (fp32): reversed subtract, b - a. */
static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}
4421
/* FSUBR (fp64): reversed subtract, b - a. */
static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}
4426
/* FSUBR: scalar - vector element. */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

/* FMAXNM/FMINNM and FMAX/FMIN against a scalar. */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446
4447
4448
4449
/*
 * Predicated unary floating-point expander: D = OP(N) for active
 * elements, merging for inactive ones.  Same top-down walk as
 * DO_ZPZZ_FP above.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4466
4467
4468
4469
4470
4471static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4472{
4473 bool save = get_flush_inputs_to_zero(fpst);
4474 float32 ret;
4475
4476 set_flush_inputs_to_zero(false, fpst);
4477 ret = float16_to_float32(f, true, fpst);
4478 set_flush_inputs_to_zero(save, fpst);
4479 return ret;
4480}
4481
4482static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4483{
4484 bool save = get_flush_inputs_to_zero(fpst);
4485 float64 ret;
4486
4487 set_flush_inputs_to_zero(false, fpst);
4488 ret = float16_to_float64(f, true, fpst);
4489 set_flush_inputs_to_zero(save, fpst);
4490 return ret;
4491}
4492
4493static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4494{
4495 bool save = get_flush_to_zero(fpst);
4496 float16 ret;
4497
4498 set_flush_to_zero(false, fpst);
4499 ret = float32_to_float16(f, true, fpst);
4500 set_flush_to_zero(save, fpst);
4501 return ret;
4502}
4503
4504static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4505{
4506 bool save = get_flush_to_zero(fpst);
4507 float16 ret;
4508
4509 set_flush_to_zero(false, fpst);
4510 ret = float64_to_float16(f, true, fpst);
4511 set_flush_to_zero(save, fpst);
4512 return ret;
4513}
4514
4515static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4516{
4517 if (float16_is_any_nan(f)) {
4518 float_raise(float_flag_invalid, s);
4519 return 0;
4520 }
4521 return float16_to_int16_round_to_zero(f, s);
4522}
4523
4524static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4525{
4526 if (float16_is_any_nan(f)) {
4527 float_raise(float_flag_invalid, s);
4528 return 0;
4529 }
4530 return float16_to_int64_round_to_zero(f, s);
4531}
4532
4533static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4534{
4535 if (float32_is_any_nan(f)) {
4536 float_raise(float_flag_invalid, s);
4537 return 0;
4538 }
4539 return float32_to_int64_round_to_zero(f, s);
4540}
4541
4542static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4543{
4544 if (float64_is_any_nan(f)) {
4545 float_raise(float_flag_invalid, s);
4546 return 0;
4547 }
4548 return float64_to_int64_round_to_zero(f, s);
4549}
4550
4551static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4552{
4553 if (float16_is_any_nan(f)) {
4554 float_raise(float_flag_invalid, s);
4555 return 0;
4556 }
4557 return float16_to_uint16_round_to_zero(f, s);
4558}
4559
4560static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4561{
4562 if (float16_is_any_nan(f)) {
4563 float_raise(float_flag_invalid, s);
4564 return 0;
4565 }
4566 return float16_to_uint64_round_to_zero(f, s);
4567}
4568
4569static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4570{
4571 if (float32_is_any_nan(f)) {
4572 float_raise(float_flag_invalid, s);
4573 return 0;
4574 }
4575 return float32_to_uint64_round_to_zero(f, s);
4576}
4577
4578static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4579{
4580 if (float64_is_any_nan(f)) {
4581 float_raise(float_flag_invalid, s);
4582 return 0;
4583 }
4584 return float64_to_uint64_round_to_zero(f, s);
4585}
4586
/* FCVT: conversions between floating-point sizes; the sve_* wrappers
 * above bypass flushing for the fp16 paths. */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FCVTZS: float to signed int, round toward zero (NaN -> 0). */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FCVTZU: float to unsigned int, round toward zero (NaN -> 0). */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* FRINTI (current-mode rounding) and FRINTX (raise Inexact). */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX and FSQRT. */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* SCVTF: signed int to float. */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

/* UCVTF: unsigned int to float. */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642
/*
 * FLOGB (fp16): return the signed integer base-2 exponent of |a|.
 * Infinity yields INT16_MAX; NaN and zero raise Invalid and yield
 * INT16_MIN.
 */
static int16_t do_float16_logb_as_int(float16 a, float_status *s)
{
    /* Shift the 10-bit fraction field to the top of the uint32_t. */
    uint32_t frac = (uint32_t)a << (16 + 6);
    int16_t exp = extract32(a, 10, 5);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - count of leading fraction zeros */
                return -15 - clz32(frac);
            }
            /* denormal input flushed to zero; fall through to zero case */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x1f)) {
        if (frac == 0) {
            return INT16_MAX;   /* infinity */
        }
        /* NaN falls through */
    } else {
        /* normal: unbias the exponent */
        return exp - 15;
    }
    /* NaN or zero */
    float_raise(float_flag_invalid, s);
    return INT16_MIN;
}
4670
/*
 * FLOGB (fp32): return the signed integer base-2 exponent of |a|.
 * Infinity yields INT32_MAX; NaN and zero raise Invalid and yield
 * INT32_MIN.
 */
static int32_t do_float32_logb_as_int(float32 a, float_status *s)
{
    /* Shift the 23-bit fraction field to the top of the uint32_t. */
    uint32_t frac = a << 9;
    int32_t exp = extract32(a, 23, 8);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - count of leading fraction zeros */
                return -127 - clz32(frac);
            }
            /* denormal input flushed to zero; fall through to zero case */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0xff)) {
        if (frac == 0) {
            return INT32_MAX;   /* infinity */
        }
        /* NaN falls through */
    } else {
        /* normal: unbias the exponent */
        return exp - 127;
    }
    /* NaN or zero */
    float_raise(float_flag_invalid, s);
    return INT32_MIN;
}
4698
/*
 * FLOGB (fp64): return the signed integer base-2 exponent of |a|.
 * Infinity yields INT64_MAX; NaN and zero raise Invalid and yield
 * INT64_MIN.
 */
static int64_t do_float64_logb_as_int(float64 a, float_status *s)
{
    /* Shift the 52-bit fraction field to the top of the uint64_t. */
    uint64_t frac = a << 12;
    int64_t exp = extract64(a, 52, 11);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - count of leading fraction zeros */
                return -1023 - clz64(frac);
            }
            /* denormal input flushed to zero; fall through to zero case */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x7ff)) {
        if (frac == 0) {
            return INT64_MAX;   /* infinity */
        }
        /* NaN falls through */
    } else {
        /* normal: unbias the exponent */
        return exp - 1023;
    }
    /* NaN or zero */
    float_raise(float_flag_invalid, s);
    return INT64_MIN;
}
4726
/* FLOGB: predicated base-2 exponent extraction. */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

#undef DO_ZPZ_FP
4732
/*
 * Predicated fp16 fused multiply-add: d = (n ^ neg1) * m + (a ^ neg3),
 * merging for inactive elements.  NEG1/NEG3 are either 0 or the fp16
 * sign bit (0x8000); xor-ing them negates the multiplicand and/or
 * the addend, implementing the FMLS/FNMLA/FNMLS variants.  Walks from
 * the top element downward, one 64-bit predicate word at a time.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4756
/* FMLA: d = n * m + a. */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate the multiplicand, d = -n * m + a. */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: negate multiplicand and addend, d = -n * m - a. */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: negate the addend, d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
4780
/*
 * Predicated fp32 fused multiply-add; see do_fmla_zpzzz_h above.
 * NEG1/NEG3 are either 0 or the fp32 sign bit (0x80000000).
 */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4804
/* FMLA: d = n * m + a. */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate the multiplicand, d = -n * m + a. */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: negate multiplicand and addend, d = -n * m - a. */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: negate the addend, d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
4828
/*
 * Predicated fp64 fused multiply-add; see do_fmla_zpzzz_h above.
 * NEG1/NEG3 are either 0 or the fp64 sign bit (INT64_MIN); no host
 * byte swizzle is needed for 8-byte elements.
 */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4852
/* FMLA: d = n * m + a. */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: negate the multiplicand, d = -n * m + a. */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: negate multiplicand and addend, d = -n * m - a. */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: negate the addend, d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
4876
4877
4878
4879
4880
4881
/*
 * Predicated floating-point compare, two vector operands, producing a
 * predicate result.  Walks from the top element downward; each result
 * word is accumulated in OUT by shifting left sizeof(TYPE) bits per
 * element and or-ing in the comparison result at bit 0, so the bit for
 * each element lands in the low bit of its element-sized field.
 * Inactive elements produce 0.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4901
/* Per-element-size and all-sizes expanders for DO_FPCMP_PPZZ. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * Comparison predicates.  GE/GT swap the operands so that an
 * unordered (NaN) comparison result yields false; EQ/NE/UO use the
 * quiet compare, which does not signal on quiet NaNs.  FAC* compare
 * absolute values.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4926
/* FCMxx and FACxx, vector vs vector, all three element sizes. */
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4940
4941
4942
4943
/*
 * Predicated floating-point compare against zero, producing a
 * predicate result.  Same result-accumulation scheme as
 * DO_FPCMP_PPZZ above, with the second operand fixed at 0.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vg,           \
                  void *status, uint32_t desc)            \
{                                                         \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;      \
    uint64_t *d = vd, *g = vg;                            \
    do {                                                  \
        uint64_t out = 0, pg = g[j];                      \
        do {                                              \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);      \
            if ((pg >> (i & 63)) & 1) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));           \
                out |= OP(TYPE, nn, 0, status);           \
            }                                             \
        } while (i & 63);                                 \
        d[j--] = out;                                     \
    } while (i > 0);                                      \
}
4962
/* Per-element-size and all-sizes expanders for DO_FPCMP_PPZ0. */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* FCMxx against zero, all three element sizes. */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981
4982
4983
/*
 * FTMAD (fp16): d = n * |m| + coeff[x], where x (0..7, from the
 * immediate in simd_data) selects a coefficient from the first table
 * half for non-negative m, or (x + 8) from the second half when m is
 * negative.  The coeff[] entries are raw fp16 bit patterns --
 * presumably the trig series constants specified by the Arm ARM for
 * FTMAD; confirm against the architecture manual.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            /* Negative m: use |m| and the second table half. */
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5003
/* FTMAD (fp32); see sve_ftmad_h above for the indexing scheme. */
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            /* Negative m: use |m| and the second table half. */
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5025
/* FTMAD (fp64); see sve_ftmad_h above for the indexing scheme. */
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            /* Negative m: use |m| and the second table half. */
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5051
5052
5053
5054
5055
/*
 * FCADD (fp16): complex add with rotation.  Elements are processed in
 * (real, imag) pairs from the top down; simd_data selects the rotation
 * by choosing which of the two m operands has its sign flipped:
 *   d.real = n.real + (+/-)m.imag,  d.imag = n.imag + (+/-)m.real.
 * The two halves of each pair are predicated independently.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index, J the imaginary index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5087
5088void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089 void *vs, uint32_t desc)
5090{
5091 intptr_t j, i = simd_oprsz(desc);
5092 uint64_t *g = vg;
5093 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094 float32 neg_real = float32_chs(neg_imag);
5095
5096 do {
5097 uint64_t pg = g[(i - 1) >> 6];
5098 do {
5099 float32 e0, e1, e2, e3;
5100
5101
5102 j = i - sizeof(float32);
5103 i -= 2 * sizeof(float32);
5104
5105 e0 = *(float32 *)(vn + H1_2(i));
5106 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107 e2 = *(float32 *)(vn + H1_2(j));
5108 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109
5110 if (likely((pg >> (i & 63)) & 1)) {
5111 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112 }
5113 if (likely((pg >> (j & 63)) & 1)) {
5114 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115 }
5116 } while (i & 63);
5117 } while (i != 0);
5118}
5119
5120void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121 void *vs, uint32_t desc)
5122{
5123 intptr_t j, i = simd_oprsz(desc);
5124 uint64_t *g = vg;
5125 float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126 float64 neg_real = float64_chs(neg_imag);
5127
5128 do {
5129 uint64_t pg = g[(i - 1) >> 6];
5130 do {
5131 float64 e0, e1, e2, e3;
5132
5133
5134 j = i - sizeof(float64);
5135 i -= 2 * sizeof(float64);
5136
5137 e0 = *(float64 *)(vn + H1_2(i));
5138 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139 e2 = *(float64 *)(vn + H1_2(j));
5140 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141
5142 if (likely((pg >> (i & 63)) & 1)) {
5143 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144 }
5145 if (likely((pg >> (j & 63)) & 1)) {
5146 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147 }
5148 } while (i & 63);
5149 } while (i != 0);
5150}
5151
5152
5153
5154
5155
/*
 * FP complex multiply-add (FCMLA), half-precision, predicated, with the
 * addend taken from va.  rot (0..3) selects the 0/90/180/270-degree
 * rotation: bit 0 ("flip") selects which source halves are multiplied,
 * while neg_real/neg_imag carry the sign flips applied to the real and
 * imaginary products respectively.
 */
void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        /* Predicate word covering the 64-bit chunk ending at i. */
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            /* e2*e1 contributes to the real half; e4*e3 to the imag half. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each half of the pair is written only if its own bit is set. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5200
5201void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202 void *vg, void *status, uint32_t desc)
5203{
5204 intptr_t j, i = simd_oprsz(desc);
5205 unsigned rot = simd_data(desc);
5206 bool flip = rot & 1;
5207 float32 neg_imag, neg_real;
5208 uint64_t *g = vg;
5209
5210 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212
5213 do {
5214 uint64_t pg = g[(i - 1) >> 6];
5215 do {
5216 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217
5218
5219 j = i - sizeof(float32);
5220 i -= 2 * sizeof(float32);
5221
5222 nr = *(float32 *)(vn + H1_2(i));
5223 ni = *(float32 *)(vn + H1_2(j));
5224 mr = *(float32 *)(vm + H1_2(i));
5225 mi = *(float32 *)(vm + H1_2(j));
5226
5227 e2 = (flip ? ni : nr);
5228 e1 = (flip ? mi : mr) ^ neg_real;
5229 e4 = e2;
5230 e3 = (flip ? mr : mi) ^ neg_imag;
5231
5232 if (likely((pg >> (i & 63)) & 1)) {
5233 d = *(float32 *)(va + H1_2(i));
5234 d = float32_muladd(e2, e1, d, 0, status);
5235 *(float32 *)(vd + H1_2(i)) = d;
5236 }
5237 if (likely((pg >> (j & 63)) & 1)) {
5238 d = *(float32 *)(va + H1_2(j));
5239 d = float32_muladd(e4, e3, d, 0, status);
5240 *(float32 *)(vd + H1_2(j)) = d;
5241 }
5242 } while (i & 63);
5243 } while (i != 0);
5244}
5245
5246void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247 void *vg, void *status, uint32_t desc)
5248{
5249 intptr_t j, i = simd_oprsz(desc);
5250 unsigned rot = simd_data(desc);
5251 bool flip = rot & 1;
5252 float64 neg_imag, neg_real;
5253 uint64_t *g = vg;
5254
5255 neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257
5258 do {
5259 uint64_t pg = g[(i - 1) >> 6];
5260 do {
5261 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262
5263
5264 j = i - sizeof(float64);
5265 i -= 2 * sizeof(float64);
5266
5267 nr = *(float64 *)(vn + H1_2(i));
5268 ni = *(float64 *)(vn + H1_2(j));
5269 mr = *(float64 *)(vm + H1_2(i));
5270 mi = *(float64 *)(vm + H1_2(j));
5271
5272 e2 = (flip ? ni : nr);
5273 e1 = (flip ? mi : mr) ^ neg_real;
5274 e4 = e2;
5275 e3 = (flip ? mr : mi) ^ neg_imag;
5276
5277 if (likely((pg >> (i & 63)) & 1)) {
5278 d = *(float64 *)(va + H1_2(i));
5279 d = float64_muladd(e2, e1, d, 0, status);
5280 *(float64 *)(vd + H1_2(i)) = d;
5281 }
5282 if (likely((pg >> (j & 63)) & 1)) {
5283 d = *(float64 *)(va + H1_2(j));
5284 d = float64_muladd(e4, e3, d, 0, status);
5285 *(float64 *)(vd + H1_2(j)) = d;
5286 }
5287 } while (i & 63);
5288 } while (i != 0);
5289}
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
/*
 * Return the byte offset of the next active element at or after reg_off,
 * for a predicate in vg with element size log2 esz.  Returns reg_max if
 * there is no further active element.
 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active.  */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        /* Scan forward one 64-bit predicate word at a time. */
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false.  */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    /* Skip to the lowest set bit within the current word. */
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set.  */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
5328
5329
5330
5331
5332
5333
5334
/*
 * Resolve the guest virtual address at addr + mem_off into info->host and
 * record the access flags/attributes for later use.  Returns true if the
 * page is accessible; false (only permitted when nofault is set) if not.
 */
bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
                    target_ulong addr, int mem_off, MMUAccessType access_type,
                    int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * Strip the top byte before probing.
     * NOTE(review): useronly_clean_ptr presumably removes the TBI tag
     * byte for user-only builds so the probe sees a canonical address —
     * confirm against its definition.
     */
    addr = useronly_clean_ptr(addr);

#ifdef CONFIG_USER_ONLY
    flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
#else
    CPUTLBEntryFull *full;
    flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
                              &info->host, &full, retaddr);
#endif
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* A faulting probe may only return when nofault was requested. */
        g_assert(nofault);
        return false;
    }

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
    /* MTE applies only to anonymous pages with PROT_MTE set. */
    info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
#else
    info->attrs = full->attrs;
    /* 0xf0 is the attribute encoding for Tagged Normal memory. */
    info->tagged = full->pte_attrs == 0xf0;
#endif

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;
    return true;
}
5383
5384
5385
5386
5387
5388
/*
 * Analyse a contiguous load/store of reg_max bytes with predicate vg:
 * find the first/last active elements, and whether/where the operation
 * crosses a page boundary starting at addr.  Element size is 1 << esz
 * bytes in the register and msize bytes in memory.  Returns false if no
 * element is active (nothing to do).
 */
bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
                            intptr_t reg_max, int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the TLB data to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find the bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* Number of bytes from addr to the end of its page. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no partial element, then this
     * is the last active element on the first page.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page-crossing element is the last active one. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * it may affect the address reported in an exception.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
5474
5475
5476
5477
5478
5479
/*
 * Resolve the guest virtual addresses of a contiguous operation into
 * info->page[0..1], controlling fault generation per @fault.  Returns
 * false when there is no work to do, which can only happen for
 * fault == FAULT_NO.
 */
bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                         CPUARMState *env, target_ulong addr,
                         MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to
     * be the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If active elements exist on the first page before the split
         * element, then a fault on the second page must not be taken
         * eagerly for first-fault.
         * NOTE(review): FAULT_FIRST (an enum value) is assigned to a
         * bool here; this relies on FAULT_FIRST being non-zero.  It
         * works, but `nofault = true` would state the intent — confirm
         * before changing.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so only FAULT_ALL may still fault eagerly here.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
5537
5538#ifndef CONFIG_USER_ONLY
/*
 * Trigger watchpoints for each active element of a contiguous operation,
 * then clear TLB_WATCHPOINT from the cached page flags so the fast path
 * can proceed without per-element checks.
 */
void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                               uint64_t *vg, target_ulong addr,
                               int esize, int msize, int wp_access,
                               uintptr_t retaddr)
{
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* The page-crossing element, if any, is known to be active. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5600#endif
5601
5602void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5603 uint64_t *vg, target_ulong addr, int esize,
5604 int msize, uint32_t mtedesc, uintptr_t ra)
5605{
5606 intptr_t mem_off, reg_off, reg_last;
5607
5608
5609 if (info->page[0].tagged) {
5610 mem_off = info->mem_off_first[0];
5611 reg_off = info->reg_off_first[0];
5612 reg_last = info->reg_off_split;
5613 if (reg_last < 0) {
5614 reg_last = info->reg_off_last[0];
5615 }
5616
5617 do {
5618 uint64_t pg = vg[reg_off >> 6];
5619 do {
5620 if ((pg >> (reg_off & 63)) & 1) {
5621 mte_check(env, mtedesc, addr, ra);
5622 }
5623 reg_off += esize;
5624 mem_off += msize;
5625 } while (reg_off <= reg_last && (reg_off & 63));
5626 } while (reg_off <= reg_last);
5627 }
5628
5629 mem_off = info->mem_off_first[1];
5630 if (mem_off >= 0 && info->page[1].tagged) {
5631 reg_off = info->reg_off_first[1];
5632 reg_last = info->reg_off_last[1];
5633
5634 do {
5635 uint64_t pg = vg[reg_off >> 6];
5636 do {
5637 if ((pg >> (reg_off & 63)) & 1) {
5638 mte_check(env, mtedesc, addr, ra);
5639 }
5640 reg_off += esize;
5641 mem_off += msize;
5642 } while (reg_off & 63);
5643 } while (reg_off <= reg_last);
5644 }
5645}
5646
5647
5648
5649
/*
 * Common helper for all contiguous 1, 2, 3, and 4-register predicated
 * stepping loads of registers rd .. rd+N-1, with register element size
 * 1 << esz and memory element size 1 << msz.  mtedesc is zero when no
 * tag checking is required.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs.  */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
        /*
         * At least one page requires the slow path (e.g. MMIO).  Any bus
         * operation can fail mid-sequence, so load into scratch memory
         * first and commit to the architectural registers only at the
         * end, to keep them unmodified on a fault.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /* Walk to the last active element, wherever it is. */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
    }

    /* The entire operation is in RAM, on valid pages. */

    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    /* First page: direct host-memory accesses. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Second page: direct host-memory accesses. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5788
5789static inline QEMU_ALWAYS_INLINE
5790void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5791 uint32_t desc, const uintptr_t ra,
5792 const int esz, const int msz, const int N,
5793 sve_ldst1_host_fn *host_fn,
5794 sve_ldst1_tlb_fn *tlb_fn)
5795{
5796 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5797 int bit55 = extract64(addr, 55, 1);
5798
5799
5800 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5801
5802
5803 if (!tbi_check(desc, bit55) ||
5804 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5805 mtedesc = 0;
5806 }
5807
5808 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5809}
5810
/*
 * Generate the helper entry points for a contiguous LD1 whose memory
 * element is a single byte (so no big/little-endian variants needed):
 * one normal and one MTE-checked helper per PART.
 */
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
                            target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
              sve_##NAME##_host, sve_##NAME##_tlb); \
} \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
                                target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
                  sve_##NAME##_host, sve_##NAME##_tlb); \
}

/*
 * Generate the helper entry points for a contiguous LD1 with a multi-byte
 * memory element: little- and big-endian forms, each with a normal and an
 * MTE-checked variant.
 */
#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
                               target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
} \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
                               target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
} \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
                                   target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
} \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
                                   target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
}

/* Byte loads, optionally sign/zero-extending into wider elements. */
DO_LD1_1(ld1bb,  MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

/* Halfword loads into 16/32/64-bit elements. */
DO_LD1_2(ld1hh,  MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

/* Word loads into 32/64-bit elements. */
DO_LD1_2(ld1ss,  MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

/* Doubleword loads. */
DO_LD1_2(ld1dd,  MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
5873
/*
 * Generate helpers for the interleaved N-register byte loads (LD2B..LD4B),
 * normal and MTE-checked, reusing the ld1bb element primitives.
 */
#define DO_LDN_1(N) \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
                             target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
              sve_ld1bb_host, sve_ld1bb_tlb); \
} \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
                  sve_ld1bb_host, sve_ld1bb_tlb); \
}

/*
 * Generate helpers for the interleaved N-register loads of wider elements
 * (LD2H..LD4D): little/big-endian forms, normal and MTE-checked, reusing
 * the corresponding ld1 element primitives.
 */
#define DO_LDN_2(N, SUFF, ESZ) \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
} \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
} \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
} \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5952{
5953 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5954
5955 if (i & 63) {
5956 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5957 i = ROUND_UP(i, 64);
5958 }
5959 for (; i < oprsz; i += 64) {
5960 ffr[i / 64] = 0;
5961 }
5962}
5963
5964
5965
5966
/*
 * Common helper for all contiguous no-fault (LDNF1) and first-fault
 * (LDFF1) predicated loads.  On any suppressed fault, the FFR is
 * truncated via record_fault and the remaining destination elements are
 * zeroed rather than trapping.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* The first element is inaccessible: fault without trapping. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the page is not Tagged.  Since TBI must
     * be set for MTE, !mtedesc => !mte_active.
     */
    if (!info.page[0].tagged) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element, when it crosses
         * a page boundary or the page needs the slow path.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements of VD. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO: decline the no-fault load. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* A watchpoint would trap: decline instead. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF: they
     * are allowed to fail (and report a fault in the FFR) for any
     * reason, so a no-fault load must never actually hit the bus.  We
     * cannot see Device-vs-Normal memory attributes here, so we treat
     * any MMIO-backed page as "would fault", which is architecturally
     * permitted.  Watchpoints and breakpoints are handled the same way:
     * anticipate the trap and decline the element instead.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        /* Fetch the predicate bits governing this stretch of elements. */
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page
     * element in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  Choose to fail
     * all of the second page.  This means that we do not have to worry
     * about watchpoints there.
     */
 do_fault:
    record_fault(env, reg_off, reg_max);
}
6144
6145static inline QEMU_ALWAYS_INLINE
6146void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6147 uint32_t desc, const uintptr_t retaddr,
6148 const int esz, const int msz, const SVEContFault fault,
6149 sve_ldst1_host_fn *host_fn,
6150 sve_ldst1_tlb_fn *tlb_fn)
6151{
6152 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6153 int bit55 = extract64(addr, 55, 1);
6154
6155
6156 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6157
6158
6159 if (!tbi_check(desc, bit55) ||
6160 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6161 mtedesc = 0;
6162 }
6163
6164 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6165 esz, msz, fault, host_fn, tlb_fn);
6166}
6167
/*
 * Generate the four entry points (first-fault and no-fault, each with
 * and without MTE) for a byte-element contiguous load.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
}

/*
 * Generate the eight entry points (le/be x first-fault/no-fault x
 * normal/MTE) for a multi-byte-element contiguous load.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}

DO_LDFF1_LDNF1_1(bb,  MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6266
6267
6268
6269
6270
6271static inline QEMU_ALWAYS_INLINE
6272void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6273 uint32_t desc, const uintptr_t retaddr,
6274 const int esz, const int msz, const int N, uint32_t mtedesc,
6275 sve_ldst1_host_fn *host_fn,
6276 sve_ldst1_tlb_fn *tlb_fn)
6277{
6278 const unsigned rd = simd_data(desc);
6279 const intptr_t reg_max = simd_oprsz(desc);
6280 intptr_t reg_off, reg_last, mem_off;
6281 SVEContLdSt info;
6282 void *host;
6283 int i, flags;
6284
6285
6286 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6287
6288 return;
6289 }
6290
6291
6292 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6293
6294
6295 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6296 BP_MEM_WRITE, retaddr);
6297
6298
6299
6300
6301
6302 if (mtedesc) {
6303 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6304 mtedesc, retaddr);
6305 }
6306
6307 flags = info.page[0].flags | info.page[1].flags;
6308 if (unlikely(flags != 0)) {
6309#ifdef CONFIG_USER_ONLY
6310 g_assert_not_reached();
6311#else
6312
6313
6314
6315
6316
6317
6318 mem_off = info.mem_off_first[0];
6319 reg_off = info.reg_off_first[0];
6320 reg_last = info.reg_off_last[1];
6321 if (reg_last < 0) {
6322 reg_last = info.reg_off_split;
6323 if (reg_last < 0) {
6324 reg_last = info.reg_off_last[0];
6325 }
6326 }
6327
6328 do {
6329 uint64_t pg = vg[reg_off >> 6];
6330 do {
6331 if ((pg >> (reg_off & 63)) & 1) {
6332 for (i = 0; i < N; ++i) {
6333 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6334 addr + mem_off + (i << msz), retaddr);
6335 }
6336 }
6337 reg_off += 1 << esz;
6338 mem_off += N << msz;
6339 } while (reg_off & 63);
6340 } while (reg_off <= reg_last);
6341 return;
6342#endif
6343 }
6344
6345 mem_off = info.mem_off_first[0];
6346 reg_off = info.reg_off_first[0];
6347 reg_last = info.reg_off_last[0];
6348 host = info.page[0].host;
6349
6350 while (reg_off <= reg_last) {
6351 uint64_t pg = vg[reg_off >> 6];
6352 do {
6353 if ((pg >> (reg_off & 63)) & 1) {
6354 for (i = 0; i < N; ++i) {
6355 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6356 host + mem_off + (i << msz));
6357 }
6358 }
6359 reg_off += 1 << esz;
6360 mem_off += N << msz;
6361 } while (reg_off <= reg_last && (reg_off & 63));
6362 }
6363
6364
6365
6366
6367
6368 mem_off = info.mem_off_split;
6369 if (unlikely(mem_off >= 0)) {
6370 reg_off = info.reg_off_split;
6371 for (i = 0; i < N; ++i) {
6372 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6373 addr + mem_off + (i << msz), retaddr);
6374 }
6375 }
6376
6377 mem_off = info.mem_off_first[1];
6378 if (unlikely(mem_off >= 0)) {
6379 reg_off = info.reg_off_first[1];
6380 reg_last = info.reg_off_last[1];
6381 host = info.page[1].host;
6382
6383 do {
6384 uint64_t pg = vg[reg_off >> 6];
6385 do {
6386 if ((pg >> (reg_off & 63)) & 1) {
6387 for (i = 0; i < N; ++i) {
6388 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6389 host + mem_off + (i << msz));
6390 }
6391 }
6392 reg_off += 1 << esz;
6393 mem_off += N << msz;
6394 } while (reg_off & 63);
6395 } while (reg_off <= reg_last);
6396 }
6397}
6398
/*
 * MTE-checked contiguous store of N registers: split the combined
 * descriptor into the MTE part and the normal SVE part, suppress the
 * MTE check when TBI/TCMA make it unnecessary, then forward to
 * sve_stN_r() with the (possibly zeroed) mtedesc.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    /* The MTE descriptor occupies the bits above SVE_MTEDESC_SHIFT. */
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * Perform gross MTE suppression early: if TBI is disabled for this
     * address, or the tag-check-match-all condition holds, no tag check
     * can fire, so drop mtedesc entirely.
     */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
6420
/*
 * Instantiate the contiguous store helpers.
 * DO_STN_1: byte-sized memory elements (no endianness variants).
 * DO_STN_2: wider memory elements, little- and big-endian forms.
 * Each helper also gets an _mte variant routed via sve_stN_r_mte().
 * N is the number of interleaved registers, ESZ/MSZ the register-
 * element and memory-element sizes.
 */
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}

#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}

DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
6489
6490
6491
6492
6493
6494
6495
6496
/*
 * Gather/scatter offset extractors: read one element of the offset
 * vector register at byte offset reg_ofs and return it widened to
 * target_ulong.  Suffixes: zsu = 32-bit unsigned, zss = 32-bit signed
 * (sign-extended), zd = 64-bit.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

/* 32-bit unsigned offset element (H1_4 maps to host element layout). */
static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

/* 32-bit signed offset element, sign-extended to target_ulong. */
static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}
6508
6509static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6510{
6511 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6512}
6513
6514static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6515{
6516 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6517}
6518
6519static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6520{
6521 return *(uint64_t *)(reg + reg_ofs);
6522}
6523
/*
 * Common helper for all gather loads: for each active element, load
 * from base + (offset-vector element << scale).  All faults (paging,
 * watchpoints, MTE) are raised before the destination register is
 * modified, by accumulating into a scratch register.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /* Inactive elements of the result are zero. */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Bytes remaining in the page containing addr. */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    /* Element entirely within one page. */
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    if (unlikely(info.flags & TLB_MMIO)) {
                        /* MMIO cannot be accessed via a host pointer. */
                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
                    } else {
                        host_fn(&scratch, reg_off, info.host);
                    }
                } else {
                    /* Element crosses a page boundary: probe the second
                     * page too, then use the slow path for the access. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
6587
/*
 * MTE-checked gather load: split the combined descriptor and forward
 * to sve_ld1_z() with the MTE descriptor.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): unlike sve_stN_r_mte(), there is no up-front
     * TBI/TCMA suppression here -- each element address depends on the
     * offset vector, so checks are applied per element (info.tagged)
     * inside sve_ld1_z().
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
6608
/*
 * Instantiate the gather-load helpers.  _S variants use 32-bit vector
 * elements (esize 4), _D variants 64-bit (esize 8); MEM encodes the
 * memory element type/endianness, OFS selects the offset extractor.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6703
6704
6705
6706
6707
6708
6709
/*
 * Common helper for all gather first-fault loads: the first active
 * element may fault normally; any subsequent element that would fault
 * (or would need a watchpoint/MTE/MMIO access) instead terminates the
 * load and records the fault in FFR via record_fault().
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;
    ARMVectorReg scratch;

    /* Skip to the first active element. */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }

    /* The destination is also an input when vd aliases vm: copy vm. */
    if (unlikely(vd == vm)) {
        vm = memcpy(&scratch, vm, reg_max);
    }

    /*
     * The first active element may take any fault normally;
     * it is loaded directly into the destination.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* Zero the elements before and after the one just loaded. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Remaining elements are no-fault: probe without raising; on any
     * condition that would fault, stop and record it instead.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Element crosses a page: treat as faulting. */
                    goto fault;
                }

                /* nofault probe; TLB_INVALID or MMIO cannot be
                 * completed without the possibility of a fault. */
                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
6794
/*
 * MTE-checked gather first-fault load: split the combined descriptor
 * and forward to sve_ldff1_z() with the MTE descriptor.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): no up-front TBI/TCMA suppression -- element
     * addresses vary per lane, so tag checks are performed per element
     * inside sve_ldff1_z().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
6816
/*
 * Instantiate the gather first-fault load helpers; _S for 32-bit
 * vector elements, _D for 64-bit, each with an _mte variant.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6912
6913
6914
/*
 * Common helper for all scatter stores.  Runs in two phases so that a
 * fault part-way through leaves memory unmodified:
 *   1. probe every active element (raising any paging, watchpoint or
 *      MTE fault now) and cache the host address where direct access
 *      is possible;
 *   2. perform the stores, via host pointer when cached, else via the
 *      slow tlb path (MMIO or page-crossing elements).
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    /* One cached host pointer per element; NULL = slow path/inactive. */
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Phase 1: probe all active elements, raising faults before any
     * store is performed.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary: probe both
                     * pages, leave host[i] NULL so phase 2 uses the
                     * slow path, and merge the flags for the checks
                     * below.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && info.tagged) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Phase 2: all faults have been raised; perform the actual stores.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            /* Active element without a host pointer: slow path. */
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
6997
/*
 * MTE-checked scatter store: split the combined descriptor and forward
 * to sve_st1_z() with the MTE descriptor.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): no up-front TBI/TCMA suppression -- element
     * addresses vary per lane, so tag checks are performed per element
     * inside sve_st1_z().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
7018
/*
 * Instantiate the scatter-store helpers; _S for 32-bit vector elements
 * (esize 4), _D for 64-bit (esize 8), each with an _mte variant.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7085
7086void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7087{
7088 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7089 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7090
7091 for (i = 0; i < opr_sz; ++i) {
7092 d[i] = n[i] ^ m[i] ^ k[i];
7093 }
7094}
7095
7096void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7097{
7098 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7099 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7100
7101 for (i = 0; i < opr_sz; ++i) {
7102 d[i] = n[i] ^ (m[i] & ~k[i]);
7103 }
7104}
7105
7106void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7107{
7108 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7109 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7110
7111 for (i = 0; i < opr_sz; ++i) {
7112 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7113 }
7114}
7115
7116void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7117{
7118 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7119 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7120
7121 for (i = 0; i < opr_sz; ++i) {
7122 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7123 }
7124}
7125
7126void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7127{
7128 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7129 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7130
7131 for (i = 0; i < opr_sz; ++i) {
7132 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7133 }
7134}
7135
7136
7137
7138
7139
7140
/*
 * Return true if the element n occurs anywhere in the two 64-bit
 * words m0:m1, comparing at element size esz.  Uses the classic
 * "find zero element" SWAR trick: after xor-ing n against each word,
 * (x - ones) & ~x sets the sign bit of every element that became zero
 * (i.e. matched).
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    /* Replicate n to all element positions, then xor: matches -> 0. */
    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
7155
/*
 * Common routine for MATCH/NMATCH: for each active element of vn,
 * set the corresponding predicate bit of vd if the element does
 * (nmatch=false) or does not (nmatch=true) occur within its own
 * 16-byte segment of vm.  Returns NZCV flags from a forward predtest
 * over the result.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    /* Process one 16-byte segment (16 predicate bits) at a time. */
    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    /* nmatch inverts the per-element result. */
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
7185
/* Instantiate MATCH (INV=false) and NMATCH (INV=true) for b/h elements. */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7199
/*
 * HISTCNT (32-bit): for each active element n[i], count how many
 * active elements m[j], j <= i, equal it (including m[i] itself).
 * Inactive elements of the destination are written as zero.
 */
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* If the destination aliases a source, operate on a copy. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    /* i steps in bytes; one predicate bit per byte of the element. */
    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
7236
7237void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7238 uint32_t desc)
7239{
7240 ARMVectorReg scratch;
7241 intptr_t i, j;
7242 intptr_t opr_sz = simd_oprsz(desc);
7243 uint64_t *d = vd, *n = vn, *m = vm;
7244 uint8_t *pg = vg;
7245
7246 if (d == n) {
7247 n = memcpy(&scratch, n, opr_sz);
7248 if (d == m) {
7249 m = n;
7250 }
7251 } else if (d == m) {
7252 m = memcpy(&scratch, m, opr_sz);
7253 }
7254
7255 for (i = 0; i < opr_sz / 8; ++i) {
7256 uint64_t count = 0;
7257 if (pg[H1(i)] & 1) {
7258 uint64_t nn = n[i];
7259 for (j = 0; j <= i; ++j) {
7260 if ((pg[H1(j)] & 1) && nn == m[j]) {
7261 ++count;
7262 }
7263 }
7264 }
7265 d[i] = count;
7266 }
7267}
7268
7269
7270
7271
7272
7273
/*
 * Count how many bytes of m0:m1 equal the byte n.
 * After the xor, a matching byte position holds 0x00.  The SWAR
 * expression ~(((x & 0x7f) + 0x7f) | x | 0x7f) leaves 0x80 in exactly
 * the byte positions that were zero, and 0x00 elsewhere.  Shifting
 * cmp1 right by 1 moves its markers to 0x40 so the two words can be
 * or-ed without collision before a single population count.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    /* Replicate n to every byte lane; matching lanes become zero. */
    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /* Mark zero bytes with 0x80. */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /* Merge (0x80 and 0x40 markers never overlap) and count. */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
7305
7306void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7307{
7308 intptr_t i, j;
7309 intptr_t opr_sz = simd_oprsz(desc);
7310
7311 for (i = 0; i < opr_sz; i += 16) {
7312 uint64_t n0 = *(uint64_t *)(vn + i);
7313 uint64_t m0 = *(uint64_t *)(vm + i);
7314 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7315 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7316 uint64_t out0 = 0;
7317 uint64_t out1 = 0;
7318
7319 for (j = 0; j < 64; j += 8) {
7320 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7321 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7322 out0 |= cnt0 << j;
7323 out1 |= cnt1 << j;
7324 }
7325
7326 *(uint64_t *)(vd + i) = out0;
7327 *(uint64_t *)(vd + i + 8) = out1;
7328 }
7329}
7330
7331void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7332{
7333 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7334 int shr = simd_data(desc);
7335 int shl = 8 - shr;
7336 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7337 uint64_t *d = vd, *n = vn, *m = vm;
7338
7339 for (i = 0; i < opr_sz; ++i) {
7340 uint64_t t = n[i] ^ m[i];
7341 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7342 }
7343}
7344
7345void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7346{
7347 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7348 int shr = simd_data(desc);
7349 int shl = 16 - shr;
7350 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7351 uint64_t *d = vd, *n = vn, *m = vm;
7352
7353 for (i = 0; i < opr_sz; ++i) {
7354 uint64_t t = n[i] ^ m[i];
7355 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7356 }
7357}
7358
7359void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7360{
7361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7362 int shr = simd_data(desc);
7363 uint32_t *d = vd, *n = vn, *m = vm;
7364
7365 for (i = 0; i < opr_sz; ++i) {
7366 d[i] = ror32(n[i] ^ m[i], shr);
7367 }
7368}
7369
/*
 * FMMLA (single): each 128-bit segment holds a 2x2 row-major matrix
 * of float32; compute d = a + n * m^T per segment.  The order of the
 * float operations is architectural and must not be rearranged.
 */
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
7407
/*
 * FMMLA (double): each 256-bit segment holds a 2x2 row-major matrix
 * of float64; compute d = a + n * m^T per segment.  The order of the
 * float operations is architectural and must not be rearranged.
 */
void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
7443
/*
 * FCVTNT-style narrowing conversions: convert each active wide element
 * and store the narrow result into the high ("top") half of the wide
 * slot.  Iterates backward so that vd may alias vn.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7464
/*
 * FCVTLT-style widening conversions: read the narrow element from the
 * high ("top") half of each wide slot and widen it in place.  Iterates
 * backward so that vd may alias vn.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
7487