1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27#include "tcg/tcg.h"
28#include "vec_internal.h"
29#include "sve_ldst_internal.h"
30
31
32
33
34
35
36
37
38
39
/* Initial flags word for a PTEST scan: bit 0 (C) set, N/!Z clear,
 * and bit 2 (the "first active element seen" marker) clear.
 */
#define PREDTEST_INIT 1
41
42
43
44
/*
 * Fold one 64-bit predicate word into PTEST-style flags, scanning the
 * predicate from the lowest word upward.  d is the Pd word, g the
 * governing predicate word; flags is the accumulator (PREDTEST_INIT
 * for the first word).
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N (bit 31) from the first active element, d & (g & -g).
         * Bit 2 of flags records that the first active bit has been seen,
         * so N is only written once.
         */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate !Z (bit 1) from each d & g.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C (bit 0) from the last active element (pow2floor(g)),
         * replacing the value computed for any earlier word.
         */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
63
64
65
66
/*
 * As iter_predtest_fwd, but scanning the predicate from the highest
 * word downward.  The roles of the "first seen" and "replace previous"
 * computations are therefore swapped relative to the forward version.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C (bit 0) from the first word seen, i.e. the last
         * active element overall.  "flags += 4 - 1" sets bit 2 (the
         * first-seen marker) while clearing the initial C bit.
         */
        if (!(flags & 4)) {
            flags += 4 - 1;
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate !Z (bit 1) from each d & g.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N (bit 31) from the last word seen, i.e. the first
         * active element overall, replacing any previous value.
         */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
85
86
/* PTEST flags for a single predicate word (vectors of <= 64 bytes of
 * predicate handled one word at a time by the translator).
 */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
91
92
93uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
94{
95 uint32_t flags = PREDTEST_INIT;
96 uint64_t *d = vd, *g = vg;
97 uintptr_t i = 0;
98
99 do {
100 flags = iter_predtest_fwd(d[i], g[i], flags);
101 } while (++i < words);
102
103 return flags;
104}
105
106
/* Expand the low and high predicate bits of a byte (bits 0 and 4,
 * which govern 32-bit elements) into 32-bit lane masks.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t mask = 0;

    if (byte & 0x01) {
        mask |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        mask |= 0xffffffff00000000ull;
    }
    return mask;
}
116
/* Expand a helper for each predicate logical operation, operating on
 * whole 64-bit predicate words at a time.
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

/* Each operation masks its result with the governing predicate G,
 * except SEL, which selects between N and M per G bit.
 */
#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
155
156
157
158
159
160
161
162
163
/* Expand a predicated two-operand (z = op(z, z) merging with z) helper
 * for elements of 1, 2, or 4 bytes.  The predicate is read 16 bits at
 * a time (one bit per vector byte), and H is the host-endian index
 * macro matching the element size.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* As DO_ZPZZ, but for 8-byte elements: the predicate bit governing
 * each element is bit 0 of the corresponding predicate byte.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)

/* Division per the Arm SDIV/UDIV semantics: divide by zero yields 0,
 * and the INT_MIN / -1 overflow case is handled via the -N branch
 * (which wraps to INT_MIN; QEMU is built with wrapping arithmetic).
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
217
/* Predicated integer binary operations, one helper per element size.
 * The H parameter must match the element size: H1 for bytes, H1_2 for
 * halfwords, H1_4 for words.
 */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed/unsigned variants differ only in the element TYPE. */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
277
278
279
/* High half of an 8x8->16 multiply.  The int32_t computation width is
 * at least twice the element width, so the same helper serves both the
 * signed and unsigned byte forms.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 8;
}
284
/* High half of a 16x16->32 multiply; serves both signednesses because
 * the int32_t computation width covers the full product range of the
 * lane values the callers pass in.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;

    return prod >> 16;
}
289
/* High half of a 32x32->64 multiply in int64_t.
 * NOTE(review): for the unsigned caller, products of two values near
 * UINT32_MAX exceed INT64_MAX; correctness relies on QEMU's wrapping
 * signed arithmetic (-fwrapv) — same as the original.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;

    return prod >> 32;
}
294
/* High half of a signed 64x64->128 multiply, via QEMU's muls64. */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}
301
/* High half of an unsigned 64x64->128 multiply, via QEMU's mulu64. */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
308
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* SVE divide exists only for 32- and 64-bit elements. */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Out-of-range shift counts: ASR saturates the count at width-1 (so
 * the result is the replicated sign bit); LSR/LSL produce zero.
 */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
335
336DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
337DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
338DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
339
340DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
341DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
342DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
343
344DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
345DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
346DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
347
348DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
349DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
350DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
351
/* Signed add-and-accumulate-long-pairwise, halfword result:
 * add the two signed bytes of n into the accumulator m.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t lo = n;
    int8_t hi = n >> 8;

    return m + lo + hi;
}
357
/* As do_sadalp_h, for the two signed halfwords of a word element. */
static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t lo = n;
    int16_t hi = n >> 16;

    return m + lo + hi;
}
363
/* As do_sadalp_h, for the two signed words of a doubleword element. */
static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t lo = n;
    int32_t hi = n >> 32;

    return m + lo + hi;
}
369
/* SADALP exists for halfword, word and doubleword results. */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
373
/* Unsigned add-and-accumulate-long-pairwise, halfword result:
 * add the two unsigned bytes of n into the accumulator m.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t lo = n;
    uint8_t hi = n >> 8;

    return m + lo + hi;
}
379
/* As do_uadalp_h, for the two unsigned halfwords of a word element. */
static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t lo = n;
    uint16_t hi = n >> 16;

    return m + lo + hi;
}
385
/* As do_uadalp_h, for the two unsigned words of a doubleword element. */
static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t lo = n;
    uint32_t hi = n >> 32;

    return m + lo + hi;
}
391
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

/* Rounding shifts without saturation: the NULL status pointer tells
 * the do_*qrshl_* helpers to discard the saturation indication.
 */
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/* The shift count is signed; for the narrow unsigned forms it must be
 * sign-extended from the element width, hence the casts on m.
 */
#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
415
416
417
418
419
420
421
422#define do_sqshl_b(n, m) \
423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
424#define do_sqshl_h(n, m) \
425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
426#define do_sqshl_s(n, m) \
427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
428#define do_sqshl_d(n, m) \
429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
430
431DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
432DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
433DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
434DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
435
436#define do_uqshl_b(n, m) \
437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
438#define do_uqshl_h(n, m) \
439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
440#define do_uqshl_s(n, m) \
441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
442#define do_uqshl_d(n, m) \
443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
444
445DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
446DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
447DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
448DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
449
450#define do_sqrshl_b(n, m) \
451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
452#define do_sqrshl_h(n, m) \
453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
454#define do_sqrshl_s(n, m) \
455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
456#define do_sqrshl_d(n, m) \
457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
458
459DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
460DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
461DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
462DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
463
464#undef do_sqrshl_d
465
466#define do_uqrshl_b(n, m) \
467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
468#define do_uqrshl_h(n, m) \
469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
470#define do_uqrshl_s(n, m) \
471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
472#define do_uqrshl_d(n, m) \
473 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
474
475DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
476DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
477DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
478DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
479
480#undef do_uqrshl_d
481
/* Halving add: for sub-64-bit elements the sum fits in int64_t; for
 * 64-bit elements compute (n >> 1) + (m >> 1) plus the carry from the
 * two low bits, which cannot overflow.
 */
#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

/* Rounding halving add: as above but rounding the result upward. */
#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/* Halving subtract; the borrow term handles the low-bit difference. */
#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
520
/* Clamp val to the inclusive range [min, max]; used to saturate
 * results for sub-64-bit elements computed in int64_t.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
525
/* Signed saturating add for byte/halfword/word: the int64_t sum cannot
 * overflow, so a range clamp suffices.
 */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
529
/* Signed saturating 64-bit add.  The sum is computed in unsigned
 * arithmetic to avoid signed-overflow UB (the original relied on
 * QEMU's -fwrapv); the sign-bit test then detects overflow: it occurs
 * iff n and m have the same sign and r differs from it.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Overflow: saturate toward the sign of the true result. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
539
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

/* Unsigned saturating add for narrow elements, via clamp in int64_t. */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
548
/* Unsigned saturating 64-bit add: a wrapped sum is smaller than
 * either operand, so compare against n to detect overflow.
 */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;

    if (sum < n) {
        return UINT64_MAX;
    }
    return sum;
}
554
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

/* Signed saturating subtract for narrow elements, via clamp. */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
563
/* Signed saturating 64-bit subtract.  As with do_sqadd_d, compute the
 * difference in unsigned arithmetic to avoid signed-overflow UB (the
 * original relied on QEMU's -fwrapv).  Overflow occurs iff n and m
 * have different signs and r's sign differs from n's.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Overflow: saturate toward the sign of the true result. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
573
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

/* Unsigned saturating subtract for narrow elements, clamped at 0. */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
582
/* Unsigned saturating 64-bit subtract: clamp at zero on underflow. */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
587
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

/* SUQADD: signed n plus unsigned m, saturated to the signed range.
 * n arrives as an unsigned lane value and must be sign-extended.
 */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
599
/* SUQADD for 64-bit elements: signed n plus unsigned m, saturated to
 * [INT64_MIN, INT64_MAX].  The only change from the original is the
 * negation of n through uint64_t, which avoids signed-negation UB
 * when n == INT64_MIN.
 */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -(uint64_t)n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
622
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

/* USQADD: unsigned n plus signed m, saturated to the unsigned range.
 * m arrives as an unsigned lane value and must be sign-extended.
 */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
634
/* USQADD for 64-bit elements: unsigned n plus signed m, saturated to
 * [0, UINT64_MAX].  The only change from the original is the negation
 * of m through uint64_t, which avoids signed-negation UB when
 * m == INT64_MIN.
 */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /* Underflow clamps to zero. */
        return n < -(uint64_t)m ? 0 : r;
    }
    /* Overflow clamps to UINT64_MAX. */
    return r < n ? UINT64_MAX : r;
}
644
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
652
653
654
655
656
657
658
/* Expand a predicated pairwise operation: within each pair of lanes,
 * the even result lane gets OP applied to a pair of vn elements and
 * the odd result lane gets OP applied to the corresponding vm pair.
 * All four inputs are loaded before either store so that vd may
 * alias vn or vm.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* As DO_ZPZZ_PAIR, for 8-byte elements. */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
            d[i] = OP(n0, n1); \
        } \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
        } \
    } \
}
700
/* SVE2 pairwise integer operations (ADDP, SMAXP/UMAXP, SMINP/UMINP). */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
728
/* As DO_ZPZZ_PAIR, but for floating-point operations that take a
 * float_status pointer for rounding mode and exception accumulation.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* SVE2 floating-point pairwise operations. */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP
774
775
776
777
778
/* Expand a predicated shift whose count operand is a 64-bit ("wide")
 * element: each 8-byte group of narrow elements shares one count,
 * hence the inner loop over (i & 7) and the uint8_t predicate read.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
809
810
811
/* Expand a predicated unary operation for 1/2/4-byte elements;
 * inactive destination elements are left unchanged (merging).
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* As DO_ZPZ, for 8-byte elements. */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}
842
/* CLS/CLZ on sub-word elements: compute on the 32-bit promotion and
 * subtract the extra leading bits.
 */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* CNT: population count per element. */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* CNOT: logical not — 1 if the element is zero, else 0. */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* FABS/FNEG operate on the raw bit pattern: clear or flip the sign bit. */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign- and zero-extension from a narrower sub-element. */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* REVB/REVH/REVW: byte/halfword/word reversal within elements. */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
933
934void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
935{
936 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
937 uint64_t *d = vd, *n = vn;
938 uint8_t *pg = vg;
939
940 for (i = 0; i < opr_sz; i += 2) {
941 if (pg[H1(i)] & 1) {
942 uint64_t n0 = n[i + 0];
943 uint64_t n1 = n[i + 1];
944 d[i + 0] = n1;
945 d[i + 1] = n0;
946 }
947 }
948}
949
/* RBIT: bit reversal within each element. */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* SQABS: absolute value with the most-negative input saturating to
 * the most-positive value instead of wrapping.
 */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/* SQNEG: negation with the same saturation of the most-negative input. */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* URECPE/URSQRTE reuse the AArch32 estimate helpers. */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
975
976
977
/* Unpredicated shift by a 64-bit wide element: one count per 8-byte
 * group of narrow elements, as in DO_ZPZW but with no predicate.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1017
1018
1019
1020
1021
/* Expand a widening three-operand operation: narrow elements of vn/vm
 * produce wide elements in vd.  Bits 0 and 1 of the descriptor data
 * select the top (odd) or bottom (even) narrow halves of each wide
 * element for the first and second source respectively.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1066
1067
/*
 * Saturating doubling multiply: compute the widened product (the
 * callers pass already-widened narrow operands, so n * m fits in the
 * result type) and double it via a saturating add of the product to
 * itself (DO_SQADD_* / do_sqadd_d are defined earlier in this file).
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    /* NOTE: for the _d case the doubling saturates at int64 range. */
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

/* SQDMULLB/SQDMULLT: saturating doubling widening multiply. */
DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
1091
/*
 * Wide + narrow ("wide-to-bottom/top") expansion (SVE2 *ADDW/*SUBW):
 * first operand is already wide; the second is a narrow element
 * selected as bottom or top of each wide slot by the simd data bit.
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP)                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                              \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB
1121
/*
 * Narrow top/bottom expansion without widening (SVE2 EORBT/EORTB):
 * elements are processed in pairs; sel1/sel2 pick the even or odd
 * element of each pair for the two operands, and the result is
 * written back to the slot selected by sel1.
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
1141
/*
 * Widening accumulate (SVE2 *ABAL/*MLAL etc): widen narrow elements of
 * vn and vm (both taken from the bottom or top half, per simd data),
 * apply OP, and add the result to the wide accumulator from va.
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
    }                                                           \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1170
/* Negated multiply for the *MLSL expansions below (accumulator adds
 * the negated product).  Arguments fully parenthesized so that
 * expression arguments expand correctly. */
#define DO_NMUL(N, M)  -((N) * (M))
1172
/* Widening multiply-subtract: accumulate the negated product. */
DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
1182
/*
 * Narrowing to the bottom half (XTNB family): apply the saturating
 * narrowing OP in place, then mask off the top half of each wide
 * element, leaving the narrowed result in the even (bottom) lanes.
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

/*
 * Narrowing to the top half (XTNT family): write only the narrow
 * result into the odd (top) lane of each wide element, preserving the
 * bottom lane of vd.  "odd" is the host-adjusted byte offset of the
 * top narrow element within a wide element.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

/* Signed saturating narrow (SQXTNB/SQXTNT). */
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

/* Unsigned saturating narrow (UQXTNB/UQXTNT). */
#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

/*
 * Signed-to-unsigned saturating narrow (SQXTUNB/SQXTUNT): signed
 * input saturated to the unsigned narrow range.
 */
DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
1238
/*
 * SVE2 ADCLB/ADCLT/SBCLB/SBCLT, 32-bit elements.
 * Each 64-bit lane of vd/vm pairs with two 32-bit elements of va/vn:
 *   e1  = the even 32-bit element of the addend va;
 *   e2  = the bottom/top element of vn (sel), XORed with all-ones
 *         when the subtract bit is set (inv);
 *   c   = carry-in, taken from bit 32 of the 64-bit element of vm.
 * The full 33-bit sum is stored as 64 bits, so the sum lands in the
 * low 32 bits and the carry-out lands in bit 32 for the next chain.
 */
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);

        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
1255
/*
 * SVE2 ADCLB/ADCLT/SBCLB/SBCLT, 64-bit elements.
 * Elements are processed in pairs: the even 64-bit lane of vd gets the
 * 64-bit sum and the odd lane gets the carry-out, so 128-bit
 * arithmetic is used to capture the carry.  Carry-in is bit 0 of the
 * odd lane of vm; sel picks bottom/top operand from vn; inv is the
 * all-ones XOR mask for the subtract forms.
 */
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);   /* carry-out */
    }
}
1272
/*
 * Saturating doubling multiply-accumulate long (SQDMLAL[BT]/SQDMLSL[BT]):
 * widen the selected bottom/top narrow elements, form the saturating
 * doubled product with DMUL_OP, then saturating add/sub into the wide
 * accumulator with SUM_OP.
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP)         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
    }                                                                   \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
1302
/*
 * Complex integer multiply-add (CMLA and friends, vector form).
 * Elements are (real, imag) pairs.  The 2-bit rotation in simd_data
 * selects which element of each pair of vn/vm is multiplied (sel_a/
 * sel_b) and whether the real/imaginary products are subtracted
 * (sub_r/sub_i), implementing the four 90-degree rotations.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
    int rot = simd_data(desc);                                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
    bool sub_r = rot == 1 || rot == 2;                          \
    bool sub_i = rot >= 2;                                      \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE elt1_a = n[H(i + sel_a)];                          \
        TYPE elt2_a = m[H(i + sel_a)];                          \
        TYPE elt2_b = m[H(i + sel_b)];                          \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
    }                                                           \
}
1320
/* Multiply-accumulate with optional negation of the product (S selects
 * subtract).  Arguments fully parenthesized for safe expansion. */
#define DO_CMLA(N, M, A, S) ((A) + ((N) * (M)) * ((S) ? -1 : 1))
1322
/* CMLA (vectors) for all four element sizes. */
DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

/*
 * Saturating rounding doubling complex multiply-accumulate, mapped
 * onto the do_sqrdmlah_* helpers (round = true); the h/s forms
 * discard the saturation flag output.
 */
#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1341
/*
 * Complex multiply-add with the second operand taken from an indexed
 * complex pair within each 128-bit segment of vm (CMLA/SQRDCMLAH,
 * indexed forms).  rot/sel/sub decode as in DO_CMLA_FUNC above.
 *
 * Fix: the destination writes used the fixed H2() index adjustment
 * regardless of element size, while the reads of vn and va at the
 * same logical index use the per-size H().  For the 32-bit
 * instantiation on a big-endian host this addressed different lanes
 * for read and write; use H() consistently (no change on
 * little-endian hosts, where all H macros are the identity).
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                      \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;              \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                             \
    bool sub_r = rot == 1 || rot == 2;                                  \
    bool sub_i = rot >= 2;                                              \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                            \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {     \
        TYPE elt2_a = m[H(i + idx + sel_a)];                            \
        TYPE elt2_b = m[H(i + idx + sel_b)];                            \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                    \
            TYPE elt1_a = n[H(i + j + sel_a)];                          \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);       \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
        }                                                               \
    }                                                                   \
}
1362
/* Indexed CMLA / SQRDCMLAH, 16- and 32-bit elements only. */
DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
1376
1377
/*
 * Complex dot-product step for one 32-bit lane: two complex pairs of
 * int8 (real, imag) in n, multiplied against the pair elements of m
 * chosen by sel_a/sel_b, with sub_i (+1/-1) applied to the imaginary
 * product; accumulate into a.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int32_t acc = a;

    for (int pair = 0; pair < 2; ++pair) {
        int shift = 16 * pair;
        int32_t re = (int8_t)(n >> shift);
        int32_t im = (int8_t)(n >> (shift + 8));
        int32_t ma = (int8_t)(m >> (shift + 8 * sel_a));
        int32_t mb = (int8_t)(m >> (shift + 8 * sel_b));

        acc += re * ma + im * mb * sub_i;
    }
    return acc;
}
1391
/*
 * Complex dot-product step for one 64-bit lane: as do_cdot_s, but the
 * complex components are int16 halves of 32-bit sub-lanes.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int64_t acc = a;

    for (int pair = 0; pair < 2; ++pair) {
        int shift = 32 * pair;
        int64_t re = (int16_t)(n >> shift);
        int64_t im = (int16_t)(n >> (shift + 16));
        int64_t ma = (int16_t)(m >> (shift + 16 * sel_a));
        int64_t mb = (int16_t)(m >> (shift + 16 * sel_b));

        acc += re * ma + im * mb * sub_i;
    }
    return acc;
}
1405
1406void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1407 void *va, uint32_t desc)
1408{
1409 int opr_sz = simd_oprsz(desc);
1410 int rot = simd_data(desc);
1411 int sel_a = rot & 1;
1412 int sel_b = sel_a ^ 1;
1413 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1414 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1415
1416 for (int e = 0; e < opr_sz / 4; e++) {
1417 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1418 }
1419}
1420
1421void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1422 void *va, uint32_t desc)
1423{
1424 int opr_sz = simd_oprsz(desc);
1425 int rot = simd_data(desc);
1426 int sel_a = rot & 1;
1427 int sel_b = sel_a ^ 1;
1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1429 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1430
1431 for (int e = 0; e < opr_sz / 8; e++) {
1432 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1433 }
1434}
1435
/*
 * SVE2 CDOT (indexed), 8-bit sources: one 32-bit complex group of vm,
 * chosen by the index field (host-order adjusted with H4), is shared
 * by all four lanes of each 128-bit segment.
 */
void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
1455
/*
 * SVE2 CDOT (indexed), 16-bit sources: one 64-bit complex group of vm
 * per 128-bit segment, shared by both 64-bit lanes of the segment.
 */
void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}
1475
/*
 * Indexed multiply-accumulate without widening (SQRDML[AS]H, indexed):
 * one element of vm, selected by the index within each 128-bit
 * segment, is combined with every element of vn in that segment and
 * the accumulator from va.
 */
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
    intptr_t i, j, idx = simd_data(desc);                               \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[i];                                                 \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
        }                                                               \
    }                                                                   \
}

/* Accumulate forms (neg = false, round = true). */
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/* Subtract forms (neg = true, round = true). */
#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
1511
/*
 * Indexed widening multiply-accumulate: one narrow element of vm,
 * selected by the index within each 128-bit segment, is widened and
 * combined with the selected (bottom/top) narrow elements of vn and
 * the wide accumulator from va.
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP)                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                   \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                        \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                       \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                \
            TYPEW aa = *(TYPEW *)(va + HW(i + j));                      \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                \
        }                                                               \
    }                                                                   \
}
1527
/* Multiply-accumulate; arguments fully parenthesized for safe expansion. */
#define DO_MLA(N, M, A)  ((A) + (N) * (M))
1529
/* Indexed widening multiply-add (SMLALB/T, UMLALB/T). */
DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1534
/* Multiply-subtract; arguments fully parenthesized for safe expansion. */
#define DO_MLS(N, M, A)  ((A) - (N) * (M))
1536
/* Indexed widening multiply-subtract (SMLSLB/T, UMLSLB/T). */
DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Indexed saturating doubling multiply-add/subtract long. */
#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1557
/*
 * Indexed widening multiply (no accumulator): as DO_ZZXW but the
 * product is stored directly.
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) {                                   \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                        \
        for (j = 0; j < 16; j += sizeof(TYPEW)) {                       \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                    \
        }                                                               \
    }                                                                   \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
1583
/*
 * Bit-permute expansion (SVE2 BEXT/BDEP/BGRP): OP receives the data
 * element, the mask element and the element width in bits.
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                \
        TYPE nn = *(TYPE *)(vn + i);                            \
        TYPE mm = *(TYPE *)(vm + i);                            \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);       \
    }                                                           \
}
1594
/*
 * BEXT: gather the data bits selected by mask, packing them
 * contiguously from bit 0 of the result.  Only the low n bits of
 * data/mask participate.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int pos = 0;

    for (int bit = 0; bit < n; ++bit) {
        if ((mask >> bit) & 1) {
            out |= ((data >> bit) & 1) << pos;
            pos++;
        }
    }
    return out;
}
1608
/* BEXT for all element sizes. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1613
/*
 * BDEP: scatter the low-order bits of data into the result positions
 * selected by mask, consuming data bits from bit 0 upward.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int src = 0;

    for (int bit = 0; bit < n; ++bit) {
        if ((mask >> bit) & 1) {
            out |= ((data >> src) & 1) << bit;
            src++;
        }
    }
    return out;
}
1627
/* BDEP for all element sizes. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1632
/*
 * BGRP: pack the data bits selected by mask into the low part of the
 * result (resm, rbm bits) and the unselected bits above them (resu).
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When all 64 bits are masked, rbm == 64 and resu == 0; guard the
     * shift so we never use an out-of-range (undefined) shift count.
     */
    if (rbm < 64) {
        resm |= resu << rbm;
    }
    return resm;
}
1649
/* BGRP for all element sizes. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
1656
/*
 * Complex add with rotate (CADD/SQCADD): elements are (real, imag)
 * pairs; the simd data bit selects which 90-degree rotation of vm is
 * added, i.e. whether the real part receives +imag/-imag and the
 * imaginary part -real/+real.  ADD_OP/SUB_OP supply plain or
 * saturating arithmetic.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}

DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

#undef DO_CADD
1698
/*
 * Shift-left long (SSHLL/USHLL): widen the bottom/top narrow element
 * (bit 0 of simd data) and shift left by the immediate (remaining
 * bits of simd data).
 *
 * NOTE(review): for the signed forms, nn << shift left-shifts a
 * negative value, which is formally undefined in C; presumably this
 * relies on the build producing two's-complement shifts — confirm
 * against the project's compiler flags.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)            \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);       \
    int shift = simd_data(desc) >> 1;                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));                \
        *(TYPEW *)(vd + HW(i)) = nn << shift;                   \
    }                                                           \
}

DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

#undef DO_ZZI_SHLL
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
/*
 * Predicated horizontal reduction for 8/16/32-bit elements: fold the
 * active elements of vn into a scalar with OP, starting from INIT.
 * The predicate is read 16 bits at a time (covering 16 vector bytes);
 * bit 0 of each element's predicate byte marks it active.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP)    \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)        \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPERED ret = INIT;                                         \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));           \
                ret = OP(ret, nn);                              \
            }                                                   \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);       \
        } while (i & 15);                                       \
    }                                                           \
    return (TYPERET)ret;                                        \
}
1747
/*
 * Predicated horizontal reduction for 64-bit elements: one predicate
 * byte governs each element; only bit 0 of that byte is tested.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)                  \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)        \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPEE *n = vn;                                              \
    uint8_t *pg = vg;                                           \
    TYPER ret = INIT;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPEE nn = n[i];                                    \
            ret = OP(ret, nn);                                  \
        }                                                       \
    }                                                           \
    return ret;                                                 \
}
1763
/* Bitwise reductions: identity 0 for OR/EOR, all-ones for AND. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Sum reductions widen to 64 bits (sign- or zero-extended elements). */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/* Min/max reductions start from the type's identity element. */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1810
1811
/*
 * Vector-immediate expansion: combine every element of vn with the
 * scalar s64 (truncated to the element type) using OP.
 */
#define DO_ZZI(NAME, TYPE, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
    TYPE s = s64, *d = vd, *n = vn;                             \
    for (i = 0; i < opr_sz; ++i) {                              \
        d[i] = OP(n[i], s);                                     \
    }                                                           \
}
1821
/* Reversed subtract (immediate - element); arguments fully
 * parenthesized for safe expansion. */
#define DO_SUBR(X, Y)   ((Y) - (X))
1823
/* SUBR (immediate) and MIN/MAX (immediate) for all element sizes. */
DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Retire the shared element-operation macros for this section. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1866
1867
1868
1869
1870static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1871{
1872 uint64_t mask = pred_esz_masks[esz];
1873 intptr_t i = words;
1874
1875 do {
1876 uint64_t this_g = g[--i] & mask;
1877 if (this_g) {
1878 return i * 64 + (63 - clz64(this_g));
1879 }
1880 } while (i > 0);
1881 return (intptr_t)-1 << esz;
1882}
1883
/*
 * PFIRST: force the first governed (active in vg) element of vd to be
 * set, then compute the predicate-test NZCV flags over the result.
 * Bit 2 of flags doubles as the "first active position already seen"
 * marker (see iter_predtest_fwd), so the |= of the lowest set bit of
 * this_g happens at most once.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
1907
/*
 * PNEXT: find the next active element of vg after the last active
 * element of vd, write vd as a one-hot predicate for that element (or
 * all zeros if none), and return the predicate-test flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* First candidate: one element past the last active element of vd
     * (or element 0 if vd is empty, since last_active_element then
     * returns -(1 << esz)). */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Scan vg forward from the candidate for the next active bit. */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            /* Mask off the bits below the candidate in its word. */
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write vd one-hot at "next" (zero everywhere if next ran off the
     * end) and accumulate the flags as we go. */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
1951
1952
1953
1954
1955
1956void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1957{
1958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1959 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1960 uint64_t *d = vd, *n = vn;
1961 uint8_t *pg = vg;
1962
1963 for (i = 0; i < opr_sz; i += 1) {
1964 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1965 }
1966}
1967
1968void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1969{
1970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1971 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1972 uint64_t *d = vd, *n = vn;
1973 uint8_t *pg = vg;
1974
1975 for (i = 0; i < opr_sz; i += 1) {
1976 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1977 }
1978}
1979
1980void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1981{
1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1984 uint64_t *d = vd, *n = vn;
1985 uint8_t *pg = vg;
1986
1987 for (i = 0; i < opr_sz; i += 1) {
1988 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1989 }
1990}
1991
1992void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1993{
1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1995 uint64_t *d = vd, *n = vn;
1996 uint8_t *pg = vg;
1997 uint8_t inv = simd_data(desc);
1998
1999 for (i = 0; i < opr_sz; i += 1) {
2000 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2001 }
2002}
2003
2004
2005
/*
 * Two-operand predicated expander with an immediate second operand
 * taken from simd_data, for 1/2/4-byte elements.  The predicate is
 * consumed 16 bits (16 bytes of elements) at a time.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    TYPE imm = simd_data(desc);                                         \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);                     \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Same expansion for 8-byte elements: one predicate byte per element,
   testing only its low bit.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                          \
    TYPE *d = vd, *n = vn;                                              \
    TYPE imm = simd_data(desc);                                         \
    uint8_t *pg = vg;                                                   \
    for (i = 0; i < opr_sz; i += 1) {                                   \
        if (pg[H1(i)] & 1) {                                            \
            TYPE nn = n[i];                                             \
            d[i] = OP(nn, imm);                                         \
        }                                                               \
    }                                                                   \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division: adding (1 << M) - 1 to a
   negative N before shifting rounds the quotient toward zero.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2046
/*
 * Unsigned rounding shift right: shift X right by SH, then add the
 * last bit shifted out (round half up).  SH may exceed the word
 * width: for SH == 64 only the rounding bit (bit 63) survives, and
 * for anything larger the result is 0.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh < 64) {
        uint64_t shifted = x >> sh;
        uint64_t round_bit = (x >> (sh - 1)) & 1;
        return shifted + round_bit;
    }
    if (sh == 64) {
        return x >> 63;
    }
    return 0;
}
2057
/*
 * Signed rounding shift right: as do_urshr, but for SH >= 64 the
 * only contribution would come from rounding the sign bit, which
 * always produces 0.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh < 64) {
        int64_t shifted = x >> sh;
        int64_t round_bit = (x >> (sh - 1)) & 1;
        return shifted + round_bit;
    }
    return 0;
}
2067
/* Predicated shifts by immediate.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 saturating and rounding shifts by immediate.  */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/* Two-argument wrappers discarding the saturation flag.  */
#define do_suqrshl_b(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
/* Note: this macro expands to a call of the like-named function;
   C macros do not recursively re-expand their own name.  */
#define do_suqrshl_d(n, m) \
   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2126
/*
 * Narrowing shift into the even (bottom) elements.  The result is
 * cast to TYPEN but stored as a full TYPEW, which zeroes the odd
 * half of each wide element in one store.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                        \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)            \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int shift = simd_data(desc);                                \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEW *)(vn + i);                          \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);              \
    }                                                           \
}

/* Narrowing shift into the odd (top) elements; the even elements of
   VD are left unchanged.  */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
2148
/* Plain truncating narrowing shifts.  */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

/* Rounding narrowing shifts.  */
DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/*
 * Signed shift saturated to an unsigned result (SQSHRUN).  The _d
 * form clamps the shift count at 63 so the sign still propagates
 * when sh == 64.
 */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* Signed rounding shift saturated to an unsigned result (SQRSHRUN).  */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/* Signed shift with signed saturation (SQSHRN).  */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* Signed rounding shift with signed saturation (SQRSHRN).  */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)

/* Unsigned shift with unsigned saturation (UQSHRN): MIN suffices.  */
#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)

/* Unsigned rounding shift with unsigned saturation (UQRSHRN).  */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
2240
/*
 * Narrowing binary op into the even (bottom) elements; the TYPEN
 * cast stored through a TYPEW pointer zeroes the odd halves.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)               \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEW *)(vn + i);                          \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);          \
    }                                                           \
}

/* Narrowing binary op into the odd (top) elements, leaving the even
   elements of VD unchanged.  */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)               \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                              \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                              \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);     \
    }                                                                   \
}

/* High-half add/sub; the R forms add a rounding constant of half the
   discarded low half before shifting.  */
#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)

DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)

DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)

DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)

DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)

#undef DO_RSUBHN
#undef DO_SUBHN
#undef DO_RADDHN
#undef DO_ADDHN

#undef DO_BINOPNB
/* NOTE(review): DO_BINOPNT is left defined here; consider adding
   #undef DO_BINOPNT for symmetry if no later code depends on it.  */
2307
2308
/*
 * Fully general three-operand predicated expander (accumulator +
 * two sources) for 1/2/4-byte elements; the predicate is consumed
 * 16 bits (16 bytes of elements) at a time.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Same expansion for 8-byte elements: one predicate byte each,
   testing only its low bit.  */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}

#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
2363
2364void HELPER(sve_index_b)(void *vd, uint32_t start,
2365 uint32_t incr, uint32_t desc)
2366{
2367 intptr_t i, opr_sz = simd_oprsz(desc);
2368 uint8_t *d = vd;
2369 for (i = 0; i < opr_sz; i += 1) {
2370 d[H1(i)] = start + i * incr;
2371 }
2372}
2373
2374void HELPER(sve_index_h)(void *vd, uint32_t start,
2375 uint32_t incr, uint32_t desc)
2376{
2377 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2378 uint16_t *d = vd;
2379 for (i = 0; i < opr_sz; i += 1) {
2380 d[H2(i)] = start + i * incr;
2381 }
2382}
2383
2384void HELPER(sve_index_s)(void *vd, uint32_t start,
2385 uint32_t incr, uint32_t desc)
2386{
2387 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2388 uint32_t *d = vd;
2389 for (i = 0; i < opr_sz; i += 1) {
2390 d[H4(i)] = start + i * incr;
2391 }
2392}
2393
2394void HELPER(sve_index_d)(void *vd, uint64_t start,
2395 uint64_t incr, uint32_t desc)
2396{
2397 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2398 uint64_t *d = vd;
2399 for (i = 0; i < opr_sz; i += 1) {
2400 d[i] = start + i * incr;
2401 }
2402}
2403
2404void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2405{
2406 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2407 uint32_t sh = simd_data(desc);
2408 uint32_t *d = vd, *n = vn, *m = vm;
2409 for (i = 0; i < opr_sz; i += 1) {
2410 d[i] = n[i] + (m[i] << sh);
2411 }
2412}
2413
2414void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2415{
2416 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2417 uint64_t sh = simd_data(desc);
2418 uint64_t *d = vd, *n = vn, *m = vm;
2419 for (i = 0; i < opr_sz; i += 1) {
2420 d[i] = n[i] + (m[i] << sh);
2421 }
2422}
2423
2424void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2425{
2426 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2427 uint64_t sh = simd_data(desc);
2428 uint64_t *d = vd, *n = vn, *m = vm;
2429 for (i = 0; i < opr_sz; i += 1) {
2430 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2431 }
2432}
2433
2434void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2435{
2436 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2437 uint64_t sh = simd_data(desc);
2438 uint64_t *d = vd, *n = vn, *m = vm;
2439 for (i = 0; i < opr_sz; i += 1) {
2440 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2441 }
2442}
2443
/*
 * FEXPA, half precision: the low 5 bits of each input select a table
 * entry for the fraction field and the next 5 bits are placed in the
 * float16 exponent field (bit 10).  Table values presumably encode
 * the fraction of 2^(i/32) — TODO confirm against the Arm FEXPA
 * pseudocode tables.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}

/*
 * FEXPA, single precision: low 6 bits index the table, the next
 * 8 bits are placed in the float32 exponent field (bit 23).
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}

/*
 * FEXPA, double precision: low 6 bits index the table, the next
 * 11 bits are placed in the float64 exponent field (bit 52).
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
2533
2534void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2535{
2536 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2537 uint16_t *d = vd, *n = vn, *m = vm;
2538 for (i = 0; i < opr_sz; i += 1) {
2539 uint16_t nn = n[i];
2540 uint16_t mm = m[i];
2541 if (mm & 1) {
2542 nn = float16_one;
2543 }
2544 d[i] = nn ^ (mm & 2) << 14;
2545 }
2546}
2547
2548void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2549{
2550 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2551 uint32_t *d = vd, *n = vn, *m = vm;
2552 for (i = 0; i < opr_sz; i += 1) {
2553 uint32_t nn = n[i];
2554 uint32_t mm = m[i];
2555 if (mm & 1) {
2556 nn = float32_one;
2557 }
2558 d[i] = nn ^ (mm & 2) << 30;
2559 }
2560}
2561
2562void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2563{
2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2565 uint64_t *d = vd, *n = vn, *m = vm;
2566 for (i = 0; i < opr_sz; i += 1) {
2567 uint64_t nn = n[i];
2568 uint64_t mm = m[i];
2569 if (mm & 1) {
2570 nn = float64_one;
2571 }
2572 d[i] = nn ^ (mm & 2) << 62;
2573 }
2574}
2575
2576
2577
2578
2579
2580void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2581{
2582 intptr_t i, oprsz = simd_oprsz(desc);
2583
2584 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2585 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2586 }
2587}
2588
2589void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2590{
2591 intptr_t i, oprsz = simd_oprsz(desc);
2592
2593 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2594 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2595 }
2596}
2597
2598void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2599{
2600 intptr_t i, oprsz = simd_oprsz(desc);
2601
2602 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2603 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2604 }
2605}
2606
2607void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2608{
2609 intptr_t i, oprsz = simd_oprsz(desc);
2610
2611 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2612 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2613 }
2614}
2615
2616
2617
2618
2619
2620void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2621{
2622 intptr_t i, oprsz = simd_oprsz(desc);
2623
2624 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2625 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2626 }
2627}
2628
2629void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2630{
2631 intptr_t i, oprsz = simd_oprsz(desc);
2632
2633 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2634 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2635 }
2636}
2637
2638void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2639{
2640 intptr_t i, oprsz = simd_oprsz(desc);
2641
2642 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2643 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2644 }
2645}
2646
2647void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2648{
2649 intptr_t i, oprsz = simd_oprsz(desc);
2650
2651 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2652 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2653 }
2654}
2655
2656void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2657{
2658 intptr_t i, oprsz = simd_oprsz(desc);
2659
2660 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2661 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2662 }
2663}
2664
2665
2666
2667
2668void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2669 uint64_t mm, uint32_t desc)
2670{
2671 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2672 uint64_t *d = vd, *n = vn;
2673 uint8_t *pg = vg;
2674
2675 mm = dup_const(MO_8, mm);
2676 for (i = 0; i < opr_sz; i += 1) {
2677 uint64_t nn = n[i];
2678 uint64_t pp = expand_pred_b(pg[H1(i)]);
2679 d[i] = (mm & pp) | (nn & ~pp);
2680 }
2681}
2682
2683void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2684 uint64_t mm, uint32_t desc)
2685{
2686 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2687 uint64_t *d = vd, *n = vn;
2688 uint8_t *pg = vg;
2689
2690 mm = dup_const(MO_16, mm);
2691 for (i = 0; i < opr_sz; i += 1) {
2692 uint64_t nn = n[i];
2693 uint64_t pp = expand_pred_h(pg[H1(i)]);
2694 d[i] = (mm & pp) | (nn & ~pp);
2695 }
2696}
2697
2698void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2699 uint64_t mm, uint32_t desc)
2700{
2701 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2702 uint64_t *d = vd, *n = vn;
2703 uint8_t *pg = vg;
2704
2705 mm = dup_const(MO_32, mm);
2706 for (i = 0; i < opr_sz; i += 1) {
2707 uint64_t nn = n[i];
2708 uint64_t pp = expand_pred_s(pg[H1(i)]);
2709 d[i] = (mm & pp) | (nn & ~pp);
2710 }
2711}
2712
2713void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2714 uint64_t mm, uint32_t desc)
2715{
2716 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2717 uint64_t *d = vd, *n = vn;
2718 uint8_t *pg = vg;
2719
2720 for (i = 0; i < opr_sz; i += 1) {
2721 uint64_t nn = n[i];
2722 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2723 }
2724}
2725
2726void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2727{
2728 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2729 uint64_t *d = vd;
2730 uint8_t *pg = vg;
2731
2732 val = dup_const(MO_8, val);
2733 for (i = 0; i < opr_sz; i += 1) {
2734 d[i] = val & expand_pred_b(pg[H1(i)]);
2735 }
2736}
2737
2738void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2739{
2740 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2741 uint64_t *d = vd;
2742 uint8_t *pg = vg;
2743
2744 val = dup_const(MO_16, val);
2745 for (i = 0; i < opr_sz; i += 1) {
2746 d[i] = val & expand_pred_h(pg[H1(i)]);
2747 }
2748}
2749
2750void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2751{
2752 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2753 uint64_t *d = vd;
2754 uint8_t *pg = vg;
2755
2756 val = dup_const(MO_32, val);
2757 for (i = 0; i < opr_sz; i += 1) {
2758 d[i] = val & expand_pred_s(pg[H1(i)]);
2759 }
2760}
2761
2762void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2763{
2764 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2765 uint64_t *d = vd;
2766 uint8_t *pg = vg;
2767
2768 for (i = 0; i < opr_sz; i += 1) {
2769 d[i] = (pg[H1(i)] & 1 ? val : 0);
2770 }
2771}
2772
2773
2774
2775
/*
 * Copy N bytes between vector-register storage areas, as if by
 * memmove, while preserving the host-endian element layout.
 * Big-endian hosts store elements with byte addresses frobbed by the
 * H* macros, so the copy must be done at the largest granule for
 * which source, destination and length are all aligned; an
 * 8-byte-aligned copy needs no frobbing at all.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#if !HOST_BIG_ENDIAN
    o = 0;   /* Little-endian host: a plain memmove is always correct.  */
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* 32-bit granule; choose copy direction by overlap, as
           memmove would.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 16-bit granule.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Byte granule.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
2832
2833
/*
 * Zero N bytes of vector-register storage, honoring the host-endian
 * element layout (see swap_memmove).
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* The common case (e.g. a fully-set predicate) is n == 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    o = 0;   /* Little-endian host: memset is always correct.  */
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2873
/*
 * EXT: concatenate the tail of N (from byte offset n_ofs) with the
 * head of M, handling all aliasing combinations of VD/VN/VM.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        /* VD does not alias VM: copy N's tail, then M's head.  */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* VD == VM: move M's head into place before overwriting it.  */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
2894
/* INSR: shift the vector up by one element and insert VAL at
   element 0.  */
#define DO_INSR(NAME, TYPE, H)                                          \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc)      \
{                                                                       \
    intptr_t opr_sz = simd_oprsz(desc);                                 \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));         \
    *(TYPE *)(vd + H(0)) = val;                                         \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, H1_8)

#undef DO_INSR
2909
2910void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2911{
2912 intptr_t i, j, opr_sz = simd_oprsz(desc);
2913 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2914 uint64_t f = *(uint64_t *)(vn + i);
2915 uint64_t b = *(uint64_t *)(vn + j);
2916 *(uint64_t *)(vd + i) = bswap64(b);
2917 *(uint64_t *)(vd + j) = bswap64(f);
2918 }
2919}
2920
2921void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2922{
2923 intptr_t i, j, opr_sz = simd_oprsz(desc);
2924 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2925 uint64_t f = *(uint64_t *)(vn + i);
2926 uint64_t b = *(uint64_t *)(vn + j);
2927 *(uint64_t *)(vd + i) = hswap64(b);
2928 *(uint64_t *)(vd + j) = hswap64(f);
2929 }
2930}
2931
2932void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2933{
2934 intptr_t i, j, opr_sz = simd_oprsz(desc);
2935 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2936 uint64_t f = *(uint64_t *)(vn + i);
2937 uint64_t b = *(uint64_t *)(vn + j);
2938 *(uint64_t *)(vd + i) = rol64(b, 32);
2939 *(uint64_t *)(vd + j) = rol64(f, 32);
2940 }
2941}
2942
2943void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2944{
2945 intptr_t i, j, opr_sz = simd_oprsz(desc);
2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947 uint64_t f = *(uint64_t *)(vn + i);
2948 uint64_t b = *(uint64_t *)(vn + j);
2949 *(uint64_t *)(vd + i) = b;
2950 *(uint64_t *)(vd + j) = f;
2951 }
2952}
2953
2954typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2955
2956static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2957 bool is_tbx, tb_impl_fn *fn)
2958{
2959 ARMVectorReg scratch;
2960 uintptr_t oprsz = simd_oprsz(desc);
2961
2962 if (unlikely(vd == vn)) {
2963 vn = memcpy(&scratch, vn, oprsz);
2964 }
2965
2966 fn(vd, vn, NULL, vm, oprsz, is_tbx);
2967}
2968
2969static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2970 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2971{
2972 ARMVectorReg scratch;
2973 uintptr_t oprsz = simd_oprsz(desc);
2974
2975 if (unlikely(vd == vn0)) {
2976 vn0 = memcpy(&scratch, vn0, oprsz);
2977 if (vd == vn1) {
2978 vn1 = vn0;
2979 }
2980 } else if (unlikely(vd == vn1)) {
2981 vn1 = memcpy(&scratch, vn1, oprsz);
2982 }
2983
2984 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2985}
2986
/*
 * Table lookup.  Each index selects an element from table 0; indexes in
 * the next range select from table 1 when present.  For out-of-range
 * indexes, TBL writes zero while TBX leaves the destination element
 * unchanged (the "continue" below skips the store).
 */
#define DO_TB(SUFF, TYPE, H) \
static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
 void *vm, uintptr_t oprsz, bool is_tbx) \
{ \
    TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
    uintptr_t i, nelem = oprsz / sizeof(TYPE); \
    for (i = 0; i < nelem; ++i) { \
        TYPE index = indexes[H1(i)], val = 0; \
        if (index < nelem) { \
            val = tbl0[H(index)]; \
        } else { \
            index -= nelem; \
            if (tbl1 && index < nelem) { \
                val = tbl1[H(index)]; \
            } else if (is_tbx) { \
                continue; \
            } \
        } \
        d[H(i)] = val; \
    } \
} \
void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
} \
void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
 void *vm, uint32_t desc) \
{ \
    do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
} \
void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
}

DO_TB(b, uint8_t, H1)
DO_TB(h, uint16_t, H2)
DO_TB(s, uint32_t, H4)
DO_TB(d, uint64_t, H8)

#undef DO_TB
3028
/*
 * Unpack: widen each narrow element of the low half of N into D.
 * If the source overlaps the destination such that input would be
 * consumed after being overwritten, copy the source half aside first.
 * (The caller selects the high half by passing an offset vn.)
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)

#undef DO_UNPK
3053
3054
3055
3056
3057
/*
 * Masks selecting the even-numbered units for unit sizes of
 * 1, 2, 4, 8 and 16 bits respectively.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};

/*
 * Zero-extend units of 2**N bits to units of 2**(N+1) bits:
 * each unit of the low 32 bits of X is followed by a zero unit
 * in the 64-bit result.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int lvl;

    x &= 0xffffffffu;
    for (lvl = 4; lvl >= n; lvl--) {
        int dist = 1 << lvl;

        x = ((x << dist) | x) & even_bit_esz_masks[lvl];
    }
    return x;
}

/*
 * Compress units of 2**(N+1) bits to units of 2**N bits:
 * the inverse of expand_bits, discarding the odd-numbered units.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int lvl;

    for (lvl = n; lvl <= 4; lvl++) {
        int dist = 1 << lvl;

        x &= even_bit_esz_masks[lvl];
        x |= x >> dist;
    }
    return x & 0xffffffffu;
}
3099
/*
 * ZIP for predicates: interleave the low (DATA=0) or high (DATA=1)
 * halves of predicates N and M into D, for elements of 1 << ESZ bytes.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The entire predicate fits within a single uint64_t. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap. */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            /* Convert the half selector into a byte offset. */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Whole words: expand 32 predicate bits at a time. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd predicate size: work byte by byte. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
3163
/*
 * UZP for predicates: concatenate the even-numbered (DATA=0) or
 * odd-numbered (DATA=1) elements of N and M into D, for elements
 * of 1 << ESZ bytes.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Everything fits in a single uint64_t. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* M may overlap the output; copy it aside if so. */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* Compress N into the low half of D, 16 bytes at a time. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * When oprsz is not a multiple of 16, the results from M do
         * not align on uint64_t boundaries of D.  Build the compressed
         * M into the temporary first, then move it into the high half
         * of D with swap_memmove.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            /* Final partial pair of words from N. */
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            /* Final partial pair of words from M. */
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned case: compress M directly into the high half. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3232
3233void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3234{
3235 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3236 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3237 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3238 uint64_t *d = vd, *n = vn, *m = vm;
3239 uint64_t mask;
3240 int shr, shl;
3241 intptr_t i;
3242
3243 shl = 1 << esz;
3244 shr = 0;
3245 mask = even_bit_esz_masks[esz];
3246 if (odd) {
3247 mask <<= shl;
3248 shr = shl;
3249 shl = 0;
3250 }
3251
3252 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3253 uint64_t nn = (n[i] & mask) >> shr;
3254 uint64_t mm = (m[i] & mask) << shl;
3255 d[i] = nn + mm;
3256 }
3257}
3258
3259
/*
 * Reverse the order of units of 2**N bits within a 64-bit word:
 * first reverse the bytes, then exchange progressively smaller
 * sub-units (nibbles, bit-pairs, bits) down to size 2**N.
 */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
3271
/* Reverse the order of units of 2**N bits within a single byte. */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int lvl, dist;

    /* Exchange nibbles, then bit-pairs, then bits, stopping at 2**N. */
    for (lvl = 2, dist = 4; lvl >= n; lvl--, dist >>= 1) {
        uint8_t lo = x & mask[lvl];
        uint8_t hi = (x >> dist) & mask[lvl];

        x = (lo << dist) | hi;
    }
    return x;
}
3282
/* REV for predicates: reverse the order of elements of 1 << ESZ bytes. */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Fits in one word: shift up so the reversal lands at bit 0. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Whole pairs of words: exchange and bit-reverse from both ends. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Odd size: exchange and bit-reverse byte by byte. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
3312
/*
 * PUNPK: widen the low (DATA=0) or high (DATA=1) half of predicate N
 * into D, inserting a zero bit after every source bit.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The entire predicate fits within a single uint64_t. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            /* Convert the half selector into a byte offset. */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Whole words: expand 32 predicate bits at a time. */
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd predicate size: work byte by byte. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
3358
3359#define DO_ZIP(NAME, TYPE, H) \
3360void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3361{ \
3362 intptr_t oprsz = simd_oprsz(desc); \
3363 intptr_t odd_ofs = simd_data(desc); \
3364 intptr_t i, oprsz_2 = oprsz / 2; \
3365 ARMVectorReg tmp_n, tmp_m; \
3366
3367 \
3368 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3369 vn = memcpy(&tmp_n, vn, oprsz_2); \
3370 } \
3371 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3372 vm = memcpy(&tmp_m, vm, oprsz_2); \
3373 } \
3374 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3375 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3376 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3377 *(TYPE *)(vm + odd_ofs + H(i)); \
3378 } \
3379 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3380 memset(vd + oprsz - 16, 0, 16); \
3381 } \
3382}
3383
3384DO_ZIP(sve_zip_b, uint8_t, H1)
3385DO_ZIP(sve_zip_h, uint16_t, H1_2)
3386DO_ZIP(sve_zip_s, uint32_t, H1_4)
3387DO_ZIP(sve_zip_d, uint64_t, H1_8)
3388DO_ZIP(sve2_zip_q, Int128, )
3389
/*
 * UZP for vectors: concatenate the even-numbered (odd_ofs=0) or
 * odd-numbered elements of ZN, then of ZM, into ZD.  The second pass
 * continues the p offset from where the first wrapped past oprsz.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i, p; \
    ARMVectorReg tmp_m; \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
        vm = memcpy(&tmp_m, vm, oprsz); \
    } \
    i = 0, p = odd_ofs; \
    do { \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
    } while (p < oprsz); \
    p -= oprsz; \
    do { \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
    } while (p < oprsz); \
    tcg_debug_assert(i == oprsz); \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3418
/*
 * TRN for vectors: for each pair of elements, take the even (odd_ofs=0)
 * or odd element from ZN and the matching element from ZM.  As with
 * ZIP, a quadword-element tail of 16 bytes is zeroed.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i; \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
        *(TYPE *)(vd + H(i + 0)) = ae; \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
    } \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
        memset(vd + oprsz - 16, 0, 16); \
    } \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, H1_8)
DO_TRN(sve2_trn_q, Int128, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
3445
3446void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3447{
3448 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3449 uint32_t *d = vd, *n = vn;
3450 uint8_t *pg = vg;
3451
3452 for (i = j = 0; i < opr_sz; i++) {
3453 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3454 d[H4(j)] = n[H4(i)];
3455 j++;
3456 }
3457 }
3458 for (; j < opr_sz; j++) {
3459 d[H4(j)] = 0;
3460 }
3461}
3462
3463void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3464{
3465 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3466 uint64_t *d = vd, *n = vn;
3467 uint8_t *pg = vg;
3468
3469 for (i = j = 0; i < opr_sz; i++) {
3470 if (pg[H1(i)] & 1) {
3471 d[j] = n[i];
3472 j++;
3473 }
3474 }
3475 for (; j < opr_sz; j++) {
3476 d[j] = 0;
3477 }
3478}
3479
3480
3481
3482
3483
3484int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3485{
3486 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3487 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3488
3489 return last_active_element(vg, words, esz);
3490}
3491
/*
 * SPLICE: copy the byte range spanning the first through last active
 * elements of N to the start of D, then fill the remainder of D from
 * the beginning of M.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word byte-offsets into byte indexes of the first
           and last active elements, then the byte length to move. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3528
3529void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3530 void *vg, uint32_t desc)
3531{
3532 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3533 uint64_t *d = vd, *n = vn, *m = vm;
3534 uint8_t *pg = vg;
3535
3536 for (i = 0; i < opr_sz; i += 1) {
3537 uint64_t nn = n[i], mm = m[i];
3538 uint64_t pp = expand_pred_b(pg[H1(i)]);
3539 d[i] = (nn & pp) | (mm & ~pp);
3540 }
3541}
3542
3543void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3544 void *vg, uint32_t desc)
3545{
3546 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3547 uint64_t *d = vd, *n = vn, *m = vm;
3548 uint8_t *pg = vg;
3549
3550 for (i = 0; i < opr_sz; i += 1) {
3551 uint64_t nn = n[i], mm = m[i];
3552 uint64_t pp = expand_pred_h(pg[H1(i)]);
3553 d[i] = (nn & pp) | (mm & ~pp);
3554 }
3555}
3556
3557void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3558 void *vg, uint32_t desc)
3559{
3560 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3561 uint64_t *d = vd, *n = vn, *m = vm;
3562 uint8_t *pg = vg;
3563
3564 for (i = 0; i < opr_sz; i += 1) {
3565 uint64_t nn = n[i], mm = m[i];
3566 uint64_t pp = expand_pred_s(pg[H1(i)]);
3567 d[i] = (nn & pp) | (mm & ~pp);
3568 }
3569}
3570
3571void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3572 void *vg, uint32_t desc)
3573{
3574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3575 uint64_t *d = vd, *n = vn, *m = vm;
3576 uint8_t *pg = vg;
3577
3578 for (i = 0; i < opr_sz; i += 1) {
3579 uint64_t nn = n[i], mm = m[i];
3580 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3581 }
3582}
3583
3584void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3585 void *vg, uint32_t desc)
3586{
3587 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3588 Int128 *d = vd, *n = vn, *m = vm;
3589 uint16_t *pg = vg;
3590
3591 for (i = 0; i < opr_sz; i += 1) {
3592 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3593 }
3594}
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
/*
 * Two-operand predicated comparison, producing a predicate result and
 * returning NZCV flags.  Elements are processed backwards in blocks of
 * 64 predicate bits; each block is masked by the governing predicate
 * and folded into the flags with iter_predtest_bwd.  MASK selects the
 * predicate bit positions valid for the element size.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3683
3684
/*
 * Predicated comparison against a wide second operand: one 64-bit
 * element of VM is compared against each narrow element of VN within
 * the same doubleword.  Otherwise as DO_CMP_PPZZ.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3760
3761
/*
 * Predicated comparison against an immediate second operand taken
 * from simd_data(desc).  Otherwise as DO_CMP_PPZZ.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3848
3849
/*
 * Return true if the last (highest-numbered) active element of VD,
 * as governed by predicate VG, is set; false when none are active.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i = QEMU_ALIGN_UP(oprsz, 8);

    while (i > 0) {
        uint64_t pg;

        i -= 8;
        pg = *(uint64_t *)(vg + i);
        if (pg != 0) {
            /* pow2floor isolates the highest set guard bit. */
            return (*(uint64_t *)(vd + i) & pow2floor(pg)) != 0;
        }
    }
    return false;
}
3862
3863
3864
3865
3866
/*
 * Compute the break mask for one 64-bit word of predicate bits.
 * N is the break trigger, G the governing predicate.  Returns the
 * new "break found" state; *RETB receives the propagation mask:
 * all guard bits before the break (inclusive when AFTER is set).
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        /* A break was found in an earlier word: everything inactive. */
        b = 0;
    } else if ((g & n) == 0) {
        /* No active trigger bit: propagate through the whole word. */
        b = g;
    } else {
        /* Isolate the lowest active trigger bit. */
        uint64_t first = (g & n) & -(g & n);

        b = after ? (first | (first - 1)) : (first - 1);
        brk = true;
    }

    *retb = b;
    return brk;
}
3892
3893
/* Zeroing BRK across a whole predicate: inactive lanes become zero. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b, mask = g[i];

        brk = compute_brk(&b, n[i], mask, brk, after);
        d[i] = b & mask;
    }
}
3907
3908
3909static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3910 intptr_t oprsz, bool after)
3911{
3912 uint32_t flags = PREDTEST_INIT;
3913 bool brk = false;
3914 intptr_t i;
3915
3916 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3917 uint64_t this_b, this_d, this_g = g[i];
3918
3919 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3920 d[i] = this_d = this_b & this_g;
3921 flags = iter_predtest_fwd(this_d, this_g, flags);
3922 }
3923 return flags;
3924}
3925
3926
/* Merging BRK: inactive lanes keep their previous destination value. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b, mask = g[i];

        brk = compute_brk(&b, n[i], mask, brk, after);
        d[i] = (b & mask) | (d[i] & ~mask);
    }
}
3940
3941
3942static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3943 intptr_t oprsz, bool after)
3944{
3945 uint32_t flags = PREDTEST_INIT;
3946 bool brk = false;
3947 intptr_t i;
3948
3949 for (i = 0; i < oprsz / 8; ++i) {
3950 uint64_t this_b, this_d = d[i], this_g = g[i];
3951
3952 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3954 flags = iter_predtest_fwd(this_d, this_g, flags);
3955 }
3956 return flags;
3957}
3958
/*
 * Zero a predicate register and return the flags for an all-false
 * predicate.  The whole register is cleared regardless of OPRSZ;
 * zeroing the full structure is simpler than looping over oprsz.
 */
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
3967
3968void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3969 uint32_t pred_desc)
3970{
3971 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3972 if (last_active_pred(vn, vg, oprsz)) {
3973 compute_brk_z(vd, vm, vg, oprsz, true);
3974 } else {
3975 do_zero(vd, oprsz);
3976 }
3977}
3978
3979uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3980 uint32_t pred_desc)
3981{
3982 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3983 if (last_active_pred(vn, vg, oprsz)) {
3984 return compute_brks_z(vd, vm, vg, oprsz, true);
3985 } else {
3986 return do_zero(vd, oprsz);
3987 }
3988}
3989
3990void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3991 uint32_t pred_desc)
3992{
3993 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3994 if (last_active_pred(vn, vg, oprsz)) {
3995 compute_brk_z(vd, vm, vg, oprsz, false);
3996 } else {
3997 do_zero(vd, oprsz);
3998 }
3999}
4000
4001uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4002 uint32_t pred_desc)
4003{
4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005 if (last_active_pred(vn, vg, oprsz)) {
4006 return compute_brks_z(vd, vm, vg, oprsz, false);
4007 } else {
4008 return do_zero(vd, oprsz);
4009 }
4010}
4011
4012void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4013{
4014 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4015 compute_brk_z(vd, vn, vg, oprsz, true);
4016}
4017
4018uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4019{
4020 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4021 return compute_brks_z(vd, vn, vg, oprsz, true);
4022}
4023
4024void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4025{
4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4027 compute_brk_z(vd, vn, vg, oprsz, false);
4028}
4029
4030uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4031{
4032 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4033 return compute_brks_z(vd, vn, vg, oprsz, false);
4034}
4035
4036void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4037{
4038 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4039 compute_brk_m(vd, vn, vg, oprsz, true);
4040}
4041
4042uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4043{
4044 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4045 return compute_brks_m(vd, vn, vg, oprsz, true);
4046}
4047
4048void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4049{
4050 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4051 compute_brk_m(vd, vn, vg, oprsz, false);
4052}
4053
4054uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4055{
4056 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4057 return compute_brks_m(vd, vn, vg, oprsz, false);
4058}
4059
4060void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4061{
4062 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4063 if (!last_active_pred(vn, vg, oprsz)) {
4064 do_zero(vd, oprsz);
4065 }
4066}
4067
4068
/*
 * Compute the NZCV predicate-test flags for predicate D tested
 * against an implicitly all-true governing predicate, masked
 * per-element by ESZ_MASK, over OPRSZ bytes.
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    /* Partial final word: i retains oprsz / 8 from the loop above. */
    if (oprsz & 7) {
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
4084
4085uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4086{
4087 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4088 if (last_active_pred(vn, vg, oprsz)) {
4089 return predtest_ones(vd, oprsz, -1);
4090 } else {
4091 return do_zero(vd, oprsz);
4092 }
4093}
4094
4095uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4096{
4097 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4098 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4099 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4100 intptr_t i;
4101
4102 for (i = 0; i < words; ++i) {
4103 uint64_t t = n[i] & g[i] & mask;
4104 sum += ctpop64(t);
4105 }
4106 return sum;
4107}
4108
/*
 * WHILE (incrementing form): make the low COUNT predicate bits of D
 * active (masked by the element-size pattern) and the rest inactive,
 * then return the predtest flags for the result.
 */
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits.  */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        /* Partial final word.  */
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4134
/*
 * WHILE (decrementing form): make the high COUNT predicate bits of D
 * active (masked by the element-size pattern) and the rest inactive,
 * then return the predtest flags for the result.  The register is
 * filled from the top word downward; INVCOUNT is the number of low
 * bits that remain clear.
 */
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    /* Mask for the (possibly partial) top word of the predicate.  */
    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    /* Boundary word: clear the low invcount % 64 bits.  */
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    /* Everything below the boundary word is inactive.  */
    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4170
4171
4172
4173
4174
4175
4176
4177
4178
/*
 * Recursive reduction on a function; cf. the Arm ARM pseudo-function
 * ReducePredicated.  Inactive elements are replaced by the operation's
 * identity in a temporary DATA array, which is then reduced pairwise
 * so the association order matches the architected tree reduction.
 * MAXSZ (simd_data) is the power-of-two padded size of the reduction.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}

DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* Identity for min is +inf, for max is -inf.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4231
/*
 * FADDA (half): strictly-ordered left-to-right accumulation of the
 * active float16 elements of VM into the scalar NN.  The predicate is
 * read 16 bits at a time; each bit governs one vector byte, so PG is
 * shifted by sizeof(element) per element.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
4251
4252uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4253 void *status, uint32_t desc)
4254{
4255 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4256 float32 result = nn;
4257
4258 do {
4259 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4260 do {
4261 if (pg & 1) {
4262 float32 mm = *(float32 *)(vm + H1_2(i));
4263 result = float32_add(result, mm, status);
4264 }
4265 i += sizeof(float32), pg >>= sizeof(float32);
4266 } while (i & 15);
4267 } while (i < opr_sz);
4268
4269 return result;
4270}
4271
4272uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4273 void *status, uint32_t desc)
4274{
4275 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4276 uint64_t *m = vm;
4277 uint8_t *pg = vg;
4278
4279 for (i = 0; i < opr_sz; i++) {
4280 if (pg[H1(i)] & 1) {
4281 nn = float64_add(nn, m[i], status);
4282 }
4283 }
4284
4285 return nn;
4286}
4287
4288
4289
4290
/*
 * Fully general two-operand FP expander, controlled by a predicate,
 * with a float_status parameter.  Elements are processed from the top
 * of the vector downward, one 64-bit predicate word at a time;
 * inactive elements leave VD unchanged (merging).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4309
/* Instantiate predicated element-wise FP arithmetic for f16/f32/f64.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

/* The *num forms propagate numbers over quiet NaNs (IEEE minNum/maxNum).  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4341
4342static inline float16 abd_h(float16 a, float16 b, float_status *s)
4343{
4344 return float16_abs(float16_sub(a, b, s));
4345}
4346
4347static inline float32 abd_s(float32 a, float32 b, float_status *s)
4348{
4349 return float32_abs(float32_sub(a, b, s));
4350}
4351
4352static inline float64 abd_d(float64 a, float64 b, float_status *s)
4353{
4354 return float64_abs(float64_sub(a, b, s));
4355}
4356
/* FABD: absolute difference of the two operands.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4360
4361static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4362{
4363 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4364 return float64_scalbn(a, b_int, s);
4365}
4366
/* FSCALE: n * 2**m, with the 64-bit shift count clamped for f64.  */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* FMULX: multiply, with 0 * inf == 2.0 (AdvSIMD/VFP extended multiply).  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4376
4377
4378
4379
/*
 * Predicated FP binary op with a scalar (immediate) second operand.
 * Same top-down, merging iteration as DO_ZPZZ_FP.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    TYPE mm = scalar;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4398
/* Predicated FP arithmetic with an immediate operand.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4410
4411static inline float16 subr_h(float16 a, float16 b, float_status *s)
4412{
4413 return float16_sub(b, a, s);
4414}
4415
4416static inline float32 subr_s(float32 a, float32 b, float_status *s)
4417{
4418 return float32_sub(b, a, s);
4419}
4420
4421static inline float64 subr_d(float64 a, float64 b, float_status *s)
4422{
4423 return float64_sub(b, a, s);
4424}
4425
/* FSUBR: reversed subtract (immediate - vector element).  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

/* Immediate min/max, in both NaN-propagating and *num flavours.  */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4445
4446
4447
4448
/*
 * Fully general one-operand FP expander, controlled by a predicate,
 * with a float_status parameter.  Same top-down, merging iteration as
 * DO_ZPZZ_FP.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4465
4466
4467
4468
4469
/*
 * Widen f16 to f32 with input flushing (FZ16) temporarily disabled,
 * so that denormal half-precision inputs convert exactly.  The second
 * argument selects IEEE half-precision format (not AHP).
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4480
/*
 * Widen f16 to f64 with input flushing (FZ16) temporarily disabled;
 * see sve_f16_to_f32 above.
 */
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4491
/*
 * Narrow f32 to f16 with output flushing temporarily disabled, so that
 * results denormal in half precision are not flushed to zero.
 */
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4502
/*
 * Narrow f64 to f16 with output flushing temporarily disabled;
 * see sve_f32_to_f16 above.
 */
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4513
4514static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4515{
4516 if (float16_is_any_nan(f)) {
4517 float_raise(float_flag_invalid, s);
4518 return 0;
4519 }
4520 return float16_to_int16_round_to_zero(f, s);
4521}
4522
4523static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4524{
4525 if (float16_is_any_nan(f)) {
4526 float_raise(float_flag_invalid, s);
4527 return 0;
4528 }
4529 return float16_to_int64_round_to_zero(f, s);
4530}
4531
4532static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4533{
4534 if (float32_is_any_nan(f)) {
4535 float_raise(float_flag_invalid, s);
4536 return 0;
4537 }
4538 return float32_to_int64_round_to_zero(f, s);
4539}
4540
4541static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4542{
4543 if (float64_is_any_nan(f)) {
4544 float_raise(float_flag_invalid, s);
4545 return 0;
4546 }
4547 return float64_to_int64_round_to_zero(f, s);
4548}
4549
4550static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4551{
4552 if (float16_is_any_nan(f)) {
4553 float_raise(float_flag_invalid, s);
4554 return 0;
4555 }
4556 return float16_to_uint16_round_to_zero(f, s);
4557}
4558
4559static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4560{
4561 if (float16_is_any_nan(f)) {
4562 float_raise(float_flag_invalid, s);
4563 return 0;
4564 }
4565 return float16_to_uint64_round_to_zero(f, s);
4566}
4567
4568static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4569{
4570 if (float32_is_any_nan(f)) {
4571 float_raise(float_flag_invalid, s);
4572 return 0;
4573 }
4574 return float32_to_uint64_round_to_zero(f, s);
4575}
4576
4577static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4578{
4579 if (float64_is_any_nan(f)) {
4580 float_raise(float_flag_invalid, s);
4581 return 0;
4582 }
4583 return float64_to_uint64_round_to_zero(f, s);
4584}
4585
/* FCVT: FP precision conversions (BFCVT narrows f32 to bfloat16).  */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FCVTZS: FP -> signed integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FCVTZU: FP -> unsigned integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* FRINT*: round to integral-valued FP.  */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX: reciprocal exponent estimate.  */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* SCVTF: signed integer -> FP.  */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

/* UCVTF: unsigned integer -> FP.  */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4641
/*
 * FLOGB for f16: return floor(log2(|a|)) as an integer in the element.
 * Infinity -> INT16_MAX; zero and NaN -> INT16_MIN with Invalid raised.
 */
static int16_t do_float16_logb_as_int(float16 a, float_status *s)
{
    /* Extract the fraction field to the top of the uint32_t.  */
    uint32_t frac = (uint32_t)a << (16 + 6);
    int16_t exp = extract32(a, 10, 5);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -15 - clz32(frac);
            }
            /* flush to zero; falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x1f)) {
        if (frac == 0) {
            return INT16_MAX;   /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 15;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT16_MIN;
}
4669
/*
 * FLOGB for f32: return floor(log2(|a|)) as an integer in the element.
 * Infinity -> INT32_MAX; zero and NaN -> INT32_MIN with Invalid raised.
 */
static int32_t do_float32_logb_as_int(float32 a, float_status *s)
{
    /* Extract the fraction field to the top of the uint32_t.  */
    uint32_t frac = a << 9;
    int32_t exp = extract32(a, 23, 8);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -127 - clz32(frac);
            }
            /* flush to zero; falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0xff)) {
        if (frac == 0) {
            return INT32_MAX;   /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 127;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT32_MIN;
}
4697
/*
 * FLOGB for f64: return floor(log2(|a|)) as an integer in the element.
 * Infinity -> INT64_MAX; zero and NaN -> INT64_MIN with Invalid raised.
 */
static int64_t do_float64_logb_as_int(float64 a, float_status *s)
{
    /* Extract the fraction field to the top of the uint64_t.  */
    uint64_t frac = a << 12;
    int64_t exp = extract64(a, 52, 11);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: bias - fractional_zeros */
                return -1023 - clz64(frac);
            }
            /* flush to zero; falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x7ff)) {
        if (frac == 0) {
            return INT64_MAX;   /* infinity */
        }
    } else {
        /* normal: exp - bias */
        return exp - 1023;
    }
    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT64_MIN;
}
4725
/* FLOGB: base-2 logarithm of the exponent, as a signed integer.  */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

#undef DO_ZPZ_FP
4731
/*
 * Predicated fused multiply-add for f16 elements:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * NEG1/NEG3 are 0 or the sign bit, selecting FMLA/FMLS/FNMLA/FNMLS by
 * flipping the signs of the multiplicand and/or addend.  Elements are
 * processed from the top down, 64 predicate bits at a time; inactive
 * elements leave VD unchanged.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4755
/* FMLA: d = a + n * m */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m (negate the multiplicand) */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: d = -a - n * m (negate multiplicand and addend) */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: d = -a + n * m (negate the addend) */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
4779
/*
 * Predicated fused multiply-add for f32 elements:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * See do_fmla_zpzzz_h above for the sign-mask convention.
 */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4803
/* FMLA: d = a + n * m */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m (negate the multiplicand) */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: d = -a - n * m (negate multiplicand and addend) */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: d = -a + n * m (negate the addend) */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
4827
/*
 * Predicated fused multiply-add for f64 elements:
 *   d = (n ^ neg1) * m + (a ^ neg3)
 * See do_fmla_zpzzz_h above for the sign-mask convention.  64-bit
 * elements need no host-endian offset adjustment.
 */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4851
/* FMLA: d = a + n * m */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m (INT64_MIN is the f64 sign bit) */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: d = -a - n * m */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: d = -a + n * m */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
4875
4876
4877
4878
4879
4880
/*
 * Two-operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to compare inactive
 * elements optimistically, since the comparison may have side effects
 * on the FP status.  Result bits for one 64-bit predicate word are
 * accumulated in OUT and stored as a unit.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4900
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * The conditions are built from softfloat compares: signaling compares
 * for the ordered relations and the absolute-value ("AC") forms, quiet
 * compares for EQ/NE/UO.  Note the operands are swapped so that e.g.
 * GE(X, Y) is expressed as compare(Y, X) <= 0.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4939
4940
4941
4942
/*
 * One-operand floating-point comparison against zero, controlled by
 * a predicate.  Same accumulation scheme as DO_FPCMP_PPZZ.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg,                         \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if ((pg >> (i & 63)) & 1) {                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                out |= OP(TYPE, nn, 0, status);                         \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
4961
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* FCM<cond> (zero): compare each active element against 0.0.  */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4980
4981
4982
/*
 * FTMAD (half): d = n * |m| + coeff[x], where X (simd_data) selects
 * one of 8 series coefficients and the second half of the table is
 * used when M is negative.  The constants appear to be the FPTrigMAdd
 * coefficient table from the Arm ARM (sin series in the first row,
 * cos in the second) -- confirm against the pseudocode.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5002
/*
 * FTMAD (single): d = n * |m| + coeff[x]; see sve_ftmad_h for the
 * table-selection scheme.
 */
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5024
/*
 * FTMAD (double): d = n * |m| + coeff[x]; see sve_ftmad_h for the
 * table-selection scheme.
 */
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5050
5051
5052
5053
5054
/*
 * FCADD (half): complex add with rotation.  Each even/odd element pair
 * is (real, imag); simd_data selects the rotation, which determines
 * which of the pair has its VM operand negated (NEG_REAL is applied to
 * the operand added into the real lane, NEG_IMAG to the imaginary).
 * Each half of the pair is predicated independently.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5086
5087void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5088 void *vs, uint32_t desc)
5089{
5090 intptr_t j, i = simd_oprsz(desc);
5091 uint64_t *g = vg;
5092 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5093 float32 neg_real = float32_chs(neg_imag);
5094
5095 do {
5096 uint64_t pg = g[(i - 1) >> 6];
5097 do {
5098 float32 e0, e1, e2, e3;
5099
5100
5101 j = i - sizeof(float32);
5102 i -= 2 * sizeof(float32);
5103
5104 e0 = *(float32 *)(vn + H1_2(i));
5105 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5106 e2 = *(float32 *)(vn + H1_2(j));
5107 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5108
5109 if (likely((pg >> (i & 63)) & 1)) {
5110 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5111 }
5112 if (likely((pg >> (j & 63)) & 1)) {
5113 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5114 }
5115 } while (i & 63);
5116 } while (i != 0);
5117}
5118
/*
 * FP complex add, double precision.  The vector is treated as pairs of
 * (real, imaginary) elements; d = n + m with m rotated by 90 or 270
 * degrees per the one-bit rotation in simd_data(desc).  vg is the
 * governing predicate; vs is the float_status for FP exceptions.
 */
void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    /* Walk the vector backward, one 64-bit predicate word at a time. */
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    /* The rotation is a sign flip applied to one of the m inputs. */
    float64 neg_imag = float64_set_sign(0, simd_data(desc));
    float64 neg_real = float64_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e0, e1, e2, e3;

            /* i indexes the real element of the pair, j the imaginary. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            e0 = *(float64 *)(vn + H1_2(i));
            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float64 *)(vn + H1_2(j));
            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;

            /* Each half of the pair is predicated independently. */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5150
5151
5152
5153
5154
/*
 * FP complex multiply-accumulate, half precision: d = a + n * m with
 * the complex product rotated by rot * 90 degrees (rot is the 2-bit
 * value in simd_data(desc)).  The rotation is realized by choosing
 * which n element feeds both multiplies (flip) and by flipping the
 * sign bit of the m inputs (neg_real/neg_imag).
 */
void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    /* Walk the vector backward, one 64-bit predicate word at a time. */
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    /* Sign of the imaginary product is flipped for rot 2 and 3;
       sign of the real product for rot 1 and 2.  */
    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i indexes the real element of the pair, j the imaginary. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            /* e4 == e2: the same n element feeds both products. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each half of the pair is predicated independently. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5199
5200void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5201 void *vg, void *status, uint32_t desc)
5202{
5203 intptr_t j, i = simd_oprsz(desc);
5204 unsigned rot = simd_data(desc);
5205 bool flip = rot & 1;
5206 float32 neg_imag, neg_real;
5207 uint64_t *g = vg;
5208
5209 neg_imag = float32_set_sign(0, (rot & 2) != 0);
5210 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5211
5212 do {
5213 uint64_t pg = g[(i - 1) >> 6];
5214 do {
5215 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5216
5217
5218 j = i - sizeof(float32);
5219 i -= 2 * sizeof(float32);
5220
5221 nr = *(float32 *)(vn + H1_2(i));
5222 ni = *(float32 *)(vn + H1_2(j));
5223 mr = *(float32 *)(vm + H1_2(i));
5224 mi = *(float32 *)(vm + H1_2(j));
5225
5226 e2 = (flip ? ni : nr);
5227 e1 = (flip ? mi : mr) ^ neg_real;
5228 e4 = e2;
5229 e3 = (flip ? mr : mi) ^ neg_imag;
5230
5231 if (likely((pg >> (i & 63)) & 1)) {
5232 d = *(float32 *)(va + H1_2(i));
5233 d = float32_muladd(e2, e1, d, 0, status);
5234 *(float32 *)(vd + H1_2(i)) = d;
5235 }
5236 if (likely((pg >> (j & 63)) & 1)) {
5237 d = *(float32 *)(va + H1_2(j));
5238 d = float32_muladd(e4, e3, d, 0, status);
5239 *(float32 *)(vd + H1_2(j)) = d;
5240 }
5241 } while (i & 63);
5242 } while (i != 0);
5243}
5244
/*
 * FP complex multiply-accumulate, double precision: d = a + n * m with
 * the complex product rotated by rot * 90 degrees (rot is the 2-bit
 * value in simd_data(desc)).  The rotation is realized by choosing
 * which n element feeds both multiplies (flip) and by flipping the
 * sign bit of the m inputs (neg_real/neg_imag).
 */
void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    /* Walk the vector backward, one 64-bit predicate word at a time. */
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float64 neg_imag, neg_real;
    uint64_t *g = vg;

    /* Sign of the imaginary product is flipped for rot 2 and 3;
       sign of the real product for rot 1 and 2.  */
    neg_imag = float64_set_sign(0, (rot & 2) != 0);
    neg_real = float64_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i indexes the real element of the pair, j the imaginary. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            nr = *(float64 *)(vn + H1_2(i));
            ni = *(float64 *)(vn + H1_2(j));
            mr = *(float64 *)(vm + H1_2(i));
            mi = *(float64 *)(vm + H1_2(j));

            /* e4 == e2: the same n element feeds both products. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each half of the pair is predicated independently. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(i));
                d = float64_muladd(e2, e1, d, 0, status);
                *(float64 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(j));
                d = float64_muladd(e4, e3, d, 0, status);
                *(float64 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5300 intptr_t reg_max, int esz)
5301{
5302 uint64_t pg_mask = pred_esz_masks[esz];
5303 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5304
5305
5306 if (likely(pg & 1)) {
5307 return reg_off;
5308 }
5309
5310 if (pg == 0) {
5311 reg_off &= -64;
5312 do {
5313 reg_off += 64;
5314 if (unlikely(reg_off >= reg_max)) {
5315
5316 return reg_max;
5317 }
5318 pg = vg[reg_off >> 6] & pg_mask;
5319 } while (pg == 0);
5320 }
5321 reg_off += ctz64(pg);
5322
5323
5324 tcg_debug_assert(reg_off < reg_max);
5325 return reg_off;
5326}
5327
5328
5329
5330
5331
5332
5333
/*
 * Resolve the guest virtual address @addr + @mem_off to info->host and
 * fill in info->flags and info->attrs.  If @nofault, return false when
 * the page is invalid; otherwise an invalid page exits via a page-fault
 * exception from probe_access_flags (unwound with @retaddr).
 */
bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
                    target_ulong addr, int mem_off, MMUAccessType access_type,
                    int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /* Clean any tag/top-byte bits from the address before probing. */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* Only a nofault probe may return with an invalid page. */
        g_assert(nofault);
        return false;
    }

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
    /* The page is MTE-tagged only if anonymous and mapped with MTE. */
    arm_tlb_mte_tagged(&info->attrs) =
        (flags & PAGE_ANON) && (flags & PAGE_MTE);
#else
    /*
     * Recover the memory attributes from the iotlb entry for this page.
     * The probe above guarantees the softmmu TLB entry is present.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        /* Verify that the TLB entry we are about to use matches. */
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
5394
5395
5396
5397
5398
5399
/*
 * Analyse a contiguous load/store governed by predicate @vg: record the
 * first/last active elements, and whether and where the access crosses
 * a page boundary, into @info.  @esz is log2 of the element size,
 * @msize the per-element memory footprint in bytes.  Returns false if
 * no element is active (no memory access will occur).
 */
bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
                            intptr_t reg_max, int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the page[] info to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* Number of bytes from addr to the end of its page. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no full element on the first
     * page, reg_off_last[0] retains its initial -1.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page crossing element is last; no second page. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * this may be different from the first active element on the first
     * page.  There must be one: reg_off_last was bounds-checked above.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
5485
5486
5487
5488
5489
5490
5491bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5492 CPUARMState *env, target_ulong addr,
5493 MMUAccessType access_type, uintptr_t retaddr)
5494{
5495 int mmu_idx = cpu_mmu_index(env, false);
5496 int mem_off = info->mem_off_first[0];
5497 bool nofault = fault == FAULT_NO;
5498 bool have_work = true;
5499
5500 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5501 access_type, mmu_idx, retaddr)) {
5502
5503 return false;
5504 }
5505
5506 if (likely(info->page_split < 0)) {
5507
5508 return true;
5509 }
5510
5511
5512
5513
5514
5515 if (info->mem_off_split >= 0) {
5516
5517
5518
5519
5520 mem_off = info->page_split;
5521
5522
5523
5524
5525
5526
5527 if (info->mem_off_first[0] < info->mem_off_split) {
5528 nofault = FAULT_FIRST;
5529 have_work = false;
5530 }
5531 } else {
5532
5533
5534
5535
5536 mem_off = info->mem_off_first[1];
5537
5538
5539
5540
5541 nofault = fault != FAULT_ALL;
5542 }
5543
5544 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5545 access_type, mmu_idx, retaddr);
5546 return have_work;
5547}
5548
5549#ifndef CONFIG_USER_ONLY
/*
 * Trigger watchpoints for each active element of a contiguous
 * load/store whose pages have TLB_WATCHPOINT set.  @esize/@msize are
 * the element/memory step sizes in bytes, @wp_access is BP_MEM_READ or
 * BP_MEM_WRITE, and @retaddr unwinds if a watchpoint fires.
 */
void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                               uint64_t *vg, target_ulong addr,
                               int esize, int msize, int wp_access,
                               uintptr_t retaddr)
{
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    /* Elements entirely within the first page. */
    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* An active element split across the two pages, if any. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    /* Elements entirely within the second page. */
    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5611#endif
5612
5613void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5614 uint64_t *vg, target_ulong addr, int esize,
5615 int msize, uint32_t mtedesc, uintptr_t ra)
5616{
5617 intptr_t mem_off, reg_off, reg_last;
5618
5619
5620 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5621 mem_off = info->mem_off_first[0];
5622 reg_off = info->reg_off_first[0];
5623 reg_last = info->reg_off_split;
5624 if (reg_last < 0) {
5625 reg_last = info->reg_off_last[0];
5626 }
5627
5628 do {
5629 uint64_t pg = vg[reg_off >> 6];
5630 do {
5631 if ((pg >> (reg_off & 63)) & 1) {
5632 mte_check(env, mtedesc, addr, ra);
5633 }
5634 reg_off += esize;
5635 mem_off += msize;
5636 } while (reg_off <= reg_last && (reg_off & 63));
5637 } while (reg_off <= reg_last);
5638 }
5639
5640 mem_off = info->mem_off_first[1];
5641 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5642 reg_off = info->reg_off_first[1];
5643 reg_last = info->reg_off_last[1];
5644
5645 do {
5646 uint64_t pg = vg[reg_off >> 6];
5647 do {
5648 if ((pg >> (reg_off & 63)) & 1) {
5649 mte_check(env, mtedesc, addr, ra);
5650 }
5651 reg_off += esize;
5652 mem_off += msize;
5653 } while (reg_off & 63);
5654 } while (reg_off <= reg_last);
5655 }
5656}
5657
5658
5659
5660
/*
 * Common helper for all contiguous 1,2,3,4-register predicated
 * stride-N loads of 1 << msz byte memory elements into 1 << esz byte
 * vector elements of registers rd .. rd+N-1.  All faults are taken
 * (FAULT_ALL semantics); @mtedesc is nonzero when MTE checking is
 * enabled.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs.  */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /* Handle mte checks for all active elements. */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * Slow path: at least one page requires going through the
         * full cputlb access path (e.g. MMIO).  Any such access may
         * raise an exception mid-operation, so load into scratch
         * memory to preserve the destination registers until the
         * entire operation has succeeded.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /*
     * Fast path: direct host memory access.  Clear the destinations
     * first so that inactive elements read back as zero.
     */
    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    /* Elements entirely within the first page. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element split across the two pages, if any, goes through the
     * full load path, since the single element may itself cross the
     * page boundary.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements entirely within the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5803
5804static inline QEMU_ALWAYS_INLINE
5805void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5806 uint32_t desc, const uintptr_t ra,
5807 const int esz, const int msz, const int N,
5808 sve_ldst1_host_fn *host_fn,
5809 sve_ldst1_tlb_fn *tlb_fn)
5810{
5811 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5812 int bit55 = extract64(addr, 55, 1);
5813
5814
5815 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5816
5817
5818 if (!tbi_check(desc, bit55) ||
5819 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5820 mtedesc = 0;
5821 }
5822
5823 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5824}
5825
/*
 * Single-register contiguous loads of one-byte memory elements,
 * widening to vector elements of size 1 << ESZ, in plain and
 * MTE-checked forms.
 */
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
                            target_ulong addr, uint32_t desc)  \
{                                                              \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,   \
              sve_##NAME##_host, sve_##NAME##_tlb);            \
}                                                              \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,    \
                                target_ulong addr, uint32_t desc) \
{                                                              \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,  \
                  sve_##NAME##_host, sve_##NAME##_tlb);        \
}

/*
 * Single-register contiguous loads of multi-byte memory elements
 * (1 << MSZ bytes), widening to vector elements of size 1 << ESZ,
 * in little-/big-endian and plain/MTE-checked forms.
 */
#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);     \
}                                                                 \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);     \
}

/* Byte memory elements, zero- (u) or sign- (s) extended. */
DO_LD1_1(ld1bb, MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

/* Halfword memory elements. */
DO_LD1_2(ld1hh, MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

/* Word memory elements. */
DO_LD1_2(ld1ss, MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

/* Doubleword memory elements. */
DO_LD1_2(ld1dd, MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
5888
/*
 * N-register interleaved contiguous loads (LD2/LD3/LD4) of byte
 * elements, in plain and MTE-checked forms.
 */
#define DO_LDN_1(N) \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,         \
                             target_ulong addr, uint32_t desc)   \
{                                                                \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,    \
              sve_ld1bb_host, sve_ld1bb_tlb);                    \
}                                                                \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,     \
                                 target_ulong addr, uint32_t desc) \
{                                                                \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,   \
                  sve_ld1bb_host, sve_ld1bb_tlb);                \
}

/*
 * N-register interleaved contiguous loads of 1 << ESZ byte elements,
 * in little-/big-endian and plain/MTE-checked forms.  Memory and
 * register element sizes are equal for these.
 */
#define DO_LDN_2(N, SUFF, ESZ) \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);   \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);   \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5967{
5968 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5969
5970 if (i & 63) {
5971 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5972 i = ROUND_UP(i, 64);
5973 }
5974 for (; i < oprsz; i += 64) {
5975 ffr[i / 64] = 0;
5976 }
5977}
5978
5979
5980
5981
/*
 * Common helper for all contiguous no-fault (FAULT_NO) and first-fault
 * (FAULT_FIRST) loads.  For FAULT_FIRST, only a fault on the first
 * active element is taken; any later failure instead terminates the
 * load and clears the corresponding FFR bits via record_fault().
 * For FAULT_NO, no faults are ever taken.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element: only possible with FAULT_NO. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /* Skip MTE checking when the first page is not Tagged. */
    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping MTE check for the first-fault element.  */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element: it may trap
         * (MMIO/watchpoint) or span the page boundary; take the full
         * cputlb path for either case.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /* Any fault here is a true first fault. */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements of vd. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses the page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO: see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit: see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }

            /*
             * Both pages are valid and no watchpoints or MTE faults:
             * the load of the split element cannot trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on each access has MemSingleNF semantics: it is
     * allowed to fail for any reason rather than trap.  We cannot
     * perform MMIO accesses without trapping, so decline to handle
     * them and record a fault instead; likewise decline on a read
     * watchpoint or a failed MTE probe for any remaining element.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * A non-first active element that spans the page boundary:
     * decline to handle it (MemSingleNF may fail for any reason).
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * As an implementation choice, decline to handle active elements
     * on the second page: record them as faulted in FFR instead.
     * This should be low frequency.
     */
 do_fault:
    record_fault(env, reg_off, reg_max);
}
6159
6160static inline QEMU_ALWAYS_INLINE
6161void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6162 uint32_t desc, const uintptr_t retaddr,
6163 const int esz, const int msz, const SVEContFault fault,
6164 sve_ldst1_host_fn *host_fn,
6165 sve_ldst1_tlb_fn *tlb_fn)
6166{
6167 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6168 int bit55 = extract64(addr, 55, 1);
6169
6170
6171 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6172
6173
6174 if (!tbi_check(desc, bit55) ||
6175 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6176 mtedesc = 0;
6177 }
6178
6179 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6180 esz, msz, fault, host_fn, tlb_fn);
6181}
6182
/*
 * First-fault (ldff1) and no-fault (ldnf1) loads of one-byte memory
 * elements, widening to 1 << ESZ byte vector elements, in plain and
 * MTE-checked forms.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}

/*
 * First-fault and no-fault loads of multi-byte (1 << MSZ) memory
 * elements, widening to 1 << ESZ byte vector elements, in little-/
 * big-endian and plain/MTE-checked forms.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}

/* Byte memory elements, zero- (u) or sign- (s) extended. */
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

/* Halfword memory elements. */
DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

/* Word memory elements. */
DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

/* Doubleword memory elements. */
DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6281
6282
6283
6284
6285
/*
 * Common helper for all contiguous 1,2,3,4-register predicated
 * stride-N stores of 1 << msz byte memory elements from 1 << esz byte
 * vector elements of registers rd .. rd+N-1.  All faults are taken;
 * @mtedesc is nonzero when MTE checking is enabled.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /* Handle mte checks for all active elements. */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * Slow path: at least one page requires the full cputlb
         * access path (e.g. MMIO).  Note that unlike the load case
         * there is no register state to preserve, so each element is
         * simply stored via tlb_fn.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: direct host memory access. */
    /* Elements entirely within the first page. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element split across the two pages, if any, goes through the
     * full store path, since the single element may itself cross the
     * page boundary.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements entirely within the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
6413
/*
 * MTE-checked entry point for contiguous stores of N registers.
 * Splits the MTE descriptor out of the combined helper descriptor,
 * and suppresses MTE checking entirely when the address cannot be
 * checked (TBI disabled) or is unchecked (TCMA match).
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early: mtedesc == 0 disables checks. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
6435
/*
 * Expand helpers for contiguous (interleaved) stores of N registers,
 * ST1..ST4.  DO_STN_1 is for byte-sized memory elements (no endianness
 * distinction); DO_STN_2 is for wider memory elements, in both little-
 * and big-endian forms.  Each expansion provides a plain and an
 * MTE-checked (_mte) entry point.
 */
#define DO_STN_1(N, NAME, ESZ) \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
} \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
}

#define DO_STN_2(N, NAME, ESZ, MSZ) \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
} \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
}

/* Byte memory elements: first letter is the register element size. */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

/* Halfword memory elements. */
DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

/* Word memory elements. */
DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

/* Doubleword memory elements. */
DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
6504
/*
 * Loads/stores with a vector index: extract the memory offset for one
 * element from the index vector at byte offset reg_ofs.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

/* 32-bit elements, index zero-extended from 32 bits. */
static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

/* 32-bit elements, index sign-extended from 32 bits. */
static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

/* 64-bit elements, low 32 bits of the index, zero-extended. */
static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

/* 64-bit elements, low 32 bits of the index, sign-extended. */
static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

/* 64-bit elements, full 64-bit index. */
static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}
6538
/*
 * Common helper for all gather loads (LD1 with a vector index).
 * Each active element's address is base + (index << scale), taken
 * from the index vector vm via off_fn.  All loads go to a scratch
 * register so that a fault part-way through leaves vd unmodified.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /* Inactive elements read as zero. */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    if (unlikely(info.flags & TLB_MMIO)) {
                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
                    } else {
                        host_fn(&scratch, reg_off, info.host);
                    }
                } else {
                    /* Element crosses the page boundary: probe the
                       second page too, then use the slow path. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            /* One predicate bit per byte; the element's bit is the
               lowest of its esize-byte group, so shift by esize. */
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
6602
/* MTE-checked entry point for gather loads. */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): unlike sve_stN_r_mte(), no early TBI/TCMA
     * suppression is done here; every element address is checked
     * individually via mte_check() inside sve_ld1_z().  With
     * per-element vector addresses there is no single bit55/tag to
     * examine up front -- presumably intentional; confirm against
     * the contiguous-access variants.
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
6623
/*
 * Expand gather-load helpers.  MEM names the memory element (size,
 * sign extension, endianness); OFS names the index form (zsu:
 * zero-extended 32-bit, zss: sign-extended 32-bit, zd: full 64-bit).
 * _S forms use 4-byte register elements, _D forms 8-byte elements.
 * Each expansion provides a plain and an MTE-checked entry point.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6718
/*
 * First-fault gather loads: only the first active element may take a
 * normal (faulting) access.  Any problem with a later element does not
 * fault; instead the fault is recorded via record_fault() and the
 * remaining elements are left unloaded.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first active predicate element. */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements of the destination. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements without allowing faults; on any
     * problem, stop loading and record the faulting element instead.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Treat a page-crossing element as faulting. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
6805
/* MTE-checked entry point for first-fault gather loads. */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): no early TBI/TCMA suppression here -- per-element
     * addresses are checked individually via mte_check()/mte_probe()
     * in sve_ldff1_z(); see the matching note on sve_ld1_z_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
6827
/*
 * Expand first-fault gather-load helpers; naming conventions as for
 * DO_LD1_ZPZ_S/D above.  Each expansion provides a plain and an
 * MTE-checked entry point.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6923
6924
6925
/*
 * Common helper for all scatter stores (ST1 with a vector index).
 * Two passes: first probe every active element so that all
 * recognizable exceptions are raised before any memory is modified,
 * then perform the stores.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host
                     * address, so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a
     * page boundary, we have stored the host address in host[].  This
     * doubles as a first-level check against the predicate, since only
     * enabled elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
7008
/* MTE-checked entry point for scatter stores. */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): no early TBI/TCMA suppression here -- per-element
     * addresses are checked individually via mte_check() inside
     * sve_st1_z(); see the matching note on sve_ld1_z_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
7029
/*
 * Expand scatter-store helpers; naming conventions as for the
 * gather loads above.  Each expansion provides a plain and an
 * MTE-checked entry point.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7096
7097void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7098{
7099 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7100 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7101
7102 for (i = 0; i < opr_sz; ++i) {
7103 d[i] = n[i] ^ m[i] ^ k[i];
7104 }
7105}
7106
7107void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7108{
7109 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7110 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7111
7112 for (i = 0; i < opr_sz; ++i) {
7113 d[i] = n[i] ^ (m[i] & ~k[i]);
7114 }
7115}
7116
7117void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7118{
7119 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7120 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7121
7122 for (i = 0; i < opr_sz; ++i) {
7123 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7124 }
7125}
7126
7127void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7128{
7129 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7130 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7131
7132 for (i = 0; i < opr_sz; ++i) {
7133 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7134 }
7135}
7136
7137void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7138{
7139 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7140 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7141
7142 for (i = 0; i < opr_sz; ++i) {
7143 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7144 }
7145}
7146
/*
 * Returns true if either m0 or m1 contains, in any lane, the low
 * uint8_t/uint16_t value passed in n.  Uses zero-in-word detection
 * (cf. "Determine if a word has a zero byte", Stanford Bit Twiddling
 * Hacks): after xor with replicated n, a matching lane is zero.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    /* Replicate n across all lanes; xor zeroes the matching lanes. */
    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    /* (x - 1) & ~x sets the lane's sign bit iff the lane was zero. */
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    /* Any surviving sign bit indicates a match somewhere. */
    return (cmp0 | cmp1) & signs;
}
7166
/*
 * Common routine for SVE2 MATCH/NMATCH: for each active element of
 * vn, set the corresponding predicate bit in vd if the element does
 * (or, for nmatch, does not) occur in the same 16-byte segment of vm.
 * Returns the NZCV flags from the predicate test of the result.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    /* Process one 16-byte (128-bit) segment at a time. */
    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    /* nmatch inverts the per-element result. */
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
7196
/* Expand MATCH (INV=false) and NMATCH (INV=true) helpers for b/h. */
#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7210
/*
 * SVE2 HISTCNT, 32-bit elements: for each active element i of vn,
 * count the active elements j <= i of vm equal to it; inactive
 * elements of vd are set to zero.
 */
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* If the destination overlaps an input, work from a copy. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        /* One predicate bit per byte; test the element's low bit. */
        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}
7247
/* SVE2 HISTCNT, 64-bit elements; see sve2_histcnt_s for semantics. */
void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    /* If the destination overlaps an input, work from a copy. */
    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
7279
/*
 * Count the number of bytes in m0 and m1 that are equal to the byte n.
 * Uses the zero-byte detection technique from the Stanford Bit
 * Twiddling Hacks collection ("Determine if a word has a zero byte").
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    /* After the xor, a matching byte of m0/m1 becomes zero. */
    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * Reduce each byte to a single bit:
     *   1: clear the msb of each byte to avoid carry between bytes,
     *   2: add 0x7f so any non-zero low bits carry into the msb,
     *   3: or in the original msb,
     *   4: or with 0x7f so the low bits are all ones,
     *   5: invert -- only a fully-zero byte yields 0x80.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two results so the set bits do not overlap (cmp1
     * shifted right by one uses bit 6 of each byte), preserving the
     * total population count; one ctpop64 then counts all matches
     * in both doublewords.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
7316
/*
 * SVE2 HISTSEG: within each 16-byte segment, for every byte of vn,
 * count how many bytes of the corresponding segment of vm are equal
 * to it, writing the count into the matching byte of vd.  Counts fit
 * in a byte (at most 16), so the shifted or-accumulation cannot
 * overflow between lanes.
 */
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        /* j walks the byte lanes of n0/n1, 8 bits at a time. */
        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
7341
7342void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7343{
7344 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7345 int shr = simd_data(desc);
7346 int shl = 8 - shr;
7347 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7348 uint64_t *d = vd, *n = vn, *m = vm;
7349
7350 for (i = 0; i < opr_sz; ++i) {
7351 uint64_t t = n[i] ^ m[i];
7352 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7353 }
7354}
7355
7356void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7357{
7358 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7359 int shr = simd_data(desc);
7360 int shl = 16 - shr;
7361 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7362 uint64_t *d = vd, *n = vn, *m = vm;
7363
7364 for (i = 0; i < opr_sz; ++i) {
7365 uint64_t t = n[i] ^ m[i];
7366 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7367 }
7368}
7369
7370void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7371{
7372 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7373 int shr = simd_data(desc);
7374 uint32_t *d = vd, *n = vn, *m = vm;
7375
7376 for (i = 0; i < opr_sz; ++i) {
7377 d[i] = ror32(n[i] ^ m[i], shr);
7378 }
7379}
7380
/*
 * FMMLA, single precision: for each group of four float32 elements,
 * treat n, m and a as 2x2 row-major matrices and compute d = a + n*m'.
 * The addition order (products summed first, then the addend) is part
 * of the defined behavior and must not be reassociated.
 */
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
7418
/*
 * FMMLA, double precision: as fmmla_s but on groups of four float64
 * elements; 64-bit elements need no H4() host-endian remapping.
 */
void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
7454
/*
 * Predicated FP convert-narrow (top): convert each active wide
 * element and write the narrow result into the upper half of the
 * corresponding wide slot of vd, iterating from the top of the
 * vector down.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEW nn = *(TYPEW *)(vn + HW(i)); \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7475
/*
 * Predicated FP convert-long (top): read each active narrow element
 * from the upper half of its wide slot in vn, widen it, and write the
 * wide result to the corresponding slot of vd; iterates from the top
 * of the vector down.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPEW); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
7498