1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27#include "tcg/tcg.h"
28#include "vec_internal.h"
29#include "sve_ldst_internal.h"
30
31
32
33
34
35
36
37
38
39
/*
 * Return a value for NZCV as per the Arm PredTest pseudofunction.
 *
 * The packed return value has bit 31 set if N is set, bit 1 set if
 * Z is clear, and bit 0 set if C is set.  PREDTEST_INIT therefore
 * means "Z set, C set" -- the state before any active element is seen.
 */
#define PREDTEST_INIT 1

/*
 * Iterative step of PredTest, called once for each (Pd, Pg) word
 * moving forward (from the lowest-numbered word upward).
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from the first (lowest) active element, d & (g & -g).
           Bit 2 of flags records that the first active bit has been seen,
           so N is only latched once across the whole iteration.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate !Z (bit 1) from each active element: any d & g.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C (bit 0) from the last (highest) active element being
           false; pow2floor(g) isolates the highest governing bit.  Each
           word replaces the previous value so the final word wins.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
63
64
65
66
/*
 * Iterative step of PredTest, called once for each (Pd, Pg) word
 * moving backward (from the highest-numbered word downward).
 * Same packed NZCV layout as iter_predtest_fwd.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from the first word seen, i.e. the LAST active
           element overall.  Bit 2 records that this has been done.
           On entry here bits 0..2 are exactly 0b001 (C from the init
           value, Z/N untouched while bit 2 is clear), so += 3 sets
           bit 2 and clears the provisional C in one step.  */
        if (!(flags & 4)) {
            flags += 4 - 1;
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate !Z (bit 1) from each active element.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from the last word seen, i.e. the FIRST active
           element overall; each word replaces the previous value.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
85
86
/* PredTest for a single predicate word.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* PredTest for a multi-word predicate; WORDS is at least 1.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
105
106
/*
 * Expand the relevant predicate bits for 4-byte elements into a pair
 * of 32-bit lane masks.  Only bits 0 and 4 of BYTE are significant;
 * all other bits are ignored.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t lo = (byte & 0x01) ? 0x00000000ffffffffull : 0;
    uint64_t hi = (byte & 0x10) ? 0xffffffff00000000ull : 0;
    return lo | hi;
}
116
/*
 * Expand a predicate-register logical operation: one 64-bit word at a
 * time, D = FUNC(N, M, G).  All masking by the governing predicate is
 * folded into FUNC itself.
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

/* Word-wise predicate operations.  All results are masked by G,
 * except SEL which selects N where G is set and M elsewhere.  */
#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
155
156
157
158
159
160
161
162
163
/*
 * Fully general three-operand expander, controlled by a predicate,
 * for element sizes of 1, 2 or 4 bytes.  The H macro maps a logical
 * byte offset to the host byte offset and must match sizeof(TYPE).
 * The predicate is read 16 bits at a time (16 vector bytes); each
 * element consumes sizeof(TYPE) predicate bits, of which only the
 * lowest is tested.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}
180
181
/*
 * Similarly, specialized for 64-bit elements: one predicate byte
 * governs each element, so the arrays can be indexed directly.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}
195
/* Element-wise operator bodies for the expanders above.  */
#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)

/*
 * Architectural division: division by zero yields 0, and the
 * overflowing INT_MIN / -1 case is diverted to negation, yielding
 * INT_MIN.  NOTE(review): -N for N == INT_MIN relies on wrapping
 * signed arithmetic (QEMU builds with -fwrapv) -- confirm build flags.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
217
/* Predicated bitwise, arithmetic, min/max and absolute-difference
 * helpers for all four element sizes.  The H macro always matches
 * the element width: H1 for bytes, H1_2 for halfwords, H1_4 for words.
 */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed variants use signed element types; unsigned use unsigned.  */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
277
278
279
/*
 * High half of an 8x8->16 multiply.  The operands have already been
 * extended (sign- or zero-) to int32 by the expander, so the product
 * cannot overflow; byte 1 of the product is the result.
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return prod >> 8;
}
284
/*
 * High half of a 16x16->32 multiply.  Widen to 64 bits before
 * multiplying: for the unsigned case the operands can be up to
 * 0xffff, and 0xffff * 0xffff overflows int32, which is undefined
 * behavior unless the build uses -fwrapv.  Bits 16..31 of the
 * product are identical either way, so results are unchanged.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return ((int64_t)n * m) >> 16;
}
289
/*
 * High half of a 32x32->64 multiply.  Compute in uint64_t: for the
 * unsigned case the product can exceed INT64_MAX, which would be
 * signed overflow (undefined behavior without -fwrapv).  Bits 32..63
 * of the wrapped product are the same in either representation, so
 * both signed and unsigned callers get the correct high half.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return ((uint64_t)n * (uint64_t)m) >> 32;
}
294
/* High half of a signed 64x64->128 multiply, via the host-utils
 * primitive muls64.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

/* High half of an unsigned 64x64->128 multiply.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
308
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/* The b/h/s mulh helpers are shared between signed and unsigned:
 * the expander's element TYPE performs the required extension.  */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* Division exists only for word and doubleword elements.  */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Shifts with out-of-range counts defined architecturally:
 * ASR clamps the count to bits-1 (preserving the sign fill);
 * LSR and LSL produce 0 once the count covers the element.  */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
335
336DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
337DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
338DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
339
340DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
341DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
342DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
343
344DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
345DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
346DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
347
348DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
349DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
350DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
351
/*
 * Signed add-long-pairwise step: sign-extend the two halves of N
 * and accumulate both into M.
 */
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    return m + (int8_t)n + (int8_t)(n >> 8);
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    return m + (int16_t)n + (int16_t)(n >> 16);
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    return m + (int32_t)n + (int32_t)(n >> 32);
}
369
/* SVE2 signed add-long-pairwise accumulate.  */
DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
373
/*
 * Unsigned add-long-pairwise step: zero-extend the two halves of N
 * and accumulate both into M.
 */
static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    return m + (uint8_t)n + (uint8_t)(n >> 8);
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    return m + (uint16_t)n + (uint16_t)(n >> 16);
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    return m + (uint32_t)n + (uint32_t)(n >> 32);
}
391
/* SVE2 unsigned add-long-pairwise accumulate.  */
DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

/* Rounding shifts (negative counts shift right), built on the
 * rounding/saturating primitives from vec_internal.h.  The NULL
 * final argument presumably disables saturation tracking --
 * confirm against the do_sqrshl_bhs declaration.  */
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

/* For the unsigned forms the shift count is still signed; the casts
 * on the b/h forms re-sign-extend the narrow count.  */
#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
415
416
417
418
419
420
421
422#define do_sqshl_b(n, m) \
423 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
424#define do_sqshl_h(n, m) \
425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
426#define do_sqshl_s(n, m) \
427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
428#define do_sqshl_d(n, m) \
429 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
430
431DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
432DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
433DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
434DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
435
436#define do_uqshl_b(n, m) \
437 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
438#define do_uqshl_h(n, m) \
439 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
440#define do_uqshl_s(n, m) \
441 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
442#define do_uqshl_d(n, m) \
443 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
444
445DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
446DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
447DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
448DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
449
450#define do_sqrshl_b(n, m) \
451 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
452#define do_sqrshl_h(n, m) \
453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
454#define do_sqrshl_s(n, m) \
455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
456#define do_sqrshl_d(n, m) \
457 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
458
459DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
460DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
461DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
462DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
463
464#undef do_sqrshl_d
465
466#define do_uqrshl_b(n, m) \
467 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
468#define do_uqrshl_h(n, m) \
469 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
470#define do_uqrshl_s(n, m) \
471 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
472#define do_uqrshl_d(n, m) \
473 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
474
475DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
476DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
477DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
478DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
479
480#undef do_uqrshl_d
481
/* Halving add: for b/h/s the sum is formed in int64 so it cannot
 * overflow before the shift.  For 64-bit elements, add the halves
 * and the carry-in produced when both low bits are set.  */
#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

/* Rounding halving add: +1 before the shift; for 64-bit elements
 * the rounding carry is set if either low bit is set.  */
#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

/* Halving subtract; the 64-bit form borrows when the subtrahend's
 * low bit is set and the minuend's is clear.  */
#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
520
/*
 * Saturate VAL into [MIN, MAX].  Used for element widths below 64
 * bits, where the unclamped value always fits in int64.
 */
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    if (val >= max) {
        return max;
    }
    if (val <= min) {
        return min;
    }
    return val;
}
525
/* Saturating signed add for sub-64-bit elements, via int64 math.  */
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
529
/*
 * Saturating signed 64-bit add.  The addition is performed in
 * unsigned arithmetic so that wraparound is well-defined even
 * without -fwrapv; the two's-complement bit pattern is identical.
 * Overflow occurred iff the operands have the same sign and the
 * result's sign differs, in which case saturate away from the
 * result's (wrapped) sign.
 */
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n + (uint64_t)m);
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
539
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

/* Saturating unsigned add for sub-64-bit elements.  */
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
548
/* Saturating unsigned 64-bit add: a wrapped sum is smaller than
 * either operand, in which case saturate to UINT64_MAX.  */
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t sum = n + m;
    if (sum < n) {
        return UINT64_MAX;
    }
    return sum;
}
554
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

/* Saturating signed subtract for sub-64-bit elements.  */
#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
563
/*
 * Saturating signed 64-bit subtract.  As for do_sqadd_d, the raw
 * subtraction is done in unsigned arithmetic so wraparound is
 * well-defined without -fwrapv.  Overflow occurred iff the operands
 * have different signs and the result's sign differs from N's.
 */
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = (int64_t)((uint64_t)n - (uint64_t)m);
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
573
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

/* Saturating unsigned subtract for sub-64-bit elements.  */
#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
582
/* Saturating unsigned 64-bit subtract: clamp at zero on underflow.  */
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    if (n <= m) {
        return 0;
    }
    return n - m;
}
587
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

/* Signed saturating add of an unsigned value: N is re-sign-extended
 * from the element width (the expander passes unsigned TYPEs), then
 * the sum is saturated to the signed range.  */
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
599
/* Signed 64-bit N plus unsigned 64-bit M, saturated to int64.
 * The raw sum is computed in uint64 (well-defined wraparound).  */
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow.  */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative.
             * NOTE(review): -n here relies on wrapping negation for
             * n == INT64_MIN (-fwrapv) -- confirm build flags.  */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive.  */
                return INT64_MAX;
            }
            /* Otherwise the result is genuinely negative.  */
        }
    } else {
        /* Both addends non-negative: saturate on unsigned wrap or on
         * exceeding the signed maximum.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
622
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

/* Unsigned saturating add of a signed value: M is re-sign-extended
 * from the element width, then the sum is clamped to [0, UMAX].  */
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
634
/* Unsigned 64-bit N plus signed 64-bit M, saturated to uint64.  */
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        /* Negative addend: clamp at zero if it underflows N.
         * NOTE(review): -m relies on wrapping negation for
         * m == INT64_MIN (-fwrapv) -- confirm build flags.  */
        return n < -m ? 0 : r;
    }
    /* Positive addend: a wrapped sum is smaller than N.  */
    return r < n ? UINT64_MAX : r;
}
644
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
652
653
654
655
656
657
658
/*
 * Three-operand expander for pairwise operations, controlled by a
 * predicate.  Each pair of adjacent elements is combined; results
 * from Zn fill the even element of each pair and results from Zm the
 * odd element.  All four sources are loaded before either store so
 * that vd may alias vn or vm.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
681
682
/*
 * Similarly, specialized for 64-bit elements.  Both sources of each
 * pair are loaded before the stores so vd may alias vn or vm.
 */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
700
/* SVE2 predicated pairwise add, max and min.  */
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
728
/*
 * As DO_ZPZZ_PAIR, but for floating-point operations: OP takes the
 * softfloat status pointer as a third argument.  Sources are loaded
 * before stores so vd may alias vn or vm.
 */
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
752
/* SVE2 predicated floating-point pairwise operations.  */
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP
774
775
776
777
778
/*
 * Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide": one 64-bit element of Zm applies to each
 * group of narrow elements within the same doubleword (the inner
 * loop covers 8 bytes; the predicate is read one byte at a time).
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));             \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 7);                                        \
    }                                                           \
}
795
/* Predicated shifts by a wide (64-bit) shift count.  */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
809
810
811
/*
 * Fully general two-operand expander, controlled by a predicate,
 * for element sizes of 1, 2 or 4 bytes.  Same predicate layout as
 * DO_ZPZZ above.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
827
828
/* Similarly, specialized for 64-bit elements.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
842
/* Count leading sign bits, adjusted for the narrow element widths
 * (clrsb32 counts within 32 bits).  */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, similarly adjusted.  */
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count.  */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical (condition) NOT: 1 if the element is zero, else 0.  */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* FP absolute value: clear the sign bit of the raw representation.  */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* FP negation: toggle the sign bit of the raw representation.  */
#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* Bitwise NOT.  */
#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension of a narrow sub-element to the full element:
 * the cast truncates and (for SXT*) re-sign-extends.  */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

/* NOTE(review): -N for N == INT_MIN in ABS/NEG relies on wrapping
 * signed arithmetic (-fwrapv) -- confirm build flags.  */
#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Reverse bytes within each element.  */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

/* Reverse halfwords within each element.  */
DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

/* Reverse words within each doubleword element.  */
DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
933
934void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
935{
936 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
937 uint64_t *d = vd, *n = vn;
938 uint8_t *pg = vg;
939
940 for (i = 0; i < opr_sz; i += 2) {
941 if (pg[H1(i)] & 1) {
942 uint64_t n0 = n[i + 0];
943 uint64_t n1 = n[i + 1];
944 d[i + 0] = n1;
945 d[i + 1] = n0;
946 }
947 }
948}
949
/* Reverse the bits within each element.  */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* Saturating absolute value: MIN_ holds the most negative value of
 * the element type, whose absolute value saturates to the maximum
 * (-min_ - 1, computed after integer promotion).  */
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

/* Saturating negation: the most negative value maps to the maximum.  */
#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

/* Unsigned reciprocal (root) estimate, via the AdvSIMD helpers.  */
DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
976
977
/*
 * Unpredicated two-vector expander with a wide third operand: one
 * 64-bit element of Zm applies to each narrow element within the
 * same doubleword.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        do {                                                    \
            TYPE nn = *(TYPE *)(vn + H(i));                     \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                  \
            i += sizeof(TYPE);                                  \
        } while (i & 7);                                        \
    }                                                           \
}
991
/* Unpredicated shifts by a wide (64-bit) shift count.  */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
1017
1018
1019
1020
1021
/*
 * Three-operand widening expander: TYPEN source elements are widened
 * to TYPEW results.  Bits 0 and 1 of the descriptor data select,
 * independently for Zn and Zm, whether the even (bottom, sel == 0)
 * or odd (top, sel == sizeof(TYPEN)) narrow elements are used.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP)               \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));               \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                    \
    }                                                           \
}
1034
/* SVE2 widening add/sub/abd/mul (long) instantiations.  */
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1066
1067
/*
 * Saturating doubling multiply, 16-bit result.  Callers pass values
 * widened from int8_t, so the product itself always fits in int16_t;
 * only the doubling (prod + prod) can saturate, via DO_SQADD_H.
 */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t prod = n * m;
    return DO_SQADD_H(prod, prod);
}
1073
/*
 * Saturating doubling multiply, 32-bit result.  Callers pass values
 * widened from int16_t, so the product fits in int32_t; only the
 * doubling step can saturate, via DO_SQADD_S.
 */
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return DO_SQADD_S(prod, prod);
}
1079
/*
 * Saturating doubling multiply, 64-bit result.  Callers pass values
 * widened from int32_t, so the product fits in int64_t; only the
 * doubling step can saturate, via do_sqadd_d.
 */
static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t prod = n * m;
    return do_sqadd_d(prod, prod);
}
1085
/* SQDMULL[BT]: widening saturating doubling multiply. */
DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
1091
/*
 * Wide + narrow ops (e.g. [su]addw[bt]): N is already wide; only M's
 * narrow bottom/top half, chosen by sel2, is widened before the op.
 */
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
 } \
}
1103
/* Wide add/sub with one widened narrow operand, signed and unsigned. */
DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB
1121
/*
 * Non-widening interleaved ops: within each pair of TYPE elements,
 * sel1/sel2 select which element of N and M participate; the result is
 * written back to the position selected for N.  The other element of
 * each destination pair is left unmodified.
 */
#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
 } \
}
1134
/* EORBT/EORTB: interleaved exclusive-or, all element sizes. */
DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB
1141
/*
 * Widening op with accumulate: wide result = OP(narrow n, narrow m) + a.
 * Both narrow operands use the same bottom/top selection (sel1).
 */
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
 TYPEW aa = *(TYPEW *)(va + HW(i)); \
 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
 } \
}
1154
/* Widening absolute-difference-accumulate and multiply-add-long. */
DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1170
1171#define DO_NMUL(N, M) -(N * M)
1172
/* Widening multiply-subtract-long, via the negated-product op. */
DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
1182
/*
 * Narrow-to-bottom: apply OP (a saturating narrowing) to each wide
 * element and keep only the low half of the wide slot; the mask zeroes
 * the top half (sizeof(TYPE) * 4 = half the bits of TYPE).
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
 TYPE nn = *(TYPE *)(vn + i); \
 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
 *(TYPE *)(vd + i) = nn; \
 } \
}
1193
/*
 * Narrow-to-top: write the narrowed result into the high (odd) TYPEN
 * position of each wide slot; the low position of vd is not written.
 * The wide element is read before the narrow store, so vd == vn works.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
 TYPE nn = *(TYPE *)(vn + i); \
 *(TYPEN *)(vd + i + odd) = OP(nn); \
 } \
}
1203
/* Signed saturate to the narrow element's signed range. */
#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

/* Unsigned saturate to the narrow element's unsigned range. */
#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

/* SQXTUN[BT]: signed input (signed TYPE), unsigned saturation. */
DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
1238
/*
 * ADCLB/ADCLT, 32-bit elements: add-with-carry long.  Each 64-bit d
 * element is the sum of the low 32-bit half of a, a selected (bottom or
 * top, per sel) 32-bit half of n optionally inverted (SBCL), and the
 * incoming carry taken from bit 32 of the corresponding m element.
 */
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
 intptr_t i, opr_sz = simd_oprsz(desc);
 /* H4 adjusts the 32-bit index for big-endian host layout. */
 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
 /* All-ones when subtracting (SBCLB/SBCLT), zero when adding. */
 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 uint32_t *a = va, *n = vn;
 uint64_t *d = vd, *m = vm;

 for (i = 0; i < opr_sz / 8; ++i) {
 uint32_t e1 = a[2 * i + H4(0)];
 uint32_t e2 = n[2 * i + sel] ^ inv;
 /* Carry-in is bit 32 of the 64-bit m element. */
 uint64_t c = extract64(m[i], 32, 1);

 d[i] = c + e1 + e2;
 }
}
1255
/*
 * ADCLB/ADCLT, 64-bit elements: 128-bit add-with-carry across each pair
 * of 64-bit lanes.  e2 is the selected (even/odd) n element, optionally
 * inverted for subtract; the carry-in is bit 0 of the odd m element.
 * Both halves of the 128-bit sum are written back.
 */
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
 intptr_t i, opr_sz = simd_oprsz(desc);
 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 uint64_t *d = vd, *a = va, *n = vn, *m = vm;

 for (i = 0; i < opr_sz / 8; i += 2) {
 Int128 e1 = int128_make64(a[i]);
 Int128 e2 = int128_make64(n[i + sel] ^ inv);
 Int128 c = int128_make64(m[i + 1] & 1);
 Int128 r = int128_add(int128_add(e1, e2), c);
 d[i + 0] = int128_getlo(r);
 d[i + 1] = int128_gethi(r);
 }
}
1272
/*
 * Saturating doubling multiply add/sub long: wide result =
 * SUM_OP(a, DMUL_OP(narrow n, narrow m)), with independent bottom/top
 * selection for n and m from descriptor bits 0 and 1.
 */
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
 TYPEW aa = *(TYPEW *)(va + HW(i)); \
 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
 } \
}
1286
/* SQDMLAL/SQDMLSL: saturating accumulate or subtract of the product. */
DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
 do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
 do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
 do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
 do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
 do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
 do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
1302
/*
 * Complex integer multiply-add.  Elements are (real, imag) pairs; rot
 * (0..3) encodes the rotation: sel_a picks which element of each pair
 * is the multiplicand, sub_r/sub_i tell OP to negate the real/imag
 * product before accumulating.
 */
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
 int rot = simd_data(desc); \
 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
 bool sub_r = rot == 1 || rot == 2; \
 bool sub_i = rot >= 2; \
 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
 for (i = 0; i < opr_sz; i += 2) { \
 TYPE elt1_a = n[H(i + sel_a)]; \
 TYPE elt2_a = m[H(i + sel_a)]; \
 TYPE elt2_b = m[H(i + sel_b)]; \
 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
 } \
}
1320
/* Plain accumulate: A +/- N*M, with S selecting the sign. */
#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

/* Saturating rounding doubling accumulate; sat flag discarded here. */
#define DO_SQRDMLAH_B(N, M, A, S) \
 do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
 do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1341
/*
 * Indexed complex integer multiply-add: one (real, imag) pair of M,
 * chosen by IDX, is applied to every pair in each 16-byte segment.
 *
 * Fix: the destination stores previously indexed with the 16-bit swap
 * macro H2 regardless of element size, while the a[]/n[] reads in the
 * same statements used the per-element H macro.  Use H consistently;
 * this matters for 32-bit elements on big-endian hosts and is a no-op
 * on little-endian hosts, where all H macros are the identity.
 */
#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t i, j, oprsz = simd_oprsz(desc); \
 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
 bool sub_r = rot == 1 || rot == 2; \
 bool sub_i = rot >= 2; \
 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
 TYPE elt2_a = m[H(i + idx + sel_a)]; \
 TYPE elt2_b = m[H(i + idx + sel_b)]; \
 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
 TYPE elt1_a = n[H(i + j + sel_a)]; \
 d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
 d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
 } \
 } \
}
1362
/* Indexed CMLA and SQRDCMLAH, h and s element sizes only. */
DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
1376
1377
/*
 * Complex dot-product step for one 32-bit accumulator: N and M each
 * hold two (real, imag) int8 pairs.  sel_a/sel_b (0 or 1) pick which
 * byte of each M pair multiplies the real and imaginary parts of N;
 * sub_i is +1 or -1 and signs the imaginary product.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; pair++) {
        int shift = 16 * pair;
        int32_t re = (int8_t)(n >> shift);
        int32_t im = (int8_t)(n >> (shift + 8));
        int32_t m_sel_a = (int8_t)(m >> (shift + 8 * sel_a));
        int32_t m_sel_b = (int8_t)(m >> (shift + 8 * sel_b));

        a += re * m_sel_a + im * m_sel_b * sub_i;
    }
    return a;
}
1391
/*
 * Complex dot-product step for one 64-bit accumulator: N and M each
 * hold two (real, imag) int16 pairs.  sel_a/sel_b pick which halfword
 * of each M pair multiplies the real and imaginary parts of N; sub_i
 * is +1 or -1 and signs the imaginary product.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    int pair;

    for (pair = 0; pair < 2; pair++) {
        int shift = 32 * pair;
        int64_t re = (int16_t)(n >> shift);
        int64_t im = (int16_t)(n >> (shift + 16));
        int64_t m_sel_a = (int16_t)(m >> (shift + 16 * sel_a));
        int64_t m_sel_b = (int16_t)(m >> (shift + 16 * sel_b));

        a += re * m_sel_a + im * m_sel_b * sub_i;
    }
    return a;
}
1405
1406void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1407 void *va, uint32_t desc)
1408{
1409 int opr_sz = simd_oprsz(desc);
1410 int rot = simd_data(desc);
1411 int sel_a = rot & 1;
1412 int sel_b = sel_a ^ 1;
1413 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1414 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1415
1416 for (int e = 0; e < opr_sz / 4; e++) {
1417 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1418 }
1419}
1420
1421void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1422 void *va, uint32_t desc)
1423{
1424 int opr_sz = simd_oprsz(desc);
1425 int rot = simd_data(desc);
1426 int sel_a = rot & 1;
1427 int sel_b = sel_a ^ 1;
1428 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1429 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1430
1431 for (int e = 0; e < opr_sz / 8; e++) {
1432 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1433 }
1434}
1435
/*
 * CDOT (indexed), 32-bit accumulators: one m element per 16-byte
 * segment, selected by idx, is used against all four accumulators
 * in that segment.
 */
void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
 void *va, uint32_t desc)
{
 int opr_sz = simd_oprsz(desc);
 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
 /* H4 adjusts the 32-bit element index for big-endian hosts. */
 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
 int sel_a = rot & 1;
 int sel_b = sel_a ^ 1;
 /* Rotations 0 and 3 negate the imaginary-part product. */
 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
 uint32_t *d = vd, *n = vn, *m = vm, *a = va;

 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
 uint32_t seg_m = m[seg + idx];
 for (int e = 0; e < 4; e++) {
 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
 sel_a, sel_b, sub_i);
 }
 }
}
1455
/*
 * CDOT (indexed), 64-bit accumulators: one m element per 16-byte
 * segment (idx is 0 or 1), used against both accumulators in the
 * segment.  No H macro needed for 64-bit element indexing.
 */
void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
 void *va, uint32_t desc)
{
 int seg, opr_sz = simd_oprsz(desc);
 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 int sel_a = rot & 1;
 int sel_b = sel_a ^ 1;
 /* Rotations 0 and 3 negate the imaginary-part product. */
 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
 uint64_t *d = vd, *n = vn, *m = vm, *a = va;

 for (seg = 0; seg < opr_sz / 8; seg += 2) {
 uint64_t seg_m = m[seg + idx];
 for (int e = 0; e < 2; e++) {
 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
 sel_a, sel_b, sub_i);
 }
 }
}
1475
/*
 * Indexed three-operand op: one m element per 128-bit segment (chosen
 * by idx) is combined with every n/a element in that segment.
 */
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
 intptr_t i, j, idx = simd_data(desc); \
 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
 TYPE mm = m[i]; \
 for (j = 0; j < segment; j++) { \
 d[i + j] = OP(n[i + j], mm, a[i + j]); \
 } \
 } \
}
1489
/* Indexed SQRDMLAH: saturating rounding doubling mul-add, sat flag
 * discarded (the fifth arg 'false' selects add, 'true' enables round). */
#define DO_SQRDMLAH_H(N, M, A) \
 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

/* Indexed SQRDMLSH: as above with the subtract flag set. */
#define DO_SQRDMLSH_H(N, M, A) \
 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
1511
/*
 * Indexed widening three-operand op: one narrow m element per 16-byte
 * segment (chosen by idx) combined with each widened narrow n element
 * (bottom/top per sel) and the wide accumulator a.
 */
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
 intptr_t i, j, oprsz = simd_oprsz(desc); \
 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
 for (i = 0; i < oprsz; i += 16) { \
 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
 } \
 } \
}
1527
1528#define DO_MLA(N, M, A) (A + N * M)
1529
/* Indexed widening multiply-add-long. */
DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1534
1535#define DO_MLS(N, M, A) (A - N * M)
1536
/* Indexed widening multiply-subtract-long. */
DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

/* Indexed saturating doubling multiply add/sub long. */
#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW
1557
/*
 * Indexed widening two-operand op: as DO_ZZXW but without the
 * accumulator operand.
 */
#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
 intptr_t i, j, oprsz = simd_oprsz(desc); \
 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
 for (i = 0; i < oprsz; i += 16) { \
 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
 } \
 } \
}
1572
/* Indexed widening multiplies: saturating doubling, signed, unsigned. */
DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX
1583
/*
 * Elementwise bit-permute op: OP receives data, mask, and the element
 * width in bits.
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
 TYPE nn = *(TYPE *)(vn + i); \
 TYPE mm = *(TYPE *)(vm + i); \
 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
 } \
}
1594
/*
 * BEXT: gather the bits of DATA at positions where MASK is set, packing
 * them contiguously from bit 0 of the result.  Only the low N bits of
 * MASK/DATA are considered.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int out_bit = 0;

    for (int bit = 0; bit < n; ++bit) {
        if (mask & (1ull << bit)) {
            out |= ((data >> bit) & 1) << out_bit;
            ++out_bit;
        }
    }
    return out;
}
1608
/* BEXT for all element sizes. */
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1613
/*
 * BDEP: scatter the low-order bits of DATA into the result at the
 * positions where MASK is set, in ascending order.  Only the low N
 * bits of MASK are considered.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t out = 0;
    int src = 0;

    for (int dst = 0; dst < n; ++dst) {
        if (mask & (1ull << dst)) {
            out |= ((data >> src) & 1) << dst;
            ++src;
        }
    }
    return out;
}
1627
/* BDEP for all element sizes. */
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1632
1633static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1634{
1635 uint64_t resm = 0, resu = 0;
1636 int db, rbm = 0, rbu = 0;
1637
1638 for (db = 0; db < n; ++db) {
1639 uint64_t val = (data >> db) & 1;
1640 if ((mask >> db) & 1) {
1641 resm |= val << rbm++;
1642 } else {
1643 resu |= val << rbu++;
1644 }
1645 }
1646
1647 return resm | (resu << rbm);
1648}
1649
/* BGRP for all element sizes. */
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
1656
/*
 * Complex add with rotate: elements are (real, imag) pairs.  With
 * sub_r clear (rotate #90): real -= imag2, imag += real2; with sub_r
 * set (rotate #270): real += imag2, imag -= real2.  ADD_OP/SUB_OP may
 * be plain or saturating add/sub.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 int sub_r = simd_data(desc); \
 if (sub_r) { \
 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
 TYPE acc_r = *(TYPE *)(vn + H(i)); \
 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
 TYPE el2_r = *(TYPE *)(vm + H(i)); \
 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
 acc_r = ADD_OP(acc_r, el2_i); \
 acc_i = SUB_OP(acc_i, el2_r); \
 *(TYPE *)(vd + H(i)) = acc_r; \
 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
 } \
 } else { \
 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
 TYPE acc_r = *(TYPE *)(vn + H(i)); \
 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
 TYPE el2_r = *(TYPE *)(vm + H(i)); \
 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
 acc_r = SUB_OP(acc_r, el2_i); \
 acc_i = ADD_OP(acc_i, el2_r); \
 *(TYPE *)(vd + H(i)) = acc_r; \
 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
 } \
 } \
}
1686
/* CADD (wrapping) and SQCADD (saturating) for all element sizes. */
DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)

DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)

#undef DO_CADD
1698
/*
 * Shift-left long: widen the selected (bottom/top, bit 0 of the data
 * field) narrow element and shift left by the immediate stored in the
 * remaining data bits.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
 int shift = simd_data(desc) >> 1; \
 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
 *(TYPEW *)(vd + HW(i)) = nn << shift; \
 } \
}
1710
/* SSHLL[BT] / USHLL[BT]. */
DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)

#undef DO_ZZI_SHLL
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
/*
 * Predicated horizontal reduction for b/h/s elements.  Each 16-byte
 * group of data is covered by 16 predicate bits, loaded as one uint16_t
 * (H1_2 adjusts the predicate byte offset for big-endian hosts); the
 * predicate is shifted down by the element size as elements advance.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc); \
 TYPERED ret = INIT; \
 for (i = 0; i < opr_sz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 if (pg & 1) { \
 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
 ret = OP(ret, nn); \
 } \
 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
 } while (i & 15); \
 } \
 return (TYPERET)ret; \
}
1747
/*
 * Predicated horizontal reduction for 64-bit elements: one predicate
 * byte governs each element, tested via its low bit.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
 TYPEE *n = vn; \
 uint8_t *pg = vg; \
 TYPER ret = INIT; \
 for (i = 0; i < opr_sz; i += 1) { \
 if (pg[H1(i)] & 1) { \
 TYPEE nn = n[i]; \
 ret = OP(ret, nn); \
 } \
 } \
 return ret; \
}
1763
/* Reductions: OR/EOR/AND, add (widened to 64 bits), min/max.  The INIT
 * value is the identity element for each operation. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1810
1811
/*
 * Two-operand op with an immediate scalar: the 64-bit immediate is
 * truncated to TYPE and applied to every element.
 */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{ \
 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
 TYPE s = s64, *d = vd, *n = vn; \
 for (i = 0; i < opr_sz; ++i) { \
 d[i] = OP(n[i], s); \
 } \
}
1821
1822#define DO_SUBR(X, Y) (Y - X)
1823
/* Immediate-operand ops: reversed subtract, signed/unsigned min/max. */
DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Retire the elementwise operation macros defined earlier. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
1866
1867
1868
1869
1870static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1871{
1872 uint64_t mask = pred_esz_masks[esz];
1873 intptr_t i = words;
1874
1875 do {
1876 uint64_t this_g = g[--i] & mask;
1877 if (this_g) {
1878 return i * 64 + (63 - clz64(this_g));
1879 }
1880 } while (i > 0);
1881 return (intptr_t)-1 << esz;
1882}
1883
/*
 * PFIRST: if no element of D is yet "first" (flags bit 2 clear), set
 * the first active predicate bit of G in D; return the NZCV predicate
 * test flags for the result.  D is modified in place.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
 uint32_t flags = PREDTEST_INIT;
 uint64_t *d = vd, *g = vg;
 intptr_t i = 0;

 do {
 uint64_t this_d = d[i];
 uint64_t this_g = g[i];

 if (this_g) {
 if (!(flags & 4)) {
 /* Set the lowest active bit of g in d. */
 this_d |= this_g & -this_g;
 d[i] = this_d;
 }
 flags = iter_predtest_fwd(this_d, this_g, flags);
 }
 } while (++i < words);

 return flags;
}
1907
/*
 * PNEXT: find the next active element of G strictly after the last
 * active element of D, rewrite D to contain only that element (or all
 * zeros if none), and return the predicate test flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
 uint32_t flags = PREDTEST_INIT;
 uint64_t *d = vd, *g = vg, esz_mask;
 intptr_t i, next;

 /* First candidate bit: one element past the last active in d. */
 next = last_active_element(vd, words, esz) + (1 << esz);
 esz_mask = pred_esz_masks[esz];

 /* Scan g for the next active bit at or after 'next'. */
 if (next < words * 64) {
 uint64_t mask = -1;

 if (next & 63) {
 mask = ~((1ull << (next & 63)) - 1);
 next &= -64;
 }
 do {
 uint64_t this_g = g[next / 64] & esz_mask & mask;
 if (this_g != 0) {
 next = (next & -64) + ctz64(this_g);
 break;
 }
 next += 64;
 mask = -1;
 } while (next < words * 64);
 }

 /* Rewrite d with only the found bit set (none if next >= limit). */
 i = 0;
 do {
 uint64_t this_d = 0;
 if (i == next / 64) {
 this_d = 1ull << (next & 63);
 }
 d[i] = this_d;
 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
 } while (++i < words);

 return flags;
}
1951
1952
1953
1954
1955
1956void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1957{
1958 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1959 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1960 uint64_t *d = vd, *n = vn;
1961 uint8_t *pg = vg;
1962
1963 for (i = 0; i < opr_sz; i += 1) {
1964 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1965 }
1966}
1967
1968void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1969{
1970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1971 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1972 uint64_t *d = vd, *n = vn;
1973 uint8_t *pg = vg;
1974
1975 for (i = 0; i < opr_sz; i += 1) {
1976 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1977 }
1978}
1979
1980void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1981{
1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1984 uint64_t *d = vd, *n = vn;
1985 uint8_t *pg = vg;
1986
1987 for (i = 0; i < opr_sz; i += 1) {
1988 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1989 }
1990}
1991
1992void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1993{
1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1995 uint64_t *d = vd, *n = vn;
1996 uint8_t *pg = vg;
1997 uint8_t inv = simd_data(desc);
1998
1999 for (i = 0; i < opr_sz; i += 1) {
2000 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2001 }
2002}
2003
2004
2005
/*
 * Two-operand predicated expander with immediate operand, for element
 * sizes 1, 2 and 4: D[i] = OP(N[i], imm) for active elements; inactive
 * elements of D are left unchanged.  The predicate is loaded 16 bits
 * at a time (one bit per byte of vector), and shifted by sizeof(TYPE)
 * per element, so the inner loop covers one 16-byte chunk.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
2022
2023
/*
 * As DO_ZPZI, but for 8-byte elements: one predicate byte per element,
 * of which only bit 0 is significant.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}

#define DO_SHR(N, M) (N >> M)
#define DO_SHL(N, M) (N << M)

/*
 * ASRD: arithmetic shift right for division.  For negative N, add
 * 2**M - 1 first so that the shift rounds toward zero rather than
 * toward negative infinity.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2046
/*
 * Unsigned rounding (round-half-up) right shift.
 * sh == 64 leaves only the rounding bit (bit 63); larger shifts are 0.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh >= 64) {
        return sh == 64 ? x >> 63 : 0;
    }
    uint64_t round = (x >> (sh - 1)) & 1;
    return (x >> sh) + round;
}
2057
/*
 * Signed rounding (round-half-up) right shift.
 * For sh >= 64, rounding the sign bit always produces 0.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh >= 64) {
        return 0;
    }
    int64_t round = (x >> (sh - 1)) & 1;
    return (x >> sh) + round;
}
2067
/* Predicated shift-by-immediate, per element size.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* ASRD: arithmetic shift rounding toward zero.  */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* SVE2 saturating shift-left by immediate.  */
DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)

DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)

/* SVE2 rounding shift-right by immediate.  */
DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)

DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)

/*
 * SQSHLU: signed input, left shift with unsigned saturation; the
 * saturation flag from the underlying helper is discarded.
 * Note that do_suqrshl_d the macro expands to a call of the like-named
 * *function* -- function-like macros do not expand recursively.
 */
#define do_suqrshl_b(n, m) \
    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_suqrshl_h(n, m) \
    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_suqrshl_s(n, m) \
    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
#define do_suqrshl_d(n, m) \
    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })

DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)

#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
2126
/*
 * Shift-and-narrow, writing the bottom half of each wide element:
 * the narrowed result lands in the low TYPEN bits, and the store is
 * of the full TYPEW width, so the high half is zeroed.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                     \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    int shift = simd_data(desc);                             \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
        TYPEW nn = *(TYPEW *)(vn + i);                       \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
    }                                                        \
}
2137
/*
 * Shift-and-narrow, writing only the top (odd) TYPEN slot of each wide
 * element; the bottom half of D is preserved.  Each wide read precedes
 * the narrow store of the same element, so vd may alias vn.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
2148
/* Plain narrowing shifts.  */
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)

/* Rounding narrowing shifts.  */
DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)

/* Signed shift, saturate to unsigned narrow range.  The _D form caps
   the shift at 63 so the sign still propagates before saturation.  */
#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)

/* Signed rounding shift, saturate to unsigned narrow range.  */
#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)

/* Signed shift, saturate to signed narrow range.  */
#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)

/* Signed rounding shift, saturate to signed narrow range.  */
#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)

/* Unsigned shift, saturate to unsigned narrow range (simple MIN).  */
#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)

/* Unsigned rounding shift, saturate to unsigned narrow range.  */
#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
2240
/*
 * Binary-op-and-narrow, bottom half: as DO_SHRNB but with two wide
 * inputs.  The full TYPEW store zeroes the high half of each element.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
{                                                                \
    intptr_t i, opr_sz = simd_oprsz(desc);                       \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                \
        TYPEW nn = *(TYPEW *)(vn + i);                           \
        TYPEW mm = *(TYPEW *)(vm + i);                           \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);           \
    }                                                            \
}
2251
/*
 * Binary-op-and-narrow, top half: writes only the odd TYPEN slot of
 * each TYPEW element; both wide reads precede the narrow store, so
 * the destination may alias either input.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)            \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc);                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                    \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                           \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                           \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);  \
    }                                                                \
}
2262
/* High-half narrowing add/sub; the R-forms add the rounding constant
   2**(SH-1) before taking the high half.  */
#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)

DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)

DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)

DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)

DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)

#undef DO_RSUBHN
#undef DO_SUBHN
#undef DO_RADDHN
#undef DO_ADDHN

/* NOTE(review): only DO_BINOPNB is undefined here; DO_BINOPNT is left
   defined, unlike the paired #undefs above -- confirm it is not used
   later in the file, else undef it too for symmetry.  */
#undef DO_BINOPNB
2306
2307
2308
/*
 * Fully general four-operand predicated expander, for element sizes
 * 1, 2 and 4: D[i] = OP(A[i], N[i], M[i]) for active elements;
 * inactive elements of D are left unchanged.  Predicate handling is
 * as in DO_ZPZI: 16 bytes of vector per predicate load, one bit per
 * byte, shifted by sizeof(TYPE) per element.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
2327
2328
/*
 * As DO_ZPZZZ, but for 8-byte elements: one predicate byte per
 * element, of which only bit 0 is significant.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}
2343
/* Predicated multiply-accumulate / multiply-subtract.  */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
2363
2364void HELPER(sve_index_b)(void *vd, uint32_t start,
2365 uint32_t incr, uint32_t desc)
2366{
2367 intptr_t i, opr_sz = simd_oprsz(desc);
2368 uint8_t *d = vd;
2369 for (i = 0; i < opr_sz; i += 1) {
2370 d[H1(i)] = start + i * incr;
2371 }
2372}
2373
2374void HELPER(sve_index_h)(void *vd, uint32_t start,
2375 uint32_t incr, uint32_t desc)
2376{
2377 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2378 uint16_t *d = vd;
2379 for (i = 0; i < opr_sz; i += 1) {
2380 d[H2(i)] = start + i * incr;
2381 }
2382}
2383
2384void HELPER(sve_index_s)(void *vd, uint32_t start,
2385 uint32_t incr, uint32_t desc)
2386{
2387 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2388 uint32_t *d = vd;
2389 for (i = 0; i < opr_sz; i += 1) {
2390 d[H4(i)] = start + i * incr;
2391 }
2392}
2393
2394void HELPER(sve_index_d)(void *vd, uint64_t start,
2395 uint64_t incr, uint32_t desc)
2396{
2397 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2398 uint64_t *d = vd;
2399 for (i = 0; i < opr_sz; i += 1) {
2400 d[i] = start + i * incr;
2401 }
2402}
2403
2404void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2405{
2406 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2407 uint32_t sh = simd_data(desc);
2408 uint32_t *d = vd, *n = vn, *m = vm;
2409 for (i = 0; i < opr_sz; i += 1) {
2410 d[i] = n[i] + (m[i] << sh);
2411 }
2412}
2413
2414void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2415{
2416 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2417 uint64_t sh = simd_data(desc);
2418 uint64_t *d = vd, *n = vn, *m = vm;
2419 for (i = 0; i < opr_sz; i += 1) {
2420 d[i] = n[i] + (m[i] << sh);
2421 }
2422}
2423
2424void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2425{
2426 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2427 uint64_t sh = simd_data(desc);
2428 uint64_t *d = vd, *n = vn, *m = vm;
2429 for (i = 0; i < opr_sz; i += 1) {
2430 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2431 }
2432}
2433
2434void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2435{
2436 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2437 uint64_t sh = simd_data(desc);
2438 uint64_t *d = vd, *n = vn, *m = vm;
2439 for (i = 0; i < opr_sz; i += 1) {
2440 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2441 }
2442}
2443
/*
 * FEXPA, half precision: build a float16 from a table-supplied
 * fraction and the exponent taken from the input.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* 32-entry table indexed by the low 5 bits of the input;
       presumably the float16 fraction bits of 2^(i/32) -- confirm
       against the Arm ARM FEXPA pseudocode.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        /* Input bits [9:5] become the float16 exponent field.  */
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}
2463
/*
 * FEXPA, single precision: build a float32 from a table-supplied
 * fraction and the exponent taken from the input.
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* 64-entry table indexed by the low 6 bits of the input;
       presumably the float32 fraction bits of 2^(i/64) -- confirm
       against the Arm ARM FEXPA pseudocode.  */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        /* Input bits [13:6] become the float32 exponent field.  */
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}
2495
/*
 * FEXPA, double precision: build a float64 from a table-supplied
 * fraction and the exponent taken from the input.
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* 64-entry table indexed by the low 6 bits of the input;
       presumably the float64 fraction bits of 2^(i/64) -- confirm
       against the Arm ARM FEXPA pseudocode.  */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        /* Input bits [16:6] become the float64 exponent field.  */
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
2533
2534void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2535{
2536 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2537 uint16_t *d = vd, *n = vn, *m = vm;
2538 for (i = 0; i < opr_sz; i += 1) {
2539 uint16_t nn = n[i];
2540 uint16_t mm = m[i];
2541 if (mm & 1) {
2542 nn = float16_one;
2543 }
2544 d[i] = nn ^ (mm & 2) << 14;
2545 }
2546}
2547
2548void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2549{
2550 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2551 uint32_t *d = vd, *n = vn, *m = vm;
2552 for (i = 0; i < opr_sz; i += 1) {
2553 uint32_t nn = n[i];
2554 uint32_t mm = m[i];
2555 if (mm & 1) {
2556 nn = float32_one;
2557 }
2558 d[i] = nn ^ (mm & 2) << 30;
2559 }
2560}
2561
2562void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2563{
2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2565 uint64_t *d = vd, *n = vn, *m = vm;
2566 for (i = 0; i < opr_sz; i += 1) {
2567 uint64_t nn = n[i];
2568 uint64_t mm = m[i];
2569 if (mm & 1) {
2570 nn = float64_one;
2571 }
2572 d[i] = nn ^ (mm & 2) << 62;
2573 }
2574}
2575
2576
2577
2578
2579
/* Signed saturating addition of a scalar B to each byte element.  */
void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
    }
}

/* Signed saturating addition of a scalar B to each halfword element.  */
void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
    }
}

/* Signed saturating addition of a scalar B to each word element.  */
void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
    }
}

/* Signed saturating addition of a scalar B to each doubleword element.  */
void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
    }
}
2615
2616
2617
2618
2619
/* Unsigned saturating addition of a (possibly negative) scalar B
   to each byte element.  */
void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
    }
}

/* As above, for halfword elements.  */
void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
    }
}

/* As above, for word elements.  */
void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
    }
}

/* As above, for doubleword elements; B is unsigned here.  */
void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
    }
}

/* Unsigned saturating subtraction of scalar B from each doubleword.  */
void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
    }
}
2664
2665
2666
2667
/* CPY (merging), bytes: replicate scalar MM into active elements,
   keeping N in the inactive ones.  */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        /* Per-bit select between MM and NN via the expanded predicate.  */
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

/* CPY (merging), halfwords.  */
void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

/* CPY (merging), words.  */
void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

/* CPY (merging), doublewords: one predicate bit per element.  */
void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}
2725
/* CPY (zeroing), bytes: replicate scalar VAL into active elements,
   zeroing the inactive ones.  */
void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

/* CPY (zeroing), halfwords.  */
void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

/* CPY (zeroing), words.  */
void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

/* CPY (zeroing), doublewords: one predicate bit per element.  */
void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}
2772
2773
2774
2775
/*
 * memmove with big-endian byte-index frobbing: when host and vector
 * layouts differ (big-endian host), copy in the largest unit for which
 * source, destination and length are all aligned, applying the
 * matching H* index swizzle.  Little-endian hosts take the plain
 * memmove path unconditionally.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    /* Low 3 bits of any of d, s, n determine the common alignment.  */
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#if !HOST_BIG_ENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        /* 8-byte aligned: no frobbing necessary.  */
        memmove(vd, vs, n);
        break;

    case 4:
        /* 4-byte units; copy forward or backward depending on overlap.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 2-byte units.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Byte-at-a-time fallback.  */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
2832
2833
/* Similarly for memset of 0, with the same big-endian frobbing.  */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#if !HOST_BIG_ENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        /* 8-byte aligned: no frobbing necessary.  */
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
2873
/*
 * EXT: extract a vector-sized window from the concatenation Zn:Zm,
 * starting at byte offset n_ofs (carried in simd_data).  The three
 * cases order the copies so that overlapping operands are not
 * clobbered before being read.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        /* D may alias N: write N's tail first, then M's head.  */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* D == M: move M's head up before overwriting with N's tail.  */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm: need temporary space for M's head.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
2894
/*
 * INSR: shift all of N up by one element and insert the scalar VAL
 * into element 0.  The shifted copy must complete before the store
 * to element 0, which swap_memmove's ordering guarantees.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{                                                                  \
    intptr_t opr_sz = simd_oprsz(desc);                            \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
    *(TYPE *)(vd + H(0)) = val;                                    \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, H1_8)

#undef DO_INSR
2909
/*
 * REV: reverse the element order of the vector.  Each helper swaps
 * 64-bit words end-for-end, then reverses the elements within each
 * word (bswap64 for bytes, hswap64 for halfwords, rol64-by-32 for
 * words, nothing for doublewords).  The front/back pair is read
 * before either is written, so the in-place case is safe.
 */
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}
2953
/* Signature shared by the per-size table-lookup worker functions. */
typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);

/*
 * Common setup for the single-table TBL/TBX forms.  If the destination
 * aliases the table input, operate on a saved copy so the writes to VD
 * do not corrupt the table as it is consumed.
 */
static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
                           bool is_tbx, tb_impl_fn *fn)
{
    ARMVectorReg scratch;
    uintptr_t oprsz = simd_oprsz(desc);

    if (unlikely(vd == vn)) {
        vn = memcpy(&scratch, vn, oprsz);
    }

    fn(vd, vn, NULL, vm, oprsz, is_tbx);
}
2968
2969static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2970 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2971{
2972 ARMVectorReg scratch;
2973 uintptr_t oprsz = simd_oprsz(desc);
2974
2975 if (unlikely(vd == vn0)) {
2976 vn0 = memcpy(&scratch, vn0, oprsz);
2977 if (vd == vn1) {
2978 vn1 = vn0;
2979 }
2980 } else if (unlikely(vd == vn1)) {
2981 vn1 = memcpy(&scratch, vn1, oprsz);
2982 }
2983
2984 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2985}
2986
2987#define DO_TB(SUFF, TYPE, H) \
2988static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
2989 void *vm, uintptr_t oprsz, bool is_tbx) \
2990{ \
2991 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
2992 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
2993 for (i = 0; i < nelem; ++i) { \
2994 TYPE index = indexes[H1(i)], val = 0; \
2995 if (index < nelem) { \
2996 val = tbl0[H(index)]; \
2997 } else { \
2998 index -= nelem; \
2999 if (tbl1 && index < nelem) { \
3000 val = tbl1[H(index)]; \
3001 } else if (is_tbx) { \
3002 continue; \
3003 } \
3004 } \
3005 d[H(i)] = val; \
3006 } \
3007} \
3008void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3009{ \
3010 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3011} \
3012void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3013 void *vm, uint32_t desc) \
3014{ \
3015 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3016} \
3017void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3018{ \
3019 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3020}
3021
3022DO_TB(b, uint8_t, H1)
3023DO_TB(h, uint16_t, H2)
3024DO_TB(s, uint32_t, H4)
3025DO_TB(d, uint64_t, H8)
3026
3027#undef DO_TB
3028
/*
 * [SU]UNPK[LO,HI]: widen the selected half of the source (the
 * translator passes the desired half's address as VN) into elements of
 * double the width, with sign or zero extension per TYPED/TYPES.
 *
 * The output is written faster than the input is read, so if the
 * source half overlaps the destination we read from a copy.
 * NOTE(review): "vn - vd < opr_sz" is a signed comparison; when vn
 * precedes vd the difference is negative, which also takes the (safe)
 * copy path.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)

#undef DO_UNPK
3053
3054
3055
3056
3057
/*
 * Mask of the bits belonging to the even-numbered fields of size
 * 1 << index bits (1, 2, 4, 8, 16).  Used both for predicate TRN and
 * by expand_bits/compress_bits below, hence the 16-bit entry.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
3065
3066
3067
3068
3069
3070
/*
 * Zero-extend units of 2**N bits to units of 2**(N+1) bits: spread the
 * low 32 bits of X so that each N-sized group occupies the even half
 * of a doubled group, with zeros interleaved.  For N == 0 this is the
 * classic "bit deposit" interleave step.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
3082
3083
3084
3085
3086
3087
/*
 * Compress units of 2**(N+1) bits to units of 2**N bits: the inverse
 * of expand_bits.  Keeps the even field of each doubled group and
 * packs the fields together; the result fits in the low 32 bits.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
3099
/*
 * ZIP (predicates): interleave the selected (low/high, per DATA)
 * halves of the two predicate inputs.  Each input half is bit-expanded
 * so that N supplies the even element fields and M the odd fields.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The entire result fits in a single uint64_t. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Whole words: expand 32 predicate bits to 64 at a time. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd-sized predicate: work a byte (-> 16 bits) at a time. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
3163
/*
 * UZP (predicates): concatenate the even (or odd, per DATA) element
 * fields of N then M.  Each pair of input 64-bit words compresses to
 * one output word.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Both input halves fit in single words. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* M is consumed after D is written; copy it if they overlap. */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For VL which is not a multiple of 512, the results from M do
         * not align nicely with the uint64_t for D.  Put the aligned
         * results from M into TMP_M and then copy it into place
         * afterward.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned case: results from M land directly in D's high half. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
3232
3233void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3234{
3235 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3236 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3237 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3238 uint64_t *d = vd, *n = vn, *m = vm;
3239 uint64_t mask;
3240 int shr, shl;
3241 intptr_t i;
3242
3243 shl = 1 << esz;
3244 shr = 0;
3245 mask = even_bit_esz_masks[esz];
3246 if (odd) {
3247 mask <<= shl;
3248 shr = shl;
3249 shl = 0;
3250 }
3251
3252 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3253 uint64_t nn = (n[i] & mask) >> shr;
3254 uint64_t mm = (m[i] & mask) << shl;
3255 d[i] = nn + mm;
3256 }
3257}
3258
3259
/*
 * Reverse the order of units of 2**N bits within a 64-bit word:
 * byte-swap first, then swap progressively smaller groups (4-, 2-,
 * 1-bit) down to the requested unit size.
 */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
3271
/*
 * Reverse the order of units of 2**N bits within a single byte,
 * swapping progressively smaller groups (nibbles, then 2-bit pairs,
 * then single bits) down to the requested unit size.  N >= 3 leaves
 * the byte unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int step;

    for (step = 2; step >= n; step--) {
        int sh = 1 << step;
        x = ((x & mask[step]) << sh) | ((x >> sh) & mask[step]);
    }
    return x;
}
3282
/*
 * REV (predicate): reverse the order of the predicate elements.
 * Mirror the bytes/words end-for-end, reversing the element-sized bit
 * groups within each; the small case pre-shifts so the valid bits end
 * at bit 63 before the full 64-bit reversal.
 */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* The whole predicate fits in one word. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Whole pairs of 64-bit words: swap and bit-reverse each pair. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Odd-sized predicate: swap and bit-reverse byte pairs. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
3312
/*
 * PUNPK[LO,HI]: widen the selected half (per DATA) of the predicate,
 * spreading each bit into the even position of a doubled field.
 * Structure mirrors sve_zip_p, with a single input.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The result fits in a single uint64_t. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* Whole words: expand 32 predicate bits to 64 at a time. */
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd-sized predicate: expand a byte to 16 bits at a time. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
3358
/*
 * ZIP (vectors): interleave elements from the selected halves of N
 * and M (odd_ofs selects the high half).  For the SVE2 quadword form,
 * a trailing 16 bytes beyond an odd multiple of 16 is zeroed.
 */
#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i, oprsz_2 = oprsz / 2; \
    ARMVectorReg tmp_n, tmp_m; \
    /* We produce output faster than we consume input.                  \
       Therefore we must be mindful of possible overlap.  */            \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
        vn = memcpy(&tmp_n, vn, oprsz); \
    } \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
        vm = memcpy(&tmp_m, vm, oprsz); \
    } \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
            *(TYPE *)(vm + odd_ofs + H(i)); \
    } \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
        memset(vd + oprsz - 16, 0, 16); \
    } \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, H1_8)
DO_ZIP(sve2_zip_q, Int128, )
3389
/*
 * UZP (vectors): concatenate the even (or odd, per odd_ofs) elements
 * of N then M.  N is consumed no slower than D is written, so only M
 * needs an overlap copy.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i, p; \
    ARMVectorReg tmp_m; \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
        vm = memcpy(&tmp_m, vm, oprsz); \
    } \
    i = 0, p = odd_ofs; \
    do { \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
    } while (p < oprsz); \
    p -= oprsz; \
    do { \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
        i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
    } while (p < oprsz); \
    tcg_debug_assert(i == oprsz); \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
3418
/*
 * TRN (vectors): for each pair of elements, take the even (or odd,
 * per odd_ofs) element of N into the even slot and of M into the odd
 * slot.  Loads precede stores within each pair, so in-place operation
 * is safe without a scratch copy.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i; \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
        *(TYPE *)(vd + H(i + 0)) = ae; \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
    } \
    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
        memset(vd + oprsz - 16, 0, 16); \
    } \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, H1_8)
DO_TRN(sve2_trn_q, Int128, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
3445
3446void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3447{
3448 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3449 uint32_t *d = vd, *n = vn;
3450 uint8_t *pg = vg;
3451
3452 for (i = j = 0; i < opr_sz; i++) {
3453 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3454 d[H4(j)] = n[H4(i)];
3455 j++;
3456 }
3457 }
3458 for (; j < opr_sz; j++) {
3459 d[H4(j)] = 0;
3460 }
3461}
3462
3463void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3464{
3465 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3466 uint64_t *d = vd, *n = vn;
3467 uint8_t *pg = vg;
3468
3469 for (i = j = 0; i < opr_sz; i++) {
3470 if (pg[H1(i)] & 1) {
3471 d[j] = n[i];
3472 j++;
3473 }
3474 }
3475 for (; j < opr_sz; j++) {
3476 d[j] = 0;
3477 }
3478}
3479
3480
3481
3482
3483
/*
 * Return the index of the last active element of the predicate, or -1
 * if none are active.  Thin wrapper converting the predicate
 * descriptor into the word count and element size expected by
 * last_active_element().
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}
3491
/*
 * SPLICE: copy the range of N from the first through the last active
 * element to the start of the destination, then fill the remainder
 * with the leading elements of M.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert (word offset, bit) to byte offsets and compute the
           byte length of the active range, inclusive of both ends. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
3528
/*
 * SEL (byte elements): for each byte, choose N where the governing
 * predicate bit is set, else M.  expand_pred_b turns the 8 predicate
 * bits of a byte into a 64-bit byte mask.
 */
void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3542
/*
 * SEL (halfword elements): as sve_sel_zpzz_b, with expand_pred_h
 * turning the even predicate bits into 16-bit field masks.
 */
void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3556
/*
 * SEL (word elements): as sve_sel_zpzz_b, with expand_pred_s turning
 * predicate bits 0 and 4 into 32-bit field masks.
 */
void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}
3570
3571void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3572 void *vg, uint32_t desc)
3573{
3574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3575 uint64_t *d = vd, *n = vn, *m = vm;
3576 uint8_t *pg = vg;
3577
3578 for (i = 0; i < opr_sz; i += 1) {
3579 uint64_t nn = n[i], mm = m[i];
3580 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3581 }
3582}
3583
/*
 * SEL (quadword elements): 16 predicate bits (one uint16_t) cover each
 * 128-bit element; bit 0 selects N over M.
 */
void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 16;
    Int128 *d = vd, *n = vn, *m = vm;
    uint16_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H2(i)] & 1 ? n : m)[i];
    }
}
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
/*
 * Two-operand comparison controlled by a predicate.
 *
 * Iterate backwards over the vector 64 predicate bits (one chunk of
 * 64 bytes of data) at a time: accumulate each element's comparison
 * result into bit (i * sizeof(TYPE)) of OUT via the shift/or pair,
 * mask with the governing predicate, store the predicate result, and
 * fold the NZCV flags with iter_predtest_bwd.  MASK selects the
 * canonical (lowest) predicate bit for each element size.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
3683
3684
/*
 * Comparison against the low doubleword of each 64-bit segment of VM
 * (the "wide elements" forms).  As DO_CMP_PPZW's narrower sibling
 * DO_CMP_PPZZ, but each 64-bit value MM is compared against all the
 * narrow elements of the corresponding 8-byte chunk of VN.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
3760
3761
/*
 * Comparison against an immediate, controlled by a predicate.
 * The immediate is passed via simd_data(desc); otherwise the structure
 * is identical to DO_CMP_PPZZ above.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}

#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
3848
3849
3850static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3851{
3852 intptr_t i;
3853
3854 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3855 uint64_t pg = *(uint64_t *)(vg + i);
3856 if (pg) {
3857 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3858 }
3859 }
3860 return 0;
3861}
3862
3863
3864
3865
3866
/*
 * Compute a mask into *RETB that is true for all G, up to and
 * including (if AFTER) or excluding (if !AFTER) the first G & N.
 * BRK is true if a break was already found in an earlier word;
 * return the (possibly updated) break state for the next word.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        /* A break was found in an earlier word: everything is off. */
        b = 0;
    } else if ((g & n) == 0) {
        /* No break in this word: pass through all governed bits. */
        b = g;
    } else {
        /* Break somewhere in this word: isolate the lowest set bit of
           G & N and form the mask below (and including, if AFTER) it. */
        b = g & n;
        b = b & -b;
        if (after) {
            b = b | (b - 1);
        } else {
            b = b - 1;
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
3892
3893
/* Compute a zeroing BRK: inactive elements are cleared. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_b & this_g;
    }
}
3907
3908
/* As compute_brk_z, but also compute the predicate-test flags. */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}
3925
3926
/* Compute a merging BRK: inactive elements retain their old value. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = (this_b & this_g) | (d[i] & ~this_g);
    }
}
3940
3941
3942static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3943 intptr_t oprsz, bool after)
3944{
3945 uint32_t flags = PREDTEST_INIT;
3946 bool brk = false;
3947 intptr_t i;
3948
3949 for (i = 0; i < oprsz / 8; ++i) {
3950 uint64_t this_b, this_d = d[i], this_g = g[i];
3951
3952 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3954 flags = iter_predtest_fwd(this_d, this_g, flags);
3955 }
3956 return flags;
3957}
3958
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
       The compiler should turn this into a few store-zero insns.  */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
3967
/* BRKPA: propagate break after; zero the result if the last active
 * element of N (per G) is false. */
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, true);
    } else {
        do_zero(vd, oprsz);
    }
}

/* BRKPAS: as BRKPA, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, true);
    } else {
        return do_zero(vd, oprsz);
    }
}

/* BRKPB: propagate break before. */
void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, false);
    } else {
        do_zero(vd, oprsz);
    }
}

/* BRKPBS: as BRKPB, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, false);
    } else {
        return do_zero(vd, oprsz);
    }
}
4011
/* BRKA (zeroing): break after the first active true element. */
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

/* BRKAS (zeroing): as BRKA, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

/* BRKB (zeroing): break before the first active true element. */
void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

/* BRKBS (zeroing): as BRKB, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}
4035
/* BRKA (merging): inactive elements keep their previous value. */
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

/* BRKAS (merging): as BRKA, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

/* BRKB (merging). */
void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

/* BRKBS (merging): as BRKB, also returning the predicate-test flags. */
uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}
4059
/* BRKN: if the last active element of N is false, zero the
 * destination; otherwise leave it unchanged. */
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}
4067
4068
/*
 * Compute the predicate-test flags for D against an all-true governing
 * predicate of ESZ_MASK elements, truncated to OPRSZ bytes (the final
 * partial word, if any, is masked down to the valid bits).
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
4084
/* BRKNS: as sve_brkn, but also return the NZCV predicate-test flags.  */
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        /* VD unchanged; test it against an all-true governing predicate.  */
        return predtest_ones(vd, oprsz, -1);
    } else {
        return do_zero(vd, oprsz);
    }
}
4094
4095uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4096{
4097 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4098 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4099 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4100 intptr_t i;
4101
4102 for (i = 0; i < words; ++i) {
4103 uint64_t t = n[i] & g[i] & mask;
4104 sum += ctpop64(t);
4105 }
4106 return sum;
4107}
4108
/*
 * WHILE (increment form): construct a predicate with the low COUNT
 * predicate bits set (subject to the element-size mask) and return
 * the NZCV predicate-test flags for the result.
 */
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits: full words, then a partial word.  */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4134
/*
 * WHILE (decrement form): construct a predicate with the high COUNT
 * predicate bits set (subject to the element-size mask) and return
 * the NZCV predicate-test flags for the result.
 */
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    /* Total number of predicate bits for this vector length.  */
    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    /* The topmost word may cover fewer than 64 predicate bits.  */
    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    /* Fill downward from the top; INVCOUNT is the number of low
     * predicate bits that remain clear.  */
    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    /* Boundary word: keep only the bits at and above INVCOUNT.  */
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
4170
4171
4172
4173
4174
4175
4176
4177
4178
/*
 * Horizontal FP reduction.  Copy the active elements (and IDENT for the
 * inactive and trailing lanes) into a full-register-sized temporary,
 * then reduce pairwise in a balanced tree so that the result does not
 * depend on the vector length beyond the padding with IDENT.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                             \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n)  \
{                                                                         \
    if (n == 1) {                                                         \
        return *data;                                                     \
    } else {                                                              \
        uintptr_t half = n / 2;                                           \
        TYPE lo = NAME##_reduce(data, status, half);                      \
        TYPE hi = NAME##_reduce(data + half, status, half);               \
        return TYPE##_##FUNC(lo, hi, status);                             \
    }                                                                     \
}                                                                         \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)        \
{                                                                         \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);       \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                       \
    for (i = 0; i < oprsz; ) {                                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE nn = *(TYPE *)(vn + H(i));                               \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);          \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
    for (; i < maxsz; i += sizeof(TYPE)) {                                \
        *(TYPE *)((void *)data + i) = IDENT;                              \
    }                                                                     \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));                 \
}
4208
/* Identity for add is +0.0.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)

/* Identity for min is +inf; for max it is -inf.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))

#undef DO_REDUCE
4231
/*
 * FADDA (half): strictly-ordered add-accumulate of the active float16
 * elements of VM into the scalar NN, in ascending element order.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        /* 16 predicate bits govern the next 16 bytes of elements.  */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
4251
4252uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4253 void *status, uint32_t desc)
4254{
4255 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4256 float32 result = nn;
4257
4258 do {
4259 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4260 do {
4261 if (pg & 1) {
4262 float32 mm = *(float32 *)(vm + H1_2(i));
4263 result = float32_add(result, mm, status);
4264 }
4265 i += sizeof(float32), pg >>= sizeof(float32);
4266 } while (i & 15);
4267 } while (i < opr_sz);
4268
4269 return result;
4270}
4271
4272uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4273 void *status, uint32_t desc)
4274{
4275 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4276 uint64_t *m = vm;
4277 uint8_t *pg = vg;
4278
4279 for (i = 0; i < opr_sz; i++) {
4280 if (pg[H1(i)] & 1) {
4281 nn = float64_add(nn, m[i], status);
4282 }
4283 }
4284
4285 return nn;
4286}
4287
4288
4289
4290
/*
 * Predicated two-operand FP operations.  The vector is walked from the
 * top down, one 64-bit predicate word at a time; inactive elements of
 * VD are left unmodified (merging predication).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                 \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i = simd_oprsz(desc);                                        \
    uint64_t *g = vg;                                                     \
    do {                                                                  \
        uint64_t pg = g[(i - 1) >> 6];                                    \
        do {                                                              \
            i -= sizeof(TYPE);                                            \
            if (likely((pg >> (i & 63)) & 1)) {                           \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);                \
            }                                                             \
        } while (i & 63);                                                 \
    } while (i != 0);                                                     \
}
4309
/* Predicated basic FP arithmetic, for each element size.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)

/* NaN-propagation-suppressing min/max (minNum/maxNum).  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4341
4342static inline float16 abd_h(float16 a, float16 b, float_status *s)
4343{
4344 return float16_abs(float16_sub(a, b, s));
4345}
4346
4347static inline float32 abd_s(float32 a, float32 b, float_status *s)
4348{
4349 return float32_abs(float32_sub(a, b, s));
4350}
4351
4352static inline float64 abd_d(float64 a, float64 b, float_status *s)
4353{
4354 return float64_abs(float64_sub(a, b, s));
4355}
4356
4357DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4358DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4359DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4360
4361static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4362{
4363 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4364 return float64_scalbn(a, b_int, s);
4365}
4366
/* FSCALE: scale by a signed integer exponent (note signed TYPEs).  */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)

/* FMULX, via the shared AdvSIMD helpers.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)

#undef DO_ZPZZ_FP
4376
4377
4378
4379
/*
 * Predicated FP operations between a vector and an immediate scalar
 * (truncated to TYPE).  Same top-down, merging walk as DO_ZPZZ_FP.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                                     \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,          \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i = simd_oprsz(desc);                                        \
    uint64_t *g = vg;                                                     \
    TYPE mm = scalar;                                                     \
    do {                                                                  \
        uint64_t pg = g[(i - 1) >> 6];                                    \
        do {                                                              \
            i -= sizeof(TYPE);                                            \
            if (likely((pg >> (i & 63)) & 1)) {                           \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);                \
            }                                                             \
        } while (i & 63);                                                 \
    } while (i != 0);                                                     \
}
4398
/* Predicated FP add/sub/mul by immediate scalar.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4410
4411static inline float16 subr_h(float16 a, float16 b, float_status *s)
4412{
4413 return float16_sub(b, a, s);
4414}
4415
4416static inline float32 subr_s(float32 a, float32 b, float_status *s)
4417{
4418 return float32_sub(b, a, s);
4419}
4420
4421static inline float64 subr_d(float64 a, float64 b, float_status *s)
4422{
4423 return float64_sub(b, a, s);
4424}
4425
/* FSUBR and min/max variants against an immediate scalar.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4445
4446
4447
4448
/*
 * Predicated unary FP operations.  Same top-down, merging walk as
 * DO_ZPZZ_FP; inactive elements of VD are left unmodified.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                         \
    intptr_t i = simd_oprsz(desc);                                        \
    uint64_t *g = vg;                                                     \
    do {                                                                  \
        uint64_t pg = g[(i - 1) >> 6];                                    \
        do {                                                              \
            i -= sizeof(TYPE);                                            \
            if (likely((pg >> (i & 63)) & 1)) {                           \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                    \
            }                                                             \
        } while (i & 63);                                                 \
    } while (i != 0);                                                     \
}
4465
4466
4467
4468
4469
/*
 * fp16 -> fp32 widening conversion, with flush-to-zero of fp16 inputs
 * temporarily suppressed around the conversion.
 * NOTE(review): presumably because FPCR.FZ16 does not apply to
 * conversions — confirm against the Arm ARM.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4480
/* fp16 -> fp64 widening conversion; see sve_f16_to_f32 for the
 * flush-to-zero suppression.  */
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
4491
/*
 * fp32 -> fp16 narrowing conversion, with flush-to-zero of the result
 * temporarily suppressed around the conversion.
 */
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4502
/* fp64 -> fp16 narrowing conversion; see sve_f32_to_f16.  */
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
4513
4514static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4515{
4516 if (float16_is_any_nan(f)) {
4517 float_raise(float_flag_invalid, s);
4518 return 0;
4519 }
4520 return float16_to_int16_round_to_zero(f, s);
4521}
4522
4523static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4524{
4525 if (float16_is_any_nan(f)) {
4526 float_raise(float_flag_invalid, s);
4527 return 0;
4528 }
4529 return float16_to_int64_round_to_zero(f, s);
4530}
4531
4532static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4533{
4534 if (float32_is_any_nan(f)) {
4535 float_raise(float_flag_invalid, s);
4536 return 0;
4537 }
4538 return float32_to_int64_round_to_zero(f, s);
4539}
4540
4541static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4542{
4543 if (float64_is_any_nan(f)) {
4544 float_raise(float_flag_invalid, s);
4545 return 0;
4546 }
4547 return float64_to_int64_round_to_zero(f, s);
4548}
4549
4550static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4551{
4552 if (float16_is_any_nan(f)) {
4553 float_raise(float_flag_invalid, s);
4554 return 0;
4555 }
4556 return float16_to_uint16_round_to_zero(f, s);
4557}
4558
4559static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4560{
4561 if (float16_is_any_nan(f)) {
4562 float_raise(float_flag_invalid, s);
4563 return 0;
4564 }
4565 return float16_to_uint64_round_to_zero(f, s);
4566}
4567
4568static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4569{
4570 if (float32_is_any_nan(f)) {
4571 float_raise(float_flag_invalid, s);
4572 return 0;
4573 }
4574 return float32_to_uint64_round_to_zero(f, s);
4575}
4576
4577static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4578{
4579 if (float64_is_any_nan(f)) {
4580 float_raise(float_flag_invalid, s);
4581 return 0;
4582 }
4583 return float64_to_uint64_round_to_zero(f, s);
4584}
4585
/* FP precision conversions.  */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)

/* FP to signed integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)

/* FP to unsigned integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)

/* Round to integral FP value.  */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)

/* FRECPX and FSQRT.  */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)

/* Signed integer to FP.  */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)

/* Unsigned integer to FP.  */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4641
/*
 * FLOGB for float16: return the base-2 exponent of A as a signed
 * integer.  Infinity yields INT16_MAX; NaN and zero raise Invalid
 * and yield INT16_MIN.
 */
static int16_t do_float16_logb_as_int(float16 a, float_status *s)
{
    /* Extract the fraction field to the top of a uint32_t.  */
    uint32_t frac = (uint32_t)a << (16 + 6);
    int16_t exp = extract32(a, 10, 5);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: -bias - number of leading fraction zeros */
                return -15 - clz32(frac);
            }
            /* flush to zero: falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x1f)) {
        if (frac == 0) {
            /* infinity */
            return INT16_MAX;
        }
    } else {
        /* normal: unbiased exponent */
        return exp - 15;
    }

    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT16_MIN;
}
4669
/* FLOGB for float32; see do_float16_logb_as_int.  */
static int32_t do_float32_logb_as_int(float32 a, float_status *s)
{
    /* Extract the fraction field to the top of the word.  */
    uint32_t frac = a << 9;
    int32_t exp = extract32(a, 23, 8);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: -bias - number of leading fraction zeros */
                return -127 - clz32(frac);
            }
            /* flush to zero: falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0xff)) {
        if (frac == 0) {
            /* infinity */
            return INT32_MAX;
        }
    } else {
        /* normal: unbiased exponent */
        return exp - 127;
    }

    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT32_MIN;
}
4697
/* FLOGB for float64; see do_float16_logb_as_int.  */
static int64_t do_float64_logb_as_int(float64 a, float_status *s)
{
    /* Extract the fraction field to the top of the word.  */
    uint64_t frac = a << 12;
    int64_t exp = extract64(a, 52, 11);

    if (unlikely(exp == 0)) {
        if (frac != 0) {
            if (!get_flush_inputs_to_zero(s)) {
                /* denormal: -bias - number of leading fraction zeros */
                return -1023 - clz64(frac);
            }
            /* flush to zero: falls through to the zero case below */
            float_raise(float_flag_input_denormal, s);
        }
    } else if (unlikely(exp == 0x7ff)) {
        if (frac == 0) {
            /* infinity */
            return INT64_MAX;
        }
    } else {
        /* normal: unbiased exponent */
        return exp - 1023;
    }

    /* nan or zero */
    float_raise(float_flag_invalid, s);
    return INT64_MIN;
}
4725
/* Predicated FLOGB for each element size.  */
DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)

#undef DO_ZPZ_FP
4731
/*
 * Predicated FMLA family for float16: d = (n ^ neg1) * m + (a ^ neg3),
 * where NEG1/NEG3 are 0 or the sign-bit mask 0x8000, selecting the
 * FMLA/FMLS/FNMLA/FNMLS variants.  Top-down, merging walk.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4755
/* FMLA: d = n * m + a.  */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = (-n) * m + a.  */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: d = (-n) * m + (-a).  */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: d = n * m + (-a).  */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
4779
/* Predicated FMLA family for float32; see do_fmla_zpzzz_h.  */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4803
/* FMLA: d = n * m + a.  */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = (-n) * m + a.  */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: d = (-n) * m + (-a).  */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: d = n * m + (-a).  */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
4827
/* Predicated FMLA family for float64; see do_fmla_zpzzz_h.  */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
4851
/* FMLA: d = n * m + a.  INT64_MIN is the 64-bit sign-bit mask.  */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = (-n) * m + a.  */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: d = (-n) * m + (-a).  */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: d = n * m + (-a).  */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
4875
4876
4877
4878
4879
4880
/*
 * Predicated FP comparisons between two vectors, producing a predicate.
 * Walk the vectors from the top down; each 64-bit predicate result word
 * is accumulated in OUT (one result bit shifted in per element, at the
 * element's low byte position) and stored when the word completes.
 * Inactive elements produce 0 in the result.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                 \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                      \
    uint64_t *d = vd, *g = vg;                                            \
    do {                                                                  \
        uint64_t out = 0, pg = g[j];                                      \
        do {                                                              \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                      \
            if (likely((pg >> (i & 63)) & 1)) {                           \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                out |= OP(TYPE, nn, mm, status);                          \
            }                                                             \
        } while (i & 63);                                                 \
        d[j--] = out;                                                     \
    } while (i > 0);                                                      \
}
4900
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * GE/GT swap the operands so each maps onto a <= / < test of the
 * softfloat compare result; for all ordered predicates an unordered
 * (NaN) compare result yields false, while NE and UO yield true.
 * EQ/NE/UO use the quiet compare, which does not signal on quiet NaNs.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
/* FAC: compare absolute values (also with swapped operands).  */
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
4939
4940
4941
4942
/*
 * Predicated FP comparisons against zero, producing a predicate.
 * Same accumulation scheme as DO_FPCMP_PPZZ.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg,                           \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                      \
    uint64_t *d = vd, *g = vg;                                            \
    do {                                                                  \
        uint64_t out = 0, pg = g[j];                                      \
        do {                                                              \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                      \
            if ((pg >> (i & 63)) & 1) {                                   \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                out |= OP(TYPE, nn, 0, status);                           \
            }                                                             \
        } while (i & 63);                                                 \
        d[j--] = out;                                                     \
    } while (i > 0);                                                      \
}
4961
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* FP compare-with-zero, for each condition and element size.  */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4980
4981
4982
/*
 * FTMAD (half): trigonometric multiply-add step.
 * d = n * |m| + coeff[x], selecting the second half of the table
 * (x + 8) when m is negative.
 * NOTE(review): the two table rows appear to be the sine (x < 8) and
 * cosine (x >= 8) series coefficients per the SVE FTMAD definition —
 * confirm against the Arm ARM.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5002
/* FTMAD (single); see sve_ftmad_h.  */
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5024
/* FTMAD (double); see sve_ftmad_h.  */
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
5050
5051
5052
5053
5054
/*
 * FCADD (half): complex add with rotation.  Elements are processed in
 * (real, imag) pairs; simd_data selects the rotation, applied by
 * conditionally negating one operand of each add via a sign-bit XOR.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            /* Real and imag halves of the pair are predicated separately.  */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5086
5087void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5088 void *vs, uint32_t desc)
5089{
5090 intptr_t j, i = simd_oprsz(desc);
5091 uint64_t *g = vg;
5092 float32 neg_imag = float32_set_sign(0, simd_data(desc));
5093 float32 neg_real = float32_chs(neg_imag);
5094
5095 do {
5096 uint64_t pg = g[(i - 1) >> 6];
5097 do {
5098 float32 e0, e1, e2, e3;
5099
5100
5101 j = i - sizeof(float32);
5102 i -= 2 * sizeof(float32);
5103
5104 e0 = *(float32 *)(vn + H1_2(i));
5105 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5106 e2 = *(float32 *)(vn + H1_2(j));
5107 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5108
5109 if (likely((pg >> (i & 63)) & 1)) {
5110 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5111 }
5112 if (likely((pg >> (j & 63)) & 1)) {
5113 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5114 }
5115 } while (i & 63);
5116 } while (i != 0);
5117}
5118
/*
 * FP complex add, double precision, predicated.  See sve_fcadd_h for
 * the pairwise iteration and rotation scheme.
 * NOTE(review): the H1_2() host-order macro is used for 64-bit
 * elements here (a no-op on little-endian hosts) -- confirm against
 * the H macro definitions that this is the intended variant.
 */
void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float64 neg_imag = float64_set_sign(0, simd_data(desc));
    float64 neg_real = float64_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e0, e1, e2, e3;

            /* i indexes the real lane, j the imaginary lane. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            e0 = *(float64 *)(vn + H1_2(i));
            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float64 *)(vn + H1_2(j));
            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;

            /* Each lane is written only if its predicate bit is set. */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
5150
5151
5152
5153
5154
/*
 * FP complex multiply-add, half precision, predicated.
 * rot (simd_data) selects one of four 90-degree rotations:
 * bit 0 ("flip") chooses which n lane feeds both products, rot & 2
 * sets neg_imag, and rot == 1 or 2 sets neg_real; neg_* are sign
 * bits XORed into the m multiplicands.
 */
void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i indexes the real lane, j the imaginary lane. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            /* e2 * e1 accumulates into the real lane, e4 * e3 into imag. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each lane is updated only if its predicate bit is set. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5199
/*
 * FP complex multiply-add, single precision, predicated.
 * See sve_fcmla_zpzzz_h for the rotation encoding.
 */
void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float32 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float32_set_sign(0, (rot & 2) != 0);
    neg_real = float32_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i indexes the real lane, j the imaginary lane. */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            nr = *(float32 *)(vn + H1_2(i));
            ni = *(float32 *)(vn + H1_2(j));
            mr = *(float32 *)(vm + H1_2(i));
            mi = *(float32 *)(vm + H1_2(j));

            /* e2 * e1 accumulates into the real lane, e4 * e3 into imag. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each lane is updated only if its predicate bit is set. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(i));
                d = float32_muladd(e2, e1, d, 0, status);
                *(float32 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(j));
                d = float32_muladd(e4, e3, d, 0, status);
                *(float32 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5244
/*
 * FP complex multiply-add, double precision, predicated.
 * See sve_fcmla_zpzzz_h for the rotation encoding.
 */
void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float64 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float64_set_sign(0, (rot & 2) != 0);
    neg_real = float64_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i indexes the real lane, j the imaginary lane. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            nr = *(float64 *)(vn + H1_2(i));
            ni = *(float64 *)(vn + H1_2(j));
            mr = *(float64 *)(vm + H1_2(i));
            mi = *(float64 *)(vm + H1_2(j));

            /* e2 * e1 accumulates into the real lane, e4 * e3 into imag. */
            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Each lane is updated only if its predicate bit is set. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(i));
                d = float64_muladd(e2, e1, d, 0, status);
                *(float64 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(j));
                d = float64_muladd(e4, e3, d, 0, status);
                *(float64 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5300 intptr_t reg_max, int esz)
5301{
5302 uint64_t pg_mask = pred_esz_masks[esz];
5303 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5304
5305
5306 if (likely(pg & 1)) {
5307 return reg_off;
5308 }
5309
5310 if (pg == 0) {
5311 reg_off &= -64;
5312 do {
5313 reg_off += 64;
5314 if (unlikely(reg_off >= reg_max)) {
5315
5316 return reg_max;
5317 }
5318 pg = vg[reg_off >> 6] & pg_mask;
5319 } while (pg == 0);
5320 }
5321 reg_off += ctz64(pg);
5322
5323
5324 tcg_debug_assert(reg_off < reg_max);
5325 return reg_off;
5326}
5327
5328
5329
5330
5331
5332
5333
/*
 * Probe the page containing addr + mem_off for the given access.
 * On success, fill in *info and return true.  info->host is adjusted
 * back by mem_off, so that info->host + mem_off is the probed host
 * address (and info->host corresponds to the original addr).
 * Returns false only when nofault is set and the page is invalid.
 */
bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
                    target_ulong addr, int mem_off, MMUAccessType access_type,
                    int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * User-only: strip any tag/top bits from the address before the
     * probe (useronly_clean_ptr); system mode leaves the address as-is.
     */
    addr = useronly_clean_ptr(addr);

#ifdef CONFIG_USER_ONLY
    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    memset(&info->attrs, 0, sizeof(info->attrs));
    /* MTE-tagged only for anonymous mappings with PROT_MTE set. */
    info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
#else
    CPUTLBEntryFull *full;
    flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
                              &info->host, &full, retaddr);
    info->attrs = full->attrs;
    /* 0xf0 appears to be the Tagged Normal MemAttr -- see MAIR. */
    info->tagged = full->pte_attrs == 0xf0;
#endif
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* A fault-taking probe would have longjmp'd out above. */
        g_assert(nofault);
        return false;
    }

    /* Make the host address relative to the original, un-offset addr. */
    info->host -= mem_off;
    return true;
}
5378
5379
5380
5381
5382
5383
/*
 * Analyze the predicate for a contiguous load/store: find the first
 * and last active elements, and determine how the access splits
 * across the page boundary at addr.  Fills in the offset fields of
 * *info (page probing is done separately).  Returns false if no
 * element is active.
 */
bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
                            intptr_t reg_max, int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the page[] info to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* The entire predicate was false. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* page_split = number of bytes remaining on the first page. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * The last element wholly on the first page, if any.
     * (If elt_split == 0, no whole element fits on the first page
     * and reg_off_last[0] stays -1.)
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the page boundary. */
    if (page_split % msize != 0) {
        /* It is only a split element if it is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The split element is the last active element. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do know there must be an active element past the split;
     * find the first one wholly on the second page.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
5469
5470
5471
5472
5473
5474
/*
 * Probe the page(s) used by a contiguous load/store whose element
 * layout has been resolved by sve_cont_ldst_elements.  Returns false
 * when there is no work to do (a no-fault probe failed); the "fault"
 * mode controls whether probes may raise faults now.
 */
bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                         CPUARMState *env, target_ulong addr,
                         MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No-fault probe of the first page failed: nothing to do. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation is within the one page. */
        return true;
    }

    /* The operation crosses onto a second page. */
    if (info->mem_off_split >= 0) {
        /*
         * An element is split across the pages; probe the second page
         * starting at its first byte (page_split).
         */
        mem_off = info->page_split;

        /*
         * If an active element precedes the split element, the second
         * page is probed without taking a fault now.
         * NOTE(review): this assigns the enum constant FAULT_FIRST
         * (non-zero) to a bool, i.e. nofault = true regardless of the
         * requested fault mode -- confirm this is intended rather
         * than e.g. (fault == FAULT_FIRST).
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * No element crosses the boundary: probe at the first element
         * on the second page.  Only FAULT_ALL faults eagerly here.
         */
        mem_off = info->mem_off_first[1];
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
5532
5533#ifndef CONFIG_USER_ONLY
/*
 * Check watchpoints for every active element of a contiguous
 * load/store.  The TLB_WATCHPOINT bits are cleared from the cached
 * page flags once handled, so subsequent slow-path decisions are not
 * triggered by watchpoints alone.
 */
void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                               uint64_t *vg, target_ulong addr,
                               int esize, int msize, int wp_access,
                               uintptr_t retaddr)
{
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* The split element, if any, is checked unconditionally. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5595#endif
5596
5597void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5598 uint64_t *vg, target_ulong addr, int esize,
5599 int msize, uint32_t mtedesc, uintptr_t ra)
5600{
5601 intptr_t mem_off, reg_off, reg_last;
5602
5603
5604 if (info->page[0].tagged) {
5605 mem_off = info->mem_off_first[0];
5606 reg_off = info->reg_off_first[0];
5607 reg_last = info->reg_off_split;
5608 if (reg_last < 0) {
5609 reg_last = info->reg_off_last[0];
5610 }
5611
5612 do {
5613 uint64_t pg = vg[reg_off >> 6];
5614 do {
5615 if ((pg >> (reg_off & 63)) & 1) {
5616 mte_check(env, mtedesc, addr, ra);
5617 }
5618 reg_off += esize;
5619 mem_off += msize;
5620 } while (reg_off <= reg_last && (reg_off & 63));
5621 } while (reg_off <= reg_last);
5622 }
5623
5624 mem_off = info->mem_off_first[1];
5625 if (mem_off >= 0 && info->page[1].tagged) {
5626 reg_off = info->reg_off_first[1];
5627 reg_last = info->reg_off_last[1];
5628
5629 do {
5630 uint64_t pg = vg[reg_off >> 6];
5631 do {
5632 if ((pg >> (reg_off & 63)) & 1) {
5633 mte_check(env, mtedesc, addr, ra);
5634 }
5635 reg_off += esize;
5636 mem_off += msize;
5637 } while (reg_off & 63);
5638 } while (reg_off <= reg_last);
5639 }
5640}
5641
5642
5643
5644
/*
 * Common helper for all contiguous 1,2,3,4-register predicated
 * stepping loads.  The N destination registers are interleaved
 * element-by-element in memory.  Faults, watchpoints and MTE tag
 * checks are resolved for all elements before any register is
 * modified, so a fault leaves the registers unchanged.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; the destinations are zeroed. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s); with FAULT_ALL, any fault is raised here. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /* Handle MTE tag checks for all active elements. */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        /* User-only pages never carry TLB slow-path flags here. */
        g_assert_not_reached();
#else
        /*
         * At least one page requires slow-path access (e.g. MMIO):
         * perform the whole load element-by-element via the tlb
         * functions, into a scratch buffer so that a fault partway
         * through leaves the architectural registers untouched.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /* Last active element: page 1 if used, else split, else page 0. */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        /* Commit the scratch contents to the destination registers. */
        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /*
     * Fast path: direct host memory access.  Zero the destinations
     * first so that inactive elements read back as zero.
     */
    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    /* Elements wholly on the first page. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element split across the page boundary cannot be accessed
     * via a single host pointer; use the tlb functions for it.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements wholly on the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5787
5788static inline QEMU_ALWAYS_INLINE
5789void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5790 uint32_t desc, const uintptr_t ra,
5791 const int esz, const int msz, const int N,
5792 sve_ldst1_host_fn *host_fn,
5793 sve_ldst1_tlb_fn *tlb_fn)
5794{
5795 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5796 int bit55 = extract64(addr, 55, 1);
5797
5798
5799 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5800
5801
5802 if (!tbi_check(desc, bit55) ||
5803 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5804 mtedesc = 0;
5805 }
5806
5807 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5808}
5809
/*
 * Expand the predicated single-register contiguous load helpers for a
 * one-byte memory element (no endian variants), extended to an
 * ESZ-sized register element; plain and MTE-checked forms.
 */
#define DO_LD1_1(NAME, ESZ) \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
                            target_ulong addr, uint32_t desc)  \
{                                                              \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,   \
              sve_##NAME##_host, sve_##NAME##_tlb);            \
}                                                              \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,    \
                                target_ulong addr, uint32_t desc) \
{                                                              \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,  \
                  sve_##NAME##_host, sve_##NAME##_tlb);        \
}
5823
/*
 * Expand the predicated single-register contiguous load helpers for a
 * multi-byte memory element (MSZ), extended to an ESZ-sized register
 * element; little- and big-endian, plain and MTE-checked forms.
 */
#define DO_LD1_2(NAME, ESZ, MSZ) \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
                               target_ulong addr, uint32_t desc)  \
{                                                                 \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,       \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
}                                                                 \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);     \
}                                                                 \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,    \
                                   target_ulong addr, uint32_t desc) \
{                                                                 \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,      \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);     \
}
5849
/*
 * Instantiate the LD1 helpers for every element/memory size
 * combination: byte memory elements via DO_LD1_1, wider memory
 * elements (with endian variants) via DO_LD1_2.
 */
DO_LD1_1(ld1bb, MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh, MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss, MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd, MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
5872
/*
 * Expand the N-register interleaved byte-load helpers (LD2B/LD3B/LD4B),
 * plain and MTE-checked forms.
 */
#define DO_LDN_1(N) \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,         \
                             target_ulong addr, uint32_t desc)   \
{                                                                \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,    \
              sve_ld1bb_host, sve_ld1bb_tlb);                    \
}                                                                \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,     \
                                 target_ulong addr, uint32_t desc) \
{                                                                \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,   \
                  sve_ld1bb_host, sve_ld1bb_tlb);                \
}
5886
/*
 * Expand the N-register interleaved load helpers for wider elements
 * (element size == memory size); little- and big-endian, plain and
 * MTE-checked forms.
 */
#define DO_LDN_2(N, SUFF, ESZ) \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,       \
                                    target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,           \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);       \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);   \
}                                                                     \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,   \
                                        target_ulong addr, uint32_t desc) \
{                                                                     \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,          \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);   \
}
5912
/* Instantiate the LD2/LD3/LD4 helpers for each element size. */
DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5951{
5952 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5953
5954 if (i & 63) {
5955 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5956 i = ROUND_UP(i, 64);
5957 }
5958 for (; i < oprsz; i += 64) {
5959 ffr[i / 64] = 0;
5960 }
5961}
5962
5963
5964
5965
/*
 * Common helper for all first-fault (FAULT_FIRST) and no-fault
 * (FAULT_NO) contiguous loads.  The first active element is handled
 * with the requested fault semantics; for every subsequent element,
 * any condition that would fault instead stops the load and is
 * recorded in the FFR via record_fault.  Inactive and not-loaded
 * elements are zeroed.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element (no-fault probe failed). */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /* If the first page is not tagged, no MTE checks apply to it. */
    if (!info.page[0].tagged) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping MTE check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element, when it
         * crosses the page boundary or requires slow-path access.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /* Load via the tlb function; this may fault (first-fault). */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* Zero everything around the element just loaded. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses the page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some element is MMIO; decline without faulting. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* A read watchpoint would fire; decline. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                /* MTE tag mismatch; decline. */
                goto do_fault;
            }

            /* All checks passed: the tlb access cannot fault now. */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From here on, all remaining first-page elements are handled
     * non-faulting: any would-be fault declines the rest of the load.
     * An MMIO page cannot be probed without side effects; decline.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * A cross-page element anywhere except first is simply declined
     * and recorded in FFR -- presumably an implementation choice,
     * as non-fault accesses are allowed to fail for any reason.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * Any active element on the second page (past the one already
     * loaded above) is declined and recorded in FFR.
     */
 do_fault:
    record_fault(env, reg_off, reg_max);
}
6143
6144static inline QEMU_ALWAYS_INLINE
6145void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6146 uint32_t desc, const uintptr_t retaddr,
6147 const int esz, const int msz, const SVEContFault fault,
6148 sve_ldst1_host_fn *host_fn,
6149 sve_ldst1_tlb_fn *tlb_fn)
6150{
6151 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6152 int bit55 = extract64(addr, 55, 1);
6153
6154
6155 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6156
6157
6158 if (!tbi_check(desc, bit55) ||
6159 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6160 mtedesc = 0;
6161 }
6162
6163 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6164 esz, msz, fault, host_fn, tlb_fn);
6165}
6166
/*
 * Expand the first-fault (ldff1) and no-fault (ldnf1) helpers for a
 * one-byte memory element, extended to ESZ; plain and MTE-checked.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,              \
                                 target_ulong addr, uint32_t desc)        \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);             \
}                                                                         \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,              \
                                 target_ulong addr, uint32_t desc)        \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO,   \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);             \
}                                                                         \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,          \
                                     target_ulong addr, uint32_t desc)    \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);         \
}                                                                         \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,          \
                                     target_ulong addr, uint32_t desc)    \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO,  \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);         \
}
6192
/*
 * Expand the first-fault (ldff1) and no-fault (ldnf1) helpers for a
 * multi-byte memory element (MSZ) extended to ESZ; little- and
 * big-endian, plain and MTE-checked forms.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);       \
}                                                                         \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,    \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);       \
}                                                                         \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);       \
}                                                                         \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,    \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);       \
}                                                                         \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);   \
}                                                                         \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,   \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);   \
}                                                                         \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);   \
}                                                                         \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,   \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);   \
}
6242
/* Instantiate the LDFF1/LDNF1 helpers for every size combination. */
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
6265
6266
6267
6268
6269
/*
 * Common helper for all contiguous 1,2,3,4-register predicated
 * stepping stores.  The N source registers are interleaved
 * element-by-element in memory.  Faults, watchpoints and MTE tag
 * checks are resolved for all active elements before any memory
 * is modified.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s); with FAULT_ALL, any fault is raised here. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /* Handle MTE tag checks for all active elements. */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        /* User-only pages never carry TLB slow-path flags here. */
        g_assert_not_reached();
#else
        /*
         * At least one page requires slow-path access (e.g. MMIO):
         * perform the whole store element-by-element via the tlb
         * functions.  All checks above have passed, so the stores
         * themselves cannot fault.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        /* Last active element: page 1 if used, else split, else page 0. */
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: elements wholly on the first page, via host access. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element split across the page boundary cannot be accessed
     * via a single host pointer; use the tlb functions for it.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements wholly on the second page. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
6397
6398static inline QEMU_ALWAYS_INLINE
6399void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6400 uint32_t desc, const uintptr_t ra,
6401 const int esz, const int msz, const int N,
6402 sve_ldst1_host_fn *host_fn,
6403 sve_ldst1_tlb_fn *tlb_fn)
6404{
6405 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6406 int bit55 = extract64(addr, 55, 1);
6407
6408
6409 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6410
6411
6412 if (!tbi_check(desc, bit55) ||
6413 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6414 mtedesc = 0;
6415 }
6416
6417 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6418}
6419
/*
 * Define ST1..ST4 contiguous-store helpers whose memory element is a
 * single byte (ESZ gives the register element size), so no endian
 * variants are needed.  The _mte form routes through sve_stN_r_mte,
 * which splits out and validates the MTE descriptor.
 */
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}
6433
/*
 * Define ST1..ST4 contiguous-store helpers for multi-byte memory
 * elements (MSZ), in both little- and big-endian forms, each with a
 * plain and an MTE-checked (_mte) variant.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
6459
/*
 * Instantiate all ST1..ST4 helpers: DO_STN_1 for byte memory elements
 * (no endian variants), DO_STN_2 for 2/4/8-byte memory elements.
 */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
6488
6489
6490
6491
6492
6493
6494
6495
/*
 * Function signature for reading one vector-register element at byte
 * offset reg_ofs and returning it (zero- or sign-extended) as the
 * address offset for a gather/scatter access.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6497
6498static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6499{
6500 return *(uint32_t *)(reg + H1_4(reg_ofs));
6501}
6502
6503static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6504{
6505 return *(int32_t *)(reg + H1_4(reg_ofs));
6506}
6507
6508static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6509{
6510 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6511}
6512
6513static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6514{
6515 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6516}
6517
6518static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6519{
6520 return *(uint64_t *)(reg + reg_ofs);
6521}
6522
/*
 * Common helper for all gather (vector-offset) loads.  ESIZE is the
 * register element size in bytes, MSIZE the memory element size.
 * MTEDESC == 0 disables tag checking.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    /*
     * Accumulate into a scratch register so that a fault part-way
     * through leaves the architectural destination untouched; the
     * memset also zeroes all inactive elements.
     */
    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Bytes remaining before the end of the guest page. */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    /* Element contained entirely within one page. */
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    if (unlikely(info.flags & TLB_MMIO)) {
                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
                    } else {
                        host_fn(&scratch, reg_off, info.host);
                    }
                } else {
                    /* Element crosses the page boundary: probe both pages. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && info.tagged) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    /* Fall back to the slow path for the split access. */
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
6586
6587static inline QEMU_ALWAYS_INLINE
6588void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6589 target_ulong base, uint32_t desc, uintptr_t retaddr,
6590 int esize, int msize, zreg_off_fn *off_fn,
6591 sve_ldst1_host_fn *host_fn,
6592 sve_ldst1_tlb_fn *tlb_fn)
6593{
6594 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6595
6596 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6597
6598
6599
6600
6601
6602
6603
6604 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6605 esize, msize, off_fn, host_fn, tlb_fn);
6606}
6607
/*
 * Gather-load helpers with 32-bit register elements (esize 4) and a
 * memory element of 1 << MSZ bytes; offsets extracted by off_##OFS##_s.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6621
/*
 * Gather-load helpers with 64-bit register elements (esize 8) and a
 * memory element of 1 << MSZ bytes; offsets extracted by off_##OFS##_d.
 */
#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6635
/*
 * Instantiate gather-load helpers for every combination of memory
 * element (size, signedness, endianness) and offset extraction mode.
 */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
6702
6703
6704
6705
6706
6707
6708
/*
 * Common helper for all gather first-fault loads: the first active
 * element is loaded normally (faults are taken), every later element
 * is probed non-faulting; any would-be fault instead terminates the
 * load via record_fault.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first active element. */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Load the first element normally; any fault here (including an
     * MTE tag-check failure) is taken as for a non-faulting load.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Remaining elements are first-faulting: any condition that would
     * fault merely stops the load at this element instead.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                /* Non-faulting probe; stop on invalid or MMIO pages. */
                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                /* Stop rather than trigger a read watchpoint. */
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                /* Stop on an MTE tag mismatch instead of faulting. */
                if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
6787
6788static inline QEMU_ALWAYS_INLINE
6789void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6790 target_ulong base, uint32_t desc, uintptr_t retaddr,
6791 const int esz, const int msz,
6792 zreg_off_fn *off_fn,
6793 sve_ldst1_host_fn *host_fn,
6794 sve_ldst1_tlb_fn *tlb_fn)
6795{
6796 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6797
6798 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6799
6800
6801
6802
6803
6804
6805
6806 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6807 esz, msz, off_fn, host_fn, tlb_fn);
6808}
6809
/*
 * First-fault gather-load helpers with 32-bit register elements.
 * Note sve_ldff1_z takes log2 sizes (esz/msz), unlike sve_ld1_z.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6825
/*
 * First-fault gather-load helpers with 64-bit register elements.
 * Note sve_ldff1_z takes log2 sizes (esz/msz), unlike sve_ld1_z.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6841
/*
 * Instantiate first-fault gather-load helpers for every combination of
 * memory element (size, signedness, endianness) and offset mode.
 */
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6905
6906
6907
/*
 * Common helper for all scatter (vector-offset) stores.  Uses a
 * two-pass protocol: probe every active element first (raising any
 * faults, watchpoints and MTE errors), then perform all stores, so
 * that a faulting scatter leaves memory unmodified.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    /* One host pointer per element; NULL means "use the slow path". */
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Pass 1: probe all active elements.  Record the host address of
     * each directly-accessible element; MMIO and page-crossing
     * elements keep host[i] == NULL.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            /* Bytes remaining before the end of the guest page. */
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    if (!(info.flags & TLB_MMIO)) {
                        host[i] = info.host;
                    }
                } else {
                    /*
                     * Element crosses the page boundary: probe both
                     * pages, merging the flags; the store itself is
                     * deferred to tlb_fn in pass 2.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && info.tagged) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Pass 2: every check has passed, so no fault can occur now.
     * Store via the recorded host address where available, otherwise
     * fall back to tlb_fn for MMIO or page-crossing elements.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
6990
6991static inline QEMU_ALWAYS_INLINE
6992void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6993 target_ulong base, uint32_t desc, uintptr_t retaddr,
6994 int esize, int msize, zreg_off_fn *off_fn,
6995 sve_ldst1_host_fn *host_fn,
6996 sve_ldst1_tlb_fn *tlb_fn)
6997{
6998 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6999
7000 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7001
7002
7003
7004
7005
7006
7007
7008 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7009 esize, msize, off_fn, host_fn, tlb_fn);
7010}
7011
/*
 * Scatter-store helpers with 32-bit register elements (esize 4) and a
 * memory element of 1 << MSZ bytes; offsets extracted by off_##OFS##_s.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7025
/*
 * Scatter-store helpers with 64-bit register elements (esize 8) and a
 * memory element of 1 << MSZ bytes; offsets extracted by off_##OFS##_d.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7039
/*
 * Instantiate scatter-store helpers for every combination of memory
 * element (size, endianness) and offset extraction mode.
 */
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
7078
7079void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7080{
7081 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7082 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7083
7084 for (i = 0; i < opr_sz; ++i) {
7085 d[i] = n[i] ^ m[i] ^ k[i];
7086 }
7087}
7088
7089void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7090{
7091 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7092 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7093
7094 for (i = 0; i < opr_sz; ++i) {
7095 d[i] = n[i] ^ (m[i] & ~k[i]);
7096 }
7097}
7098
7099void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7100{
7101 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7102 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7103
7104 for (i = 0; i < opr_sz; ++i) {
7105 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7106 }
7107}
7108
7109void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7110{
7111 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7112 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7113
7114 for (i = 0; i < opr_sz; ++i) {
7115 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7116 }
7117}
7118
7119void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7120{
7121 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7122 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7123
7124 for (i = 0; i < opr_sz; ++i) {
7125 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7126 }
7127}
7128
7129
7130
7131
7132
7133
/*
 * Returns true if N (replicated across every lane of size 1 << esz)
 * equals any lane of M0 or M1.  Uses the classic SWAR zero-lane
 * detector: (x - 0x01..01) & ~x is nonzero in the sign-bit positions
 * iff some lane of x is zero.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;   /* lane is zero iff the m0 lane equals n */
    cmp1 = cmp1 ^ m1;   /* lane is zero iff the m1 lane equals n */
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
7148
/*
 * Common helper for SVE2 MATCH/NMATCH: for each active element of VN,
 * test whether it appears anywhere in the corresponding 16-byte
 * segment of VM, writing a predicate result to VD (inverted when
 * NMATCH) and accumulating NZCV flags across the whole vector.
 */
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    /* One 16-byte segment (one uint16 of predicate) per iteration. */
    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            /* Step k by the element size in bytes. */
            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
7178
/* Expand MATCH (INV=false) and NMATCH (INV=true) for byte/half elements. */
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                   \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                    \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
7192
7193void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7194 uint32_t desc)
7195{
7196 ARMVectorReg scratch;
7197 intptr_t i, j;
7198 intptr_t opr_sz = simd_oprsz(desc);
7199 uint32_t *d = vd, *n = vn, *m = vm;
7200 uint8_t *pg = vg;
7201
7202 if (d == n) {
7203 n = memcpy(&scratch, n, opr_sz);
7204 if (d == m) {
7205 m = n;
7206 }
7207 } else if (d == m) {
7208 m = memcpy(&scratch, m, opr_sz);
7209 }
7210
7211 for (i = 0; i < opr_sz; i += 4) {
7212 uint64_t count = 0;
7213 uint8_t pred;
7214
7215 pred = pg[H1(i >> 3)] >> (i & 7);
7216 if (pred & 1) {
7217 uint32_t nn = n[H4(i >> 2)];
7218
7219 for (j = 0; j <= i; j += 4) {
7220 pred = pg[H1(j >> 3)] >> (j & 7);
7221 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7222 ++count;
7223 }
7224 }
7225 }
7226 d[H4(i >> 2)] = count;
7227 }
7228}
7229
7230void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7231 uint32_t desc)
7232{
7233 ARMVectorReg scratch;
7234 intptr_t i, j;
7235 intptr_t opr_sz = simd_oprsz(desc);
7236 uint64_t *d = vd, *n = vn, *m = vm;
7237 uint8_t *pg = vg;
7238
7239 if (d == n) {
7240 n = memcpy(&scratch, n, opr_sz);
7241 if (d == m) {
7242 m = n;
7243 }
7244 } else if (d == m) {
7245 m = memcpy(&scratch, m, opr_sz);
7246 }
7247
7248 for (i = 0; i < opr_sz / 8; ++i) {
7249 uint64_t count = 0;
7250 if (pg[H1(i)] & 1) {
7251 uint64_t nn = n[i];
7252 for (j = 0; j <= i; ++j) {
7253 if ((pg[H1(j)] & 1) && nn == m[j]) {
7254 ++count;
7255 }
7256 }
7257 }
7258 d[i] = count;
7259 }
7260}
7261
7262
7263
7264
7265
7266
7267static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7268{
7269 const uint64_t mask = dup_const(MO_8, 0x7f);
7270 uint64_t cmp0, cmp1;
7271
7272 cmp1 = dup_const(MO_8, n);
7273 cmp0 = cmp1 ^ m0;
7274 cmp1 = cmp1 ^ m1;
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7285 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296 return ctpop64(cmp0 | (cmp1 >> 1));
7297}
7298
7299void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7300{
7301 intptr_t i, j;
7302 intptr_t opr_sz = simd_oprsz(desc);
7303
7304 for (i = 0; i < opr_sz; i += 16) {
7305 uint64_t n0 = *(uint64_t *)(vn + i);
7306 uint64_t m0 = *(uint64_t *)(vm + i);
7307 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7308 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7309 uint64_t out0 = 0;
7310 uint64_t out1 = 0;
7311
7312 for (j = 0; j < 64; j += 8) {
7313 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7314 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7315 out0 |= cnt0 << j;
7316 out1 |= cnt1 << j;
7317 }
7318
7319 *(uint64_t *)(vd + i) = out0;
7320 *(uint64_t *)(vd + i + 8) = out1;
7321 }
7322}
7323
7324void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7325{
7326 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7327 int shr = simd_data(desc);
7328 int shl = 8 - shr;
7329 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7330 uint64_t *d = vd, *n = vn, *m = vm;
7331
7332 for (i = 0; i < opr_sz; ++i) {
7333 uint64_t t = n[i] ^ m[i];
7334 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7335 }
7336}
7337
7338void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7339{
7340 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7341 int shr = simd_data(desc);
7342 int shl = 16 - shr;
7343 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7344 uint64_t *d = vd, *n = vn, *m = vm;
7345
7346 for (i = 0; i < opr_sz; ++i) {
7347 uint64_t t = n[i] ^ m[i];
7348 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7349 }
7350}
7351
7352void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7353{
7354 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7355 int shr = simd_data(desc);
7356 uint32_t *d = vd, *n = vn, *m = vm;
7357
7358 for (i = 0; i < opr_sz; ++i) {
7359 d[i] = ror32(n[i] ^ m[i], shr);
7360 }
7361}
7362
/*
 * FMMLA (single precision): for each 128-bit segment, treat N, M, A, D
 * as row-major 2x2 matrices ([0]=r0c0 [1]=r0c1 [2]=r1c0 [3]=r1c1) and
 * compute D = A + N * M^T.  Each product pair is computed with separate
 * multiply and add (no fused multiply-add), summed, then added to the
 * accumulator.
 */
void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* result row 0, column 0: row 0 of N dot row 0 of M */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* result row 0, column 1: row 0 of N dot row 1 of M */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* result row 1, column 0: row 1 of N dot row 0 of M */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* result row 1, column 1: row 1 of N dot row 1 of M */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}
7400
/*
 * FMMLA (double precision): for each 256-bit segment, treat N, M, A, D
 * as row-major 2x2 matrices and compute D = A + N * M^T, with the same
 * separate multiply/add sequencing as the single-precision form.
 */
void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     void *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* result row 0, column 0: row 0 of N dot row 0 of M */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* result row 0, column 1: row 0 of N dot row 1 of M */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* result row 1, column 0: row 1 of N dot row 0 of M */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* result row 1, column 1: row 1 of N dot row 1 of M */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
7436
/*
 * Narrowing convert-and-insert-top: for each predicated wide element
 * of VN, convert with OP and store the narrow result into the upper
 * half of the corresponding wide slot of VD; the lower halves of VD
 * are left untouched.  The walk runs from the top of the vector down
 * to element 0.
 */
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                       \
    intptr_t i = simd_oprsz(desc);                                      \
    uint64_t *g = vg;                                                   \
    do {                                                                \
        uint64_t pg = g[(i - 1) >> 6];                                  \
        do {                                                            \
            i -= sizeof(TYPEW);                                         \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                      \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
            }                                                           \
        } while (i & 63);                                               \
    } while (i != 0);                                                   \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7457
/*
 * Widening convert-from-top: for each predicated wide slot, read the
 * narrow value from the upper half of that slot in VN, convert with
 * OP, and store the wide result into VD.  The walk runs from the top
 * of the vector down to element 0.
 */
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                       \
    intptr_t i = simd_oprsz(desc);                                      \
    uint64_t *g = vg;                                                   \
    do {                                                                \
        uint64_t pg = g[(i - 1) >> 6];                                  \
        do {                                                            \
            i -= sizeof(TYPEW);                                         \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));      \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                \
            }                                                           \
        } while (i & 63);                                               \
    } while (i != 0);                                                   \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
7480