1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
27#include "fpu/softfloat.h"
28#include "tcg/tcg.h"
29
30
31
32
/*
 * Vector data is stored as host-endian 64-bit chunks.  The H<N> macros
 * convert a little-endian element index of the given byte size into the
 * host index within a 64-bit chunk: on big-endian hosts XOR flips the
 * element position inside the chunk; on little-endian hosts the index
 * is used unchanged.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)     /* 1-byte elements */
#define H1_2(x) ((x) ^ 6)   /* 2-byte elements, byte-addressed */
#define H1_4(x) ((x) ^ 4)   /* 4-byte elements, byte-addressed */
#define H2(x) ((x) ^ 3)     /* index of a 16-bit unit */
#define H4(x) ((x) ^ 1)     /* index of a 32-bit unit */
#else
#define H1(x) (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif
46
47
48
49
50
51
52
53
54
/* Initial value for the packed predtest flags: for no active (G) bits,
 * only C (bit 0) is set.  Bit 2 is used internally as a "first active
 * word seen" marker; N lives in bit 31, the Z accumulator in bit 1.
 */
#define PREDTEST_INIT 1

/* One iteration step of PTEST flag computation, called for each
 * 64-bit word of Pd (d) and Pg (g), moving forward through the
 * predicate.  Returns the updated packed flags.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from the first active (lowest G) bit of D.
         * Use bit 2 to record that the first active word was seen.
         */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G: any active D bit clears Z.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from the last (highest G) active bit of D being
         * clear; this replaces the value from any previous word.
         */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
78
79
80
81
/* One iteration step of PTEST flag computation when walking the
 * predicate words backward (from the highest word to the lowest).
 * Same packed flag format as iter_predtest_fwd.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from the last (highest) active bit of D.  The
         * += (4 - 1) converts PREDTEST_INIT (0b001) into the "seen"
         * marker (0b100) while clearing the provisional C bit.
         */
        if (!(flags & 4)) {
            flags += 4 - 1;
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G: any active D bit clears Z.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from the first (lowest) active bit of D; the last
         * word processed (word 0) provides the final value.
         */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
100
101
/* Test a single 64-bit predicate word D against its governing
 * predicate G and return the packed NZCV flags.
 */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
106
107
/* Test WORDS 64-bit predicate words of VD against VG, moving forward,
 * accumulating the packed flags across the whole predicate.
 * WORDS is at least 1 (the do/while consumes one word unconditionally).
 */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
120
121
122
123
124
125
126
127
128
129
130
131
/* Expand an 8-bit predicate mask into a 64-bit byte mask: bit I of
 * BYTE selects whether byte I of the result is 0xff or 0x00.
 * Implemented as a 256-entry lookup table, favoring speed in this
 * hot path over table size.
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/* Expand the even predicate bits (0, 2, 4, 6) of BYTE into four 16-bit
 * lanes of all-ones or all-zeros; odd bits are ignored, matching the
 * predicate layout for 2-byte elements.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    uint64_t result = 0;

    if (byte & 0x01) {
        result |= 0x000000000000ffffull;
    }
    if (byte & 0x04) {
        result |= 0x00000000ffff0000ull;
    }
    if (byte & 0x10) {
        result |= 0x0000ffff00000000ull;
    }
    if (byte & 0x40) {
        result |= 0xffff000000000000ull;
    }
    return result;
}
253
254
/* Expand predicate bits 0 and 4 of BYTE into two 32-bit lanes of
 * all-ones or all-zeros; other bits are ignored, matching the
 * predicate layout for 4-byte elements.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t result = 0;

    if (byte & 0x01) {
        result |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        result |= 0xffffffff00000000ull;
    }
    return result;
}
264
265
/* Swap the two 16-bit halfwords within a 32-bit value.  */
static inline uint32_t hswap32(uint32_t h)
{
    return (h >> 16) | (h << 16);
}
270
271
/* Reverse the order of the four 16-bit halfwords within a 64-bit
 * value (bytes within each halfword keep their order).
 */
static inline uint64_t hswap64(uint64_t h)
{
    return ((h & 0x000000000000ffffull) << 48)
         | ((h & 0x00000000ffff0000ull) << 16)
         | ((h & 0x0000ffff00000000ull) >> 16)
         | (h >> 48);
}
278
279
/* Swap the two 32-bit words within a 64-bit value.  */
static inline uint64_t wswap64(uint64_t h)
{
    return (h << 32) | (h >> 32);
}
284
/*
 * Expand a helper for each predicate logical operation.  D, N, M and
 * G are all predicates, processed as whole 64-bit words; G is the
 * governing predicate.
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

/* Per-word operations.  The result is masked by G, except for SEL,
 * which selects N where G is set and M elsewhere.
 */
#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
323
324
325
326
327
328
329
330
331
/*
 * Fully general two-operand predicated expander for 1/2/4-byte
 * elements: for each element whose governing predicate bit is set,
 * store OP(n, m) into d; inactive elements are left unchanged.
 * The predicate is consumed 16 bits at a time (one bit per byte of
 * vector data), and each element advances the bit cursor by its size
 * in bytes.  H converts the little-endian byte offset to the host
 * byte offset (H1 for bytes, H1_2 for halfwords, H1_4 for words).
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly for 8-byte elements, where one predicate byte governs
 * each element and no host-endian fixup is needed for the data.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}
363
/* Element-wise operations shared by the expanders below.  */
#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)

/*
 * Division: a zero divisor yields 0, and M == -1 is special-cased as
 * negation so that INT_MIN / -1 cannot trap on the host; -N wraps to
 * INT_MIN there, which is the required result.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
385
/* Predicated bitwise and integer arithmetic for 1/2/4/8-byte
 * elements.  Signed element types select the signed variants of
 * MAX/MIN/ABD.
 */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
445
446
447
/* High half of an 8x8->16 multiply.  The operands arrive sign- or
 * zero-extended in 32-bit arguments, so the product is exact and the
 * result is bits [15:8].
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return prod >> 8;
}
452
/* High half of a 16x16->32 multiply.  The operands arrive sign- or
 * zero-extended in 32-bit arguments, so the product is exact and the
 * result is bits [31:16].
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t prod = n * m;
    return prod >> 16;
}
457
/* High half of a 32x32->64 multiply.  The operands arrive sign- or
 * zero-extended in 64-bit arguments, so the product is exact and the
 * result is bits [63:32].
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t prod = n * m;
    return prod >> 32;
}
462
/* Signed high half of a 64x64->128 multiply, via the host utility
 * routine; the low half is discarded.
 */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

/* Unsigned high half of a 64x64->128 multiply.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
476
/* Predicated multiply, high-multiply and divide.  For the sub-64-bit
 * mulh variants the element type controls sign/zero extension into
 * the wider do_mulh_* argument.  DIV is only defined for 32- and
 * 64-bit elements.
 */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
497
498
499
500#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
503
504DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
506DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
507
508DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
509DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
511
512DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
513DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
514DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
515
516DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
519
520#undef DO_ZPZZ
521#undef DO_ZPZZ_D
522
523
524
525
526
/*
 * Predicated two-operand expander where M is a 64-bit "wide" operand:
 * one 64-bit element of VM is shared by all the narrow elements within
 * the same 64-bit chunk of VN/VD (used for shift-by-wide-elements).
 * The predicate is consumed 8 bits (one 64-bit chunk) at a time.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
557
558
559
/* Fully general one-operand predicated expander for 1/2/4-byte
 * elements; inactive elements are left unchanged.  Same predicate
 * consumption scheme as DO_ZPZZ.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly for 8-byte elements.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}
590
/* Count leading sign bits: clrsb32 operates on 32-bit values, so for
 * narrower elements subtract the redundant high bits.
 */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, with the same narrow-element adjustment.  */
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element.  */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical NOT: 1 if the element is zero, else 0.  */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* Float abs/neg operate on the raw bit pattern: FABS clears the sign
 * bit, FNEG flips it.
 */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension of the low part of an element: the cast
 * truncates and then extends back to the (unsigned) element type.
 */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

/* NOTE(review): ABS/NEG of INT*_MIN rely on two's-complement
 * wrap-around for -N (QEMU builds with -fwrapv) — confirm build flags.
 */
#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Byte/halfword/word reversal within an element.  */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

/* Bit reversal within an element.  */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
686
687
688
/* Unpredicated two-operand expander with a 64-bit "wide" second
 * operand shared by all narrow elements within each 64-bit chunk
 * (used for unpredicated shift-by-wide-elements).
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
728
729
730
731
732
733
734
735
736
737
/*
 * Horizontal reduction across the active elements of one vector:
 * fold OP over every element whose predicate bit is set, starting
 * from the identity value INIT.  TYPERED is the accumulation type,
 * TYPERET the (possibly narrower, unsigned) type the result is
 * truncated to before widening into the uint64_t return.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}

/* Similarly for 8-byte elements.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}
771
/* Reductions.  INIT is the identity for each operation (0 for OR/EOR/
 * ADD/UMAX, all-ones for AND/UMIN, the type extremes for SMAX/SMIN).
 * The ADDV reductions accumulate in 64 bits; there is no separate
 * saddv_d because at 64 bits it is the same operation as uaddv_d.
 */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
818
819
/* Unpredicated two-operand expander where the second operand is an
 * immediate, passed in s64 and truncated to the element type.  Not
 * predicated, so no host-endian fixup is needed: all elements are
 * processed in array order.
 */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    TYPE s = s64, *d = vd, *n = vn; \
    for (i = 0; i < opr_sz; ++i) { \
        d[i] = OP(n[i], s); \
    } \
}

/* Reversed subtract: immediate minus element.  */
#define DO_SUBR(X, Y) (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
874
875
876
877
/* Return the bit index of the last active (set) bit in the predicate
 * G of WORDS x 64 bits, considering only bits valid for element size
 * ESZ (log2 bytes).  Returns -(1 << esz) if no bit is set, so that
 * adding the element stride yields index 0.
 */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
891
/* PFIRST: set the first active (per VG) bit of VD, then return the
 * packed predtest flags for the updated predicate.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the lowest active bit; flags bit 2 marks
                 * that the first active word has been handled, so
                 * this happens at most once.
                 */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
915
/* PNEXT: find the next active element of VG strictly after the last
 * active element of VD, write a one-hot (or all-zero) predicate into
 * VD, and return the packed predtest flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Bit position one element past the last active element of VD;
     * 0 when VD has no active elements.
     */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Scan forward in VG for the next active bit at or beyond NEXT;
     * if none, NEXT ends up >= words * 64 and VD becomes all zeros.
     */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            /* Ignore bits below NEXT within the first word scanned. */
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write the one-hot result and accumulate the flags over the
     * whole predicate register.
     */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
959
960
961
962
963
/* Copy VN into VD, zeroing the elements whose predicate bit is clear.
 * Bit 0 of simd_data inverts the sense of the predicate (XORing the
 * expanded mask with all-ones), so the active elements are zeroed
 * instead.  Byte elements.
 */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
    }
}

/* As above, for halfword elements.  */
void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
    }
}

/* As above, for word elements.  */
void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
    }
}

/* As above, for doubleword elements: the predicate bit (possibly
 * inverted) is expanded directly to an all-ones/all-zeros mask.
 */
void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    uint8_t inv = simd_data(desc);

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
    }
}
1011
1012
1013
/* Predicated shift by an immediate taken from simd_data, for 1/2/4-
 * byte elements; inactive elements are left unchanged.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly for 8-byte elements.  */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}

/* The shift amount is pre-validated by the decoder, so plain shifts
 * suffice here.
 */
#define DO_SHR(N, M) (N >> M)
#define DO_SHL(N, M) (N << M)

/* Arithmetic shift right for divide: add (2^M - 1) to negative N
 * first, so the result rounds toward zero rather than toward
 * negative infinity.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)

DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
1080
1081
1082
/*
 * Four-operand predicated expander: Zd = OP(Za, Zn, Zm), for 1/2/4-byte
 * element TYPE.  Predicate handling as in DO_ZPZI: 16 bits of predicate
 * per 16 bytes of vector, one bit per vector byte.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                TYPE aa = *(TYPE *)(va + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
1101
1102
/*
 * 64-bit variant of DO_ZPZZZ: one predicate byte per doubleword element,
 * of which only bit 0 is significant.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *a = va, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE aa = a[i], nn = n[i], mm = m[i]; \
            d[i] = OP(aa, nn, mm); \
        } \
    } \
}
1117
/* Multiply-add / multiply-subtract: Zd = Za +/- Zn * Zm, modulo the
 * element width (hence unsigned element types throughout). */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1137
1138void HELPER(sve_index_b)(void *vd, uint32_t start,
1139 uint32_t incr, uint32_t desc)
1140{
1141 intptr_t i, opr_sz = simd_oprsz(desc);
1142 uint8_t *d = vd;
1143 for (i = 0; i < opr_sz; i += 1) {
1144 d[H1(i)] = start + i * incr;
1145 }
1146}
1147
1148void HELPER(sve_index_h)(void *vd, uint32_t start,
1149 uint32_t incr, uint32_t desc)
1150{
1151 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1152 uint16_t *d = vd;
1153 for (i = 0; i < opr_sz; i += 1) {
1154 d[H2(i)] = start + i * incr;
1155 }
1156}
1157
1158void HELPER(sve_index_s)(void *vd, uint32_t start,
1159 uint32_t incr, uint32_t desc)
1160{
1161 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1162 uint32_t *d = vd;
1163 for (i = 0; i < opr_sz; i += 1) {
1164 d[H4(i)] = start + i * incr;
1165 }
1166}
1167
1168void HELPER(sve_index_d)(void *vd, uint64_t start,
1169 uint64_t incr, uint32_t desc)
1170{
1171 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1172 uint64_t *d = vd;
1173 for (i = 0; i < opr_sz; i += 1) {
1174 d[i] = start + i * incr;
1175 }
1176}
1177
1178void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1179{
1180 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1181 uint32_t sh = simd_data(desc);
1182 uint32_t *d = vd, *n = vn, *m = vm;
1183 for (i = 0; i < opr_sz; i += 1) {
1184 d[i] = n[i] + (m[i] << sh);
1185 }
1186}
1187
1188void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1189{
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1191 uint64_t sh = simd_data(desc);
1192 uint64_t *d = vd, *n = vn, *m = vm;
1193 for (i = 0; i < opr_sz; i += 1) {
1194 d[i] = n[i] + (m[i] << sh);
1195 }
1196}
1197
1198void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1199{
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1201 uint64_t sh = simd_data(desc);
1202 uint64_t *d = vd, *n = vn, *m = vm;
1203 for (i = 0; i < opr_sz; i += 1) {
1204 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1205 }
1206}
1207
1208void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1209{
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t sh = simd_data(desc);
1212 uint64_t *d = vd, *n = vn, *m = vm;
1213 for (i = 0; i < opr_sz; i += 1) {
1214 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1215 }
1216}
1217
/* FEXPA (floating-point exponential accelerator), half precision. */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction bits of the float16 result, indexed by the low 5 bits of
     * the input.  Per the FEXPA pseudocode these are the fraction bits
     * of 2^(i/32) — NOTE(review): table values taken on trust.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        /* Place input bits <9:5> in the float16 exponent field (bit 10)
           above the table-provided fraction.  */
        d[i] = coeff[idx] | (exp << 10);
    }
}
1237
/* FEXPA, single precision. */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction bits of the float32 result, indexed by the low 6 bits of
     * the input (fraction of 2^(i/64) per the FEXPA pseudocode —
     * NOTE(review): table values taken on trust).  */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        /* Input bits <13:6> become the float32 exponent field (bit 23). */
        d[i] = coeff[idx] | (exp << 23);
    }
}
1269
/* FEXPA, double precision. */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction bits of the float64 result, indexed by the low 6 bits of
     * the input (fraction of 2^(i/64) per the FEXPA pseudocode —
     * NOTE(review): table values taken on trust).  */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        /* Input bits <16:6> become the float64 exponent field (bit 52). */
        d[i] = coeff[idx] | (exp << 52);
    }
}
1307
1308void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1309{
1310 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1311 uint16_t *d = vd, *n = vn, *m = vm;
1312 for (i = 0; i < opr_sz; i += 1) {
1313 uint16_t nn = n[i];
1314 uint16_t mm = m[i];
1315 if (mm & 1) {
1316 nn = float16_one;
1317 }
1318 d[i] = nn ^ (mm & 2) << 14;
1319 }
1320}
1321
1322void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1323{
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1325 uint32_t *d = vd, *n = vn, *m = vm;
1326 for (i = 0; i < opr_sz; i += 1) {
1327 uint32_t nn = n[i];
1328 uint32_t mm = m[i];
1329 if (mm & 1) {
1330 nn = float32_one;
1331 }
1332 d[i] = nn ^ (mm & 2) << 30;
1333 }
1334}
1335
1336void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1337{
1338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1339 uint64_t *d = vd, *n = vn, *m = vm;
1340 for (i = 0; i < opr_sz; i += 1) {
1341 uint64_t nn = n[i];
1342 uint64_t mm = m[i];
1343 if (mm & 1) {
1344 nn = float64_one;
1345 }
1346 d[i] = nn ^ (mm & 2) << 62;
1347 }
1348}
1349
1350
1351
1352
1353
1354void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1355{
1356 intptr_t i, oprsz = simd_oprsz(desc);
1357
1358 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1359 int r = *(int8_t *)(a + i) + b;
1360 if (r > INT8_MAX) {
1361 r = INT8_MAX;
1362 } else if (r < INT8_MIN) {
1363 r = INT8_MIN;
1364 }
1365 *(int8_t *)(d + i) = r;
1366 }
1367}
1368
1369void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1370{
1371 intptr_t i, oprsz = simd_oprsz(desc);
1372
1373 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1374 int r = *(int16_t *)(a + i) + b;
1375 if (r > INT16_MAX) {
1376 r = INT16_MAX;
1377 } else if (r < INT16_MIN) {
1378 r = INT16_MIN;
1379 }
1380 *(int16_t *)(d + i) = r;
1381 }
1382}
1383
1384void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1385{
1386 intptr_t i, oprsz = simd_oprsz(desc);
1387
1388 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1389 int64_t r = *(int32_t *)(a + i) + b;
1390 if (r > INT32_MAX) {
1391 r = INT32_MAX;
1392 } else if (r < INT32_MIN) {
1393 r = INT32_MIN;
1394 }
1395 *(int32_t *)(d + i) = r;
1396 }
1397}
1398
/* Saturating signed addition of scalar B to each doubleword element.
 * There is no wider intermediate type here, so detect overflow from
 * the sign bits directly.  (The wrapping addition assumes -fwrapv or
 * equivalent, as used throughout QEMU.) */
void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* Overflow: operands had equal signs and the result's sign
               differs.  Saturate away from the wrapped result's sign. */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}
1413
1414
1415
1416
1417
1418void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1419{
1420 intptr_t i, oprsz = simd_oprsz(desc);
1421
1422 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1423 int r = *(uint8_t *)(a + i) + b;
1424 if (r > UINT8_MAX) {
1425 r = UINT8_MAX;
1426 } else if (r < 0) {
1427 r = 0;
1428 }
1429 *(uint8_t *)(d + i) = r;
1430 }
1431}
1432
1433void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1434{
1435 intptr_t i, oprsz = simd_oprsz(desc);
1436
1437 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1438 int r = *(uint16_t *)(a + i) + b;
1439 if (r > UINT16_MAX) {
1440 r = UINT16_MAX;
1441 } else if (r < 0) {
1442 r = 0;
1443 }
1444 *(uint16_t *)(d + i) = r;
1445 }
1446}
1447
1448void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1449{
1450 intptr_t i, oprsz = simd_oprsz(desc);
1451
1452 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1453 int64_t r = *(uint32_t *)(a + i) + b;
1454 if (r > UINT32_MAX) {
1455 r = UINT32_MAX;
1456 } else if (r < 0) {
1457 r = 0;
1458 }
1459 *(uint32_t *)(d + i) = r;
1460 }
1461}
1462
1463void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1464{
1465 intptr_t i, oprsz = simd_oprsz(desc);
1466
1467 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1468 uint64_t r = *(uint64_t *)(a + i) + b;
1469 if (r < b) {
1470 r = UINT64_MAX;
1471 }
1472 *(uint64_t *)(d + i) = r;
1473 }
1474}
1475
1476void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1477{
1478 intptr_t i, oprsz = simd_oprsz(desc);
1479
1480 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1481 uint64_t ai = *(uint64_t *)(a + i);
1482 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1483 }
1484}
1485
1486
1487
1488
1489void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1490 uint64_t mm, uint32_t desc)
1491{
1492 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1493 uint64_t *d = vd, *n = vn;
1494 uint8_t *pg = vg;
1495
1496 mm = dup_const(MO_8, mm);
1497 for (i = 0; i < opr_sz; i += 1) {
1498 uint64_t nn = n[i];
1499 uint64_t pp = expand_pred_b(pg[H1(i)]);
1500 d[i] = (mm & pp) | (nn & ~pp);
1501 }
1502}
1503
1504void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1505 uint64_t mm, uint32_t desc)
1506{
1507 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1508 uint64_t *d = vd, *n = vn;
1509 uint8_t *pg = vg;
1510
1511 mm = dup_const(MO_16, mm);
1512 for (i = 0; i < opr_sz; i += 1) {
1513 uint64_t nn = n[i];
1514 uint64_t pp = expand_pred_h(pg[H1(i)]);
1515 d[i] = (mm & pp) | (nn & ~pp);
1516 }
1517}
1518
1519void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1520 uint64_t mm, uint32_t desc)
1521{
1522 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1523 uint64_t *d = vd, *n = vn;
1524 uint8_t *pg = vg;
1525
1526 mm = dup_const(MO_32, mm);
1527 for (i = 0; i < opr_sz; i += 1) {
1528 uint64_t nn = n[i];
1529 uint64_t pp = expand_pred_s(pg[H1(i)]);
1530 d[i] = (mm & pp) | (nn & ~pp);
1531 }
1532}
1533
1534void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1535 uint64_t mm, uint32_t desc)
1536{
1537 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1538 uint64_t *d = vd, *n = vn;
1539 uint8_t *pg = vg;
1540
1541 for (i = 0; i < opr_sz; i += 1) {
1542 uint64_t nn = n[i];
1543 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1544 }
1545}
1546
1547void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1548{
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1550 uint64_t *d = vd;
1551 uint8_t *pg = vg;
1552
1553 val = dup_const(MO_8, val);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 d[i] = val & expand_pred_b(pg[H1(i)]);
1556 }
1557}
1558
1559void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1560{
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd;
1563 uint8_t *pg = vg;
1564
1565 val = dup_const(MO_16, val);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 d[i] = val & expand_pred_h(pg[H1(i)]);
1568 }
1569}
1570
1571void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1572{
1573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1574 uint64_t *d = vd;
1575 uint8_t *pg = vg;
1576
1577 val = dup_const(MO_32, val);
1578 for (i = 0; i < opr_sz; i += 1) {
1579 d[i] = val & expand_pred_s(pg[H1(i)]);
1580 }
1581}
1582
1583void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1584{
1585 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1586 uint64_t *d = vd;
1587 uint8_t *pg = vg;
1588
1589 for (i = 0; i < opr_sz; i += 1) {
1590 d[i] = (pg[H1(i)] & 1 ? val : 0);
1591 }
1592}
1593
1594
1595
1596
/*
 * Copy N bytes from VS to VD (possibly overlapping), preserving the
 * host-endian element swizzle implied by the H1/H1_2/H1_4 macros.
 * On a little-endian host the swizzle is the identity and this is a
 * plain memmove.  On big-endian hosts, the common granularity of the
 * addresses and length (from the low 3 bits of d|s|n) chooses the
 * widest unit that can be moved while staying swizzle-consistent.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        /* 8-byte aligned and sized: bytes move unchanged. */
        memmove(vd, vs, n);
        break;

    case 4:
        /* 4-byte granule; direction chosen to be overlap-safe. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 2-byte granule. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Byte granule. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
1653
1654
/* Zero N bytes at VD, honoring the host-endian element swizzle;
 * the zeroing counterpart of swap_memmove above. */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Common case: nothing to zero. */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        /* 8-byte aligned and sized: bytes clear unchanged. */
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
1694
/* EXT: concatenate the top (opr_sz - n_ofs) bytes of VN, starting at
 * byte offset n_ofs, with the low n_ofs bytes of VM.  The three cases
 * below order the moves so each source is read before it is clobbered
 * when operands alias the destination. */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);   /* starting byte offset within VN */
    size_t n_siz = opr_sz - n_ofs;    /* number of bytes taken from VN */

    if (vd != vm) {
        /* VM survives the first move, so N-part first is safe. */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* vd == vm: move the M-part (from VD itself) before
           overwriting the low bytes of VD with the N-part. */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm: stage the M-part in a temporary. */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
1715
/*
 * INSR: shift the whole vector up by one element and insert VAL at
 * element 0.  swap_memmove handles the overlapping move.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
    *(TYPE *)(vd + H(0)) = val; \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR
1730
1731void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1732{
1733 intptr_t i, j, opr_sz = simd_oprsz(desc);
1734 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1735 uint64_t f = *(uint64_t *)(vn + i);
1736 uint64_t b = *(uint64_t *)(vn + j);
1737 *(uint64_t *)(vd + i) = bswap64(b);
1738 *(uint64_t *)(vd + j) = bswap64(f);
1739 }
1740}
1741
1742void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1743{
1744 intptr_t i, j, opr_sz = simd_oprsz(desc);
1745 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1746 uint64_t f = *(uint64_t *)(vn + i);
1747 uint64_t b = *(uint64_t *)(vn + j);
1748 *(uint64_t *)(vd + i) = hswap64(b);
1749 *(uint64_t *)(vd + j) = hswap64(f);
1750 }
1751}
1752
1753void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1754{
1755 intptr_t i, j, opr_sz = simd_oprsz(desc);
1756 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1757 uint64_t f = *(uint64_t *)(vn + i);
1758 uint64_t b = *(uint64_t *)(vn + j);
1759 *(uint64_t *)(vd + i) = rol64(b, 32);
1760 *(uint64_t *)(vd + j) = rol64(f, 32);
1761 }
1762}
1763
1764void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1765{
1766 intptr_t i, j, opr_sz = simd_oprsz(desc);
1767 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1768 uint64_t f = *(uint64_t *)(vn + i);
1769 uint64_t b = *(uint64_t *)(vn + j);
1770 *(uint64_t *)(vd + i) = b;
1771 *(uint64_t *)(vd + j) = f;
1772 }
1773}
1774
1775#define DO_TBL(NAME, TYPE, H) \
1776void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1777{ \
1778 intptr_t i, opr_sz = simd_oprsz(desc); \
1779 uintptr_t elem = opr_sz / sizeof(TYPE); \
1780 TYPE *d = vd, *n = vn, *m = vm; \
1781 ARMVectorReg tmp; \
1782 if (unlikely(vd == vn)) { \
1783 n = memcpy(&tmp, vn, opr_sz); \
1784 } \
1785 for (i = 0; i < elem; i++) { \
1786 TYPE j = m[H(i)]; \
1787 d[H(i)] = j < elem ? n[H(j)] : 0; \
1788 } \
1789}
1790
1791DO_TBL(sve_tbl_b, uint8_t, H1)
1792DO_TBL(sve_tbl_h, uint16_t, H2)
1793DO_TBL(sve_tbl_s, uint32_t, H4)
1794DO_TBL(sve_tbl_d, uint64_t, )
1795
1796#undef TBL
1797
/*
 * UNPK: widen each element of the low half (opr_sz / 2 bytes) of VN
 * into a full-width element of VD.  If VN starts inside the
 * destination, the input would be overwritten before it is fully
 * consumed, so copy it first.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}

/* Signed unpack: the narrower source types sign-extend on widening. */
DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

/* Unsigned unpack: zero-extension. */
DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

#undef DO_UNPK
1822
1823
1824
1825
1826
/* Mask of the even-numbered groups of (1 << index) bits; used by
 * expand_bits/compress_bits and the predicate permute helpers below,
 * extended out to 16-bit units. */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
1834
1835
1836
1837
1838
1839
1840static uint64_t expand_bits(uint64_t x, int n)
1841{
1842 int i;
1843
1844 x &= 0xffffffffu;
1845 for (i = 4; i >= n; i--) {
1846 int sh = 1 << i;
1847 x = ((x << sh) | x) & even_bit_esz_masks[i];
1848 }
1849 return x;
1850}
1851
1852
1853
1854
1855
1856
1857static uint64_t compress_bits(uint64_t x, int n)
1858{
1859 int i;
1860
1861 for (i = n; i <= 4; i++) {
1862 int sh = 1 << i;
1863 x &= even_bit_esz_masks[i];
1864 x = (x >> sh) | x;
1865 }
1866 return x & 0xffffffffu;
1867}
1868
/* ZIP (predicate): interleave elements of size (1 << esz) from the
 * selected (HIGH) halves of VN and VM into VD. */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The whole predicate fits in one word: 4 * oprsz bits from
           each input half, interleaved esize bits at a time. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input, so we must
           be mindful of possible overlap with the destination. */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            /* Byte offset of the high halves of the inputs. */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* 32 bits of each input expand to one 64-bit output word. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            /* Odd size: expand 8 input bits to 16 output bits at a time. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
1932
/* UZP (predicate): concatenate the even (DATA == 0) or odd elements
 * of size (1 << esz) from VN then VM into VD. */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Each input contributes 4 * oprsz result bits. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        /* VM is read after D's low half is written; copy on overlap. */
        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* Each pair of input words compresses to one output word. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For a predicate size which is not a multiple of 16 bytes,
         * the results from M do not align on uint64_t boundaries of D.
         * Build M's results in TMP_M and copy them into place with
         * swap_memmove afterward.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            /* Final, partial word pair from N. */
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            /* Final, partial word pair from M. */
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned: write M's results directly after N's. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
2001
2002void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2003{
2004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2005 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
2007 uint64_t *d = vd, *n = vn, *m = vm;
2008 uint64_t mask;
2009 int shr, shl;
2010 intptr_t i;
2011
2012 shl = 1 << esz;
2013 shr = 0;
2014 mask = even_bit_esz_masks[esz];
2015 if (odd) {
2016 mask <<= shl;
2017 shr = shl;
2018 shl = 0;
2019 }
2020
2021 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2022 uint64_t nn = (n[i] & mask) >> shr;
2023 uint64_t mm = (m[i] & mask) << shl;
2024 d[i] = nn + mm;
2025 }
2026}
2027
2028
/* Reverse the order of the (1 << n)-bit groups within X; with n == 0
 * this reverses all 64 individual bits. */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    /* Reverse bytes first, then swap within ever smaller groups
       (nibbles, bit-pairs, bits) until granularity N is reached. */
    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
2040
/* Reverse the order of the (1 << n)-bit groups within byte X;
 * with n == 0 this reverses all 8 bits, n == 2 swaps the nibbles. */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i;

    for (i = 2; i >= n; i--) {
        int sh = 1 << i;
        uint8_t lo = x & mask[i];
        uint8_t hi = (x >> sh) & mask[i];
        x = (lo << sh) | hi;
    }
    return x;
}
2051
/* REV (predicate): reverse the order of the (1 << esz)-bit element
 * groups across the whole predicate of OPRSZ bytes. */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Shift the active bits up to the msb before reversing, so the
           reversed result lands back at bit 0. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Exchange 64-bit words from either end, bit-reversing each. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Odd size: exchange bytes from either end, reversing each. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
2081
/* PUNPK: widen the selected (HIGH) half of predicate VN, spreading
 * each source bit into the even position of a bit pair in VD. */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Whole predicate in one word: take 4 * oprsz bits from the
           selected half and expand them. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input, so we must
           be mindful of possible overlap with the destination. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            /* Byte offset of the high half of the input. */
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            /* 32 bits of input expand to one 64-bit output word. */
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Odd size: expand 8 input bits to 16 output bits at a time. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
2127
2128#define DO_ZIP(NAME, TYPE, H) \
2129void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2130{ \
2131 intptr_t oprsz = simd_oprsz(desc); \
2132 intptr_t i, oprsz_2 = oprsz / 2; \
2133 ARMVectorReg tmp_n, tmp_m; \
2134
2135 \
2136 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2137 vn = memcpy(&tmp_n, vn, oprsz_2); \
2138 } \
2139 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2140 vm = memcpy(&tmp_m, vm, oprsz_2); \
2141 } \
2142 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2143 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2144 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2145 } \
2146}
2147
2148DO_ZIP(sve_zip_b, uint8_t, H1)
2149DO_ZIP(sve_zip_h, uint16_t, H1_2)
2150DO_ZIP(sve_zip_s, uint32_t, H1_4)
2151DO_ZIP(sve_zip_d, uint64_t, )
2152
/*
 * UZP: concatenate the even (odd_ofs == 0) or odd elements of VN,
 * then of VM, into VD.  VM is copied when it overlaps VD, because
 * the second loop reads VM after the first has written D's low half.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t oprsz_2 = oprsz / 2;                                      \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    ARMVectorReg tmp_m;                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
        vm = memcpy(&tmp_m, vm, oprsz);                                \
    }                                                                  \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
    }                                                                  \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                  \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )
2176
/*
 * TRN: from each aligned element pair, take the even (odd_ofs == 0)
 * or odd element of VN and of VM and write them as a pair to VD.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
    }                                                                  \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
2199
2200void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2201{
2202 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2203 uint32_t *d = vd, *n = vn;
2204 uint8_t *pg = vg;
2205
2206 for (i = j = 0; i < opr_sz; i++) {
2207 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2208 d[H4(j)] = n[H4(i)];
2209 j++;
2210 }
2211 }
2212 for (; j < opr_sz; j++) {
2213 d[H4(j)] = 0;
2214 }
2215}
2216
2217void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2218{
2219 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2220 uint64_t *d = vd, *n = vn;
2221 uint8_t *pg = vg;
2222
2223 for (i = j = 0; i < opr_sz; i++) {
2224 if (pg[H1(i)] & 1) {
2225 d[j] = n[i];
2226 j++;
2227 }
2228 }
2229 for (; j < opr_sz; j++) {
2230 d[j] = 0;
2231 }
2232}
2233
2234
2235
2236
2237
/* Helper wrapper: locate the last active element of predicate VG,
 * with element size 1 << esz.  Presumably returns -1 when no element
 * is active — see last_active_element (defined elsewhere). */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}
2245
/* SPLICE: copy the segment of VN from the first through the last
 * active predicate element to the low part of VD, then fill the
 * remainder of VD with the leading elements of VM. */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  Scanning
       backward, the first non-zero word fixes last_*, and each hit
       overwrites first_*, leaving it at the lowest active word. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word byte-offset plus bit position within the word
           into byte offsets of the first/last active elements. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            /* VM is consumed after VD is written; copy on alias. */
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
2282
2283void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2284 void *vg, uint32_t desc)
2285{
2286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2287 uint64_t *d = vd, *n = vn, *m = vm;
2288 uint8_t *pg = vg;
2289
2290 for (i = 0; i < opr_sz; i += 1) {
2291 uint64_t nn = n[i], mm = m[i];
2292 uint64_t pp = expand_pred_b(pg[H1(i)]);
2293 d[i] = (nn & pp) | (mm & ~pp);
2294 }
2295}
2296
2297void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2298 void *vg, uint32_t desc)
2299{
2300 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2301 uint64_t *d = vd, *n = vn, *m = vm;
2302 uint8_t *pg = vg;
2303
2304 for (i = 0; i < opr_sz; i += 1) {
2305 uint64_t nn = n[i], mm = m[i];
2306 uint64_t pp = expand_pred_h(pg[H1(i)]);
2307 d[i] = (nn & pp) | (mm & ~pp);
2308 }
2309}
2310
2311void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2312 void *vg, uint32_t desc)
2313{
2314 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2315 uint64_t *d = vd, *n = vn, *m = vm;
2316 uint8_t *pg = vg;
2317
2318 for (i = 0; i < opr_sz; i += 1) {
2319 uint64_t nn = n[i], mm = m[i];
2320 uint64_t pp = expand_pred_s(pg[H1(i)]);
2321 d[i] = (nn & pp) | (mm & ~pp);
2322 }
2323}
2324
2325void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2326 void *vg, uint32_t desc)
2327{
2328 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2329 uint64_t *d = vd, *n = vn, *m = vm;
2330 uint8_t *pg = vg;
2331
2332 for (i = 0; i < opr_sz; i += 1) {
2333 uint64_t nn = n[i], mm = m[i];
2334 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2335 }
2336}
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
/*
 * Two-operand element comparison, controlled by a predicate, producing
 * a predicate result plus PTEST-style flags.
 *
 * The vectors are walked backward in 64-byte chunks (one 64-bit
 * predicate word per chunk).  The inner loop compares one TYPE element
 * per iteration; each comparison yields 0 or 1, which lands on the
 * canonical predicate bit for that element as OUT is shifted left by
 * sizeof(TYPE) per element.  OUT is then ANDed with the governing
 * predicate word (pre-masked by MASK, the canonical-bit mask for this
 * element size), stored to VD, and folded into the flags with
 * iter_predtest_bwd.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2380
/* Per-size wrappers: MASK keeps one predicate bit per element. */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)

/* CMPEQ / CMPNE: equality on unsigned representations. */
DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

/* CMPGT / CMPGE: signed comparisons (signed element types). */
DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

/* CMPHI / CMPHS: unsigned comparisons (unsigned element types). */
DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
2425
2426
/*
 * Element comparison against a wide (64-bit) second operand.
 *
 * Structure mirrors DO_CMP_PPZZ, with one extra nesting level: for
 * each 8-byte slice of VN, a single TYPEW element is loaded from the
 * same offset of VM (vm + i - 8) and compared against every narrow
 * TYPE element in that slice.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2450
/* Per-size wrappers; no D form since the wide operand is 64-bit. */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

/* Signed comparisons use a signed wide type for sign extension. */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/* The "less" forms only exist for wide comparisons. */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
2502
2503
/*
 * Element comparison against an immediate, controlled by a predicate.
 * Identical in structure to DO_CMP_PPZZ, except the second operand is
 * a constant taken from simd_data(desc) and converted to TYPE once.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2525
/* Per-size wrappers: MASK keeps one predicate bit per element. */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed immediate comparisons. */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned immediate comparisons. */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
2590
2591
/*
 * Return true if the last active element of VD, as governed by VG,
 * is set.  Returns false when no element of VG is active.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off;

    /* Search backward for the highest non-zero governing word. */
    for (off = QEMU_ALIGN_UP(oprsz, 8) - 8; off >= 0; off -= 8) {
        uint64_t g = *(uint64_t *)(vg + off);
        if (g != 0) {
            /* pow2floor isolates the highest set (active) bit. */
            uint64_t hibit = pow2floor(g);
            return (*(uint64_t *)(vd + off) & hibit) != 0;
        }
    }
    return false;
}
2604
2605
2606
2607
2608
/*
 * Compute the break mask for one 64-bit predicate word.
 *
 * N is the tested predicate, G the governing predicate; BRK is true
 * if a break was already found in an earlier word.  Writes the break
 * mask for this word to *RETB and returns the updated BRK state.
 * With AFTER set, the breaking element itself stays in the mask
 * (BRKA semantics); otherwise it is excluded (BRKB semantics).
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t hit;

    if (brk) {
        /* Already broken: everything from here on is masked off. */
        *retb = 0;
        return true;
    }

    hit = g & n;
    if (hit == 0) {
        /* No active true element: the whole word remains live. */
        *retb = g;
        return false;
    }

    /* Isolate the lowest active true bit, then build a mask up to
     * and including it (after) or up to but excluding it (!after).
     */
    hit &= -hit;
    *retb = after ? (hit | (hit - 1)) : (hit - 1);
    return true;
}
2634
2635
/* Zeroing BRK: D = break-mask AND G; ungoverned elements become 0. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (w = 0; w < words; w++) {
        uint64_t b, gw = g[w];

        brk = compute_brk(&b, n[w], gw, brk, after);
        d[w] = b & gw;
    }
}
2649
2650
2651static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2652 intptr_t oprsz, bool after)
2653{
2654 uint32_t flags = PREDTEST_INIT;
2655 bool brk = false;
2656 intptr_t i;
2657
2658 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2659 uint64_t this_b, this_d, this_g = g[i];
2660
2661 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2662 d[i] = this_d = this_b & this_g;
2663 flags = iter_predtest_fwd(this_d, this_g, flags);
2664 }
2665 return flags;
2666}
2667
2668
/* Merging BRK: ungoverned elements keep their previous value in D. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (w = 0; w < words; w++) {
        uint64_t b, gw = g[w];

        brk = compute_brk(&b, n[w], gw, brk, after);
        d[w] = (b & gw) | (d[w] & ~gw);
    }
}
2682
2683
2684static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2685 intptr_t oprsz, bool after)
2686{
2687 uint32_t flags = PREDTEST_INIT;
2688 bool brk = false;
2689 intptr_t i;
2690
2691 for (i = 0; i < oprsz / 8; ++i) {
2692 uint64_t this_b, this_d = d[i], this_g = g[i];
2693
2694 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2695 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2696 flags = iter_predtest_fwd(this_d, this_g, flags);
2697 }
2698 return flags;
2699}
2700
2701static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2702{
2703
2704
2705
2706 memset(d, 0, sizeof(ARMPredicateReg));
2707 return PREDTEST_INIT;
2708}
2709
2710void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2711 uint32_t pred_desc)
2712{
2713 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2714 if (last_active_pred(vn, vg, oprsz)) {
2715 compute_brk_z(vd, vm, vg, oprsz, true);
2716 } else {
2717 do_zero(vd, oprsz);
2718 }
2719}
2720
2721uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2722 uint32_t pred_desc)
2723{
2724 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2725 if (last_active_pred(vn, vg, oprsz)) {
2726 return compute_brks_z(vd, vm, vg, oprsz, true);
2727 } else {
2728 return do_zero(vd, oprsz);
2729 }
2730}
2731
2732void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2733 uint32_t pred_desc)
2734{
2735 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, false);
2738 } else {
2739 do_zero(vd, oprsz);
2740 }
2741}
2742
2743uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2744 uint32_t pred_desc)
2745{
2746 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, false);
2749 } else {
2750 return do_zero(vd, oprsz);
2751 }
2752}
2753
/* BRKA, zeroing: break after the first active true element of N. */
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

/* BRKAS, zeroing, setting PTEST flags. */
uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

/* BRKB, zeroing: break before the first active true element of N. */
void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

/* BRKBS, zeroing, setting PTEST flags. */
uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}

/* BRKA, merging: inactive elements of D are preserved. */
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

/* BRKAS, merging, setting PTEST flags. */
uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

/* BRKB, merging: inactive elements of D are preserved. */
void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

/* BRKBS, merging, setting PTEST flags. */
uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}

/* BRKN: if the last active element of N is false, zero D; otherwise
 * D is left unchanged.  NOTE(review): D presumably already holds the
 * second operand, copied by the translator — confirm against the
 * corresponding trans_* function.
 */
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}
2809
2810
2811static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2812 uint64_t esz_mask)
2813{
2814 uint32_t flags = PREDTEST_INIT;
2815 intptr_t i;
2816
2817 for (i = 0; i < oprsz / 8; i++) {
2818 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2819 }
2820 if (oprsz & 7) {
2821 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2822 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2823 }
2824 return flags;
2825}
2826
2827uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2828{
2829 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2830 if (last_active_pred(vn, vg, oprsz)) {
2831 return predtest_ones(vd, oprsz, -1);
2832 } else {
2833 return do_zero(vd, oprsz);
2834 }
2835}
2836
2837uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2838{
2839 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2840 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2841 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2842 intptr_t i;
2843
2844 for (i = 0; i < words; ++i) {
2845 uint64_t t = n[i] & g[i] & mask;
2846 sum += ctpop64(t);
2847 }
2848 return sum;
2849}
2850
2851uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2852{
2853 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2854 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2855 uint64_t esz_mask = pred_esz_masks[esz];
2856 ARMPredicateReg *d = vd;
2857 uint32_t flags;
2858 intptr_t i;
2859
2860
2861 flags = do_zero(d, oprsz);
2862 if (count == 0) {
2863 return flags;
2864 }
2865
2866
2867 for (i = 0; i < count / 64; ++i) {
2868 d->p[i] = esz_mask;
2869 }
2870 if (count & 63) {
2871 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2872 }
2873
2874 return predtest_ones(d, oprsz, esz_mask);
2875}
2876
2877
2878
2879
2880
2881
2882
2883
2884
/*
 * Predicated FP horizontal reduction.
 *
 * Active elements are gathered into a local array with inactive and
 * out-of-range slots replaced by IDENT (the operation's identity),
 * then reduced as a balanced binary tree — the recursion splits N in
 * half at each level, so it relies on maxsz / sizeof(TYPE) being a
 * power of two (otherwise the tail element would be dropped).
 * The predicate is consumed 16 bits (16 vector bytes) at a time.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{ \
    if (n == 1) { \
        return *data; \
    } else { \
        uintptr_t half = n / 2; \
        TYPE lo = NAME##_reduce(data, status, half); \
        TYPE hi = NAME##_reduce(data + half, status, half); \
        return TYPE##_##FUNC(lo, hi, status); \
    } \
} \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
{ \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
    for (; i < maxsz; i += sizeof(TYPE)) { \
        *(TYPE *)((void *)data + i) = IDENT; \
    } \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
}
2914
/* FADDV: identity is +0.0. */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)

/* FMINNMV/FMAXNMV: the identity constants are the per-format default
 * NaN encodings (minnum/maxnum ignore a quiet NaN operand).
 */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)

/* FMINV: identity is +infinity; FMAXV: identity is -infinity. */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))

#undef DO_REDUCE
2937
/*
 * FADDA (f16): strictly-ordered accumulation of the active elements
 * of VM into NN.  The predicate is read 16 bits at a time, covering
 * 16 vector bytes (8 f16 elements); pg is shifted by 2 per element
 * so bit 0 is always the current element's canonical predicate bit.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2957
/*
 * FADDA (f32): strictly-ordered accumulation of the active elements
 * of VM into NN.  pg is shifted by 4 per element so bit 0 is the
 * current element's canonical predicate bit.
 *
 * NOTE(review): the element load uses H1_2 (2-byte host-endian
 * swizzle) on 4-byte elements; on a big-endian host this produces
 * offsets that are not 4-aligned.  Expected H1_4 — confirm against
 * upstream before changing.
 */
uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2977
2978uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2979 void *status, uint32_t desc)
2980{
2981 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2982 uint64_t *m = vm;
2983 uint8_t *pg = vg;
2984
2985 for (i = 0; i < opr_sz; i++) {
2986 if (pg[H1(i)] & 1) {
2987 nn = float64_add(nn, m[i], status);
2988 }
2989 }
2990
2991 return nn;
2992}
2993
2994
2995
2996
/*
 * Two-operand predicated FP operation, merging: inactive elements of
 * VD are left untouched.  The data is walked backward 64 bytes at a
 * time; pg holds the governing predicate word for the current chunk
 * and each element tests its own canonical bit (pg >> (i & 63)).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3015
/* Basic predicated FP arithmetic, one helper per element size. */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)

/* IEEE min/max (NaN-propagating) vs minnum/maxnum (NaN-ignoring). */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3047
/* FABD: absolute difference, |a - b|. */
static inline float16 abd_h(float16 a, float16 b, float_status *s)
{
    return float16_abs(float16_sub(a, b, s));
}

static inline float32 abd_s(float32 a, float32 b, float_status *s)
{
    return float32_abs(float32_sub(a, b, s));
}

static inline float64 abd_d(float64 a, float64 b, float_status *s)
{
    return float64_abs(float64_sub(a, b, s));
}

DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)

/* FSCALE (64-bit): float64_scalbn takes an int exponent, so clamp
 * the 64-bit element value to the int range first; values beyond
 * that saturate the result anyway.
 */
static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
{
    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
    return float64_scalbn(a, b_int, s);
}

DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)

/* FMULX: multiply-extended, reusing the AdvSIMD/VFP helpers. */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
3082
3083
3084
3085
/*
 * Predicated FP operation with a scalar second operand, merging.
 * Same backward 64-byte chunk iteration as DO_ZPZZ_FP; the scalar
 * is truncated to TYPE once before the loop.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    TYPE mm = scalar; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3104
/* Scalar-operand predicated FP arithmetic. */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)

/* FSUBR: reversed subtract, scalar - element. */
static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}

static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}

static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}

DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3151
3152
3153
3154
/*
 * Unary predicated FP operation, merging.  Same backward 64-byte
 * chunk iteration as DO_ZPZZ_FP.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3171
3172
3173
3174
3175
/*
 * SVE fp16 <-> wider conversions temporarily disable flush-to-zero
 * controls around the softfloat call: converting *from* fp16 must not
 * flush input denormals, and converting *to* fp16 must not flush
 * output denormals.  NOTE(review): presumably this implements the
 * architecture's rule that these conversions ignore FZ/FZ16 — confirm
 * against the Arm ARM FPConvert pseudocode.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
3219
/*
 * Round-to-zero float-to-integer conversions with Arm NaN handling:
 * a NaN input raises Invalid and converts to 0, rather than the
 * saturated value softfloat would otherwise produce.
 */
static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int16_round_to_zero(f, s);
}

static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_int64_round_to_zero(f, s);
}

static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint16_round_to_zero(f, s);
}

static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_uint64_round_to_zero(f, s);
}
3291
/* FCVT: float-to-float conversions. */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)

/* FCVTZS: float to signed integer, round toward zero. */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)

/* FCVTZU: float to unsigned integer, round toward zero. */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)

/* FRINT*: round to integral, in the current rounding mode (frint)
 * or raising Inexact as appropriate (frintx).
 */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)

DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)

/* SCVTF: signed integer to float. */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)

/* UCVTF: unsigned integer to float. */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)

#undef DO_ZPZ_FP
3348
/*
 * Predicated FP multiply-add for 16-bit elements: d = (n^neg1)*m + (a^neg3).
 * neg1/neg3 are either 0 or the sign bit, so XOR optionally negates the
 * multiplicand and/or addend; one routine backs FMLA/FMLS/FNMLA/FNMLS.
 * Inactive elements (predicate bit clear) leave vd unchanged.
 */
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    /* Walk backward, one 64-bit predicate word per outer iteration;
       i is a byte offset, and the predicate has one bit per byte. */
    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
3372
/* FMLA: d = n * m + a, for active 16-bit elements. */
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m (n negated via sign-bit XOR). */
void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

/* FNMLA: d = -(n * m) - a (n and a negated). */
void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

/* FNMLS: d = n * m - a (a negated). */
void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
3396
/*
 * Predicated FP multiply-add for 32-bit elements; see do_fmla_zpzzz_h.
 * Note the 4-byte host-endian swizzle H1_4 is used throughout.
 */
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    /* Walk backward, one 64-bit predicate word per outer iteration. */
    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
3420
/* FMLA: d = n * m + a, for active 32-bit elements. */
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m. */
void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

/* FNMLA: d = -(n * m) - a. */
void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

/* FNMLS: d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
3444
/*
 * Predicated FP multiply-add for 64-bit elements; see do_fmla_zpzzz_h.
 * 64-bit elements need no host-endian swizzle, so offsets are used as-is.
 */
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    /* Walk backward, one 64-bit predicate word per outer iteration. */
    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
3468
/* FMLA: d = n * m + a, for active 64-bit elements. */
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

/* FMLS: d = a - n * m (INT64_MIN is the f64 sign bit). */
void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

/* FNMLA: d = -(n * m) - a. */
void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: d = n * m - a. */
void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
3492
3493
3494
3495
3496
3497
/*
 * Predicated FP compare, vector vs vector, producing a predicate result.
 * Walks the vector backward one 64-bit predicate word at a time; each
 * element contributes one result bit at the byte position of the element
 * (hence out <<= sizeof(TYPE)).  Inactive elements yield 0 bits.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
3517
/* Expand DO_FPCMP_PPZZ for each element size. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * Comparison predicates.  The ordered compares (GE/GT/LE/LT and the
 * absolute-value FAC* forms) use the signaling softfloat compare;
 * EQ/NE/UO use the quiet compare.  GE/GT are expressed by swapping
 * operands of the reversed comparison.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
3556
3557
3558
3559
/*
 * Predicated FP compare of a vector against zero, producing a predicate.
 * Same iteration scheme as DO_FPCMP_PPZZ, with the second operand fixed
 * to the value 0 (the all-zero bit pattern, i.e. +0.0 in any format).
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vg,              \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if ((pg >> (i & 63)) & 1) {                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                out |= OP(TYPE, nn, 0, status);                         \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}

#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3597
3598
3599
/*
 * FTMAD, 16-bit: d[i] = n[i] * |m[i]| + coeff[x], where x is the
 * immediate from simd_data (0..7), offset by 8 when m[i] is negative.
 * NOTE(review): the table values are presumably the trigonometric
 * polynomial coefficients from the Arm FTMAD definition -- not
 * verifiable from this file alone.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            /* Use |m| and select from the second half of the table. */
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
3619
/* FTMAD, 32-bit; see sve_ftmad_h for the coefficient-selection scheme. */
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            /* Use |m| and select from the second half of the table. */
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
3641
/* FTMAD, 64-bit; see sve_ftmad_h for the coefficient-selection scheme. */
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            /* Use |m| and select from the second half of the table. */
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
3667
3668
3669
3670
3671
/*
 * FCADD: predicated FP complex add with rotate, 16-bit elements.
 * Elements form (real, imag) pairs at byte offsets (i, j); simd_data
 * selects the rotation by choosing which addend has its sign flipped
 * (XOR with neg_real / neg_imag, which are always opposite signs).
 * Inactive elements leave vd unchanged.  H1_2 is the correct 2-byte
 * host-endian swizzle for these 16-bit accesses.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    /* Walk backward, one 64-bit predicate word per outer iteration. */
    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* i holds the real index; j holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            /* Real and imag lanes are predicated independently. */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
3703
3704void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3705 void *vs, uint32_t desc)
3706{
3707 intptr_t j, i = simd_oprsz(desc);
3708 uint64_t *g = vg;
3709 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3710 float32 neg_real = float32_chs(neg_imag);
3711
3712 do {
3713 uint64_t pg = g[(i - 1) >> 6];
3714 do {
3715 float32 e0, e1, e2, e3;
3716
3717
3718 j = i - sizeof(float32);
3719 i -= 2 * sizeof(float32);
3720
3721 e0 = *(float32 *)(vn + H1_2(i));
3722 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3723 e2 = *(float32 *)(vn + H1_2(j));
3724 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3725
3726 if (likely((pg >> (i & 63)) & 1)) {
3727 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3728 }
3729 if (likely((pg >> (j & 63)) & 1)) {
3730 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3731 }
3732 } while (i & 63);
3733 } while (i != 0);
3734}
3735
3736void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3737 void *vs, uint32_t desc)
3738{
3739 intptr_t j, i = simd_oprsz(desc);
3740 uint64_t *g = vg;
3741 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3742 float64 neg_real = float64_chs(neg_imag);
3743
3744 do {
3745 uint64_t pg = g[(i - 1) >> 6];
3746 do {
3747 float64 e0, e1, e2, e3;
3748
3749
3750 j = i - sizeof(float64);
3751 i -= 2 * sizeof(float64);
3752
3753 e0 = *(float64 *)(vn + H1_2(i));
3754 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3755 e2 = *(float64 *)(vn + H1_2(j));
3756 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3757
3758 if (likely((pg >> (i & 63)) & 1)) {
3759 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3760 }
3761 if (likely((pg >> (j & 63)) & 1)) {
3762 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3763 }
3764 } while (i & 63);
3765 } while (i != 0);
3766}
3767
3768
3769
3770
3771
/*
 * FCMLA: predicated FP complex multiply-add with rotate, 16-bit elements.
 * rot (simd_data, 0..3) encodes the rotation: bit 0 selects whether the
 * real or imag part of n is the multiplicand (flip), and the sign masks
 * negate the real/imag products as required for the four rotations.
 * Inactive elements leave vd unchanged.  H1_2 is the correct 2-byte
 * host-endian swizzle for 16-bit accesses.
 */
void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    /* Walk backward, one 64-bit predicate word per outer iteration. */
    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* i holds the real index; j holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Real and imag lanes are predicated independently. */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
3816
3817void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3818 void *vg, void *status, uint32_t desc)
3819{
3820 intptr_t j, i = simd_oprsz(desc);
3821 unsigned rot = simd_data(desc);
3822 bool flip = rot & 1;
3823 float32 neg_imag, neg_real;
3824 uint64_t *g = vg;
3825
3826 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3827 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3828
3829 do {
3830 uint64_t pg = g[(i - 1) >> 6];
3831 do {
3832 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3833
3834
3835 j = i - sizeof(float32);
3836 i -= 2 * sizeof(float32);
3837
3838 nr = *(float32 *)(vn + H1_2(i));
3839 ni = *(float32 *)(vn + H1_2(j));
3840 mr = *(float32 *)(vm + H1_2(i));
3841 mi = *(float32 *)(vm + H1_2(j));
3842
3843 e2 = (flip ? ni : nr);
3844 e1 = (flip ? mi : mr) ^ neg_real;
3845 e4 = e2;
3846 e3 = (flip ? mr : mi) ^ neg_imag;
3847
3848 if (likely((pg >> (i & 63)) & 1)) {
3849 d = *(float32 *)(va + H1_2(i));
3850 d = float32_muladd(e2, e1, d, 0, status);
3851 *(float32 *)(vd + H1_2(i)) = d;
3852 }
3853 if (likely((pg >> (j & 63)) & 1)) {
3854 d = *(float32 *)(va + H1_2(j));
3855 d = float32_muladd(e4, e3, d, 0, status);
3856 *(float32 *)(vd + H1_2(j)) = d;
3857 }
3858 } while (i & 63);
3859 } while (i != 0);
3860}
3861
3862void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3863 void *vg, void *status, uint32_t desc)
3864{
3865 intptr_t j, i = simd_oprsz(desc);
3866 unsigned rot = simd_data(desc);
3867 bool flip = rot & 1;
3868 float64 neg_imag, neg_real;
3869 uint64_t *g = vg;
3870
3871 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3872 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3873
3874 do {
3875 uint64_t pg = g[(i - 1) >> 6];
3876 do {
3877 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3878
3879
3880 j = i - sizeof(float64);
3881 i -= 2 * sizeof(float64);
3882
3883 nr = *(float64 *)(vn + H1_2(i));
3884 ni = *(float64 *)(vn + H1_2(j));
3885 mr = *(float64 *)(vm + H1_2(i));
3886 mi = *(float64 *)(vm + H1_2(j));
3887
3888 e2 = (flip ? ni : nr);
3889 e1 = (flip ? mi : mr) ^ neg_real;
3890 e4 = e2;
3891 e3 = (flip ? mr : mi) ^ neg_imag;
3892
3893 if (likely((pg >> (i & 63)) & 1)) {
3894 d = *(float64 *)(va + H1_2(i));
3895 d = float64_muladd(e2, e1, d, 0, status);
3896 *(float64 *)(vd + H1_2(i)) = d;
3897 }
3898 if (likely((pg >> (j & 63)) & 1)) {
3899 d = *(float64 *)(va + H1_2(j));
3900 d = float64_muladd(e4, e3, d, 0, status);
3901 *(float64 *)(vd + H1_2(j)) = d;
3902 }
3903 } while (i & 63);
3904 } while (i != 0);
3905}
3906
3907
3908
3909
3910
3911
3912
3913
3914
/*
 * Load/store one element into/from @vd + @reg_off via the host pointer
 * @host.  Callers only invoke these when the controlling predicate bit
 * for the element is set.
 */
typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);

/*
 * Load/store one element for @vd + @reg_off at guest address @vaddr,
 * going through the cpu_ld/st softmmu path.  @retaddr is the host
 * return address for fault unwinding.
 */
typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                              target_ulong vaddr, uintptr_t retaddr);

/*
 * Generate the host-memory accessors.  TYPEM is the type in memory,
 * TYPEE the (possibly wider) element type in the vector register, so
 * loads sign/zero-extend and stores truncate; H swizzles the byte
 * offset for big-endian hosts.
 */
#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{                                                                      \
    TYPEM val = HOST(host);                                            \
    *(TYPEE *)(vd + H(reg_off)) = val;                                 \
}

#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }

/* Generate the guest-memory accessors via cpu_ld/st*_data_ra. */
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    *(TYPEE *)(vd + H(reg_off)) =                                           \
        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                      \
}

#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    TLB(env, useronly_clean_ptr(addr),                                      \
        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                            \
}
3954
/* One-byte memory accesses: host and tlb variants, no endian split. */
#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)

/* ld1b{b,h,s,d}{u,s}: byte loads, zero- or sign-extended to TE. */
DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)

#define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)

/* st1b{b,h,s,d}: byte stores, truncating from TE. */
DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)

/* Multi-byte accesses: big- and little-endian variants of each. */
#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)    \
    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)

#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)    \
    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)    \
    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)

/* Halfword loads/stores, with widening/extension as for bytes. */
DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)

DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)

/* Word loads/stores. */
DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)

DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)

/* Doubleword loads/stores. */
DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)

#undef DO_LD_TLB
#undef DO_ST_TLB
#undef DO_LD_HOST
#undef DO_LD_PRIM_1
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
4015
4016
4017
4018
4019
4020
4021static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4022 intptr_t reg_max, int esz)
4023{
4024 uint64_t pg_mask = pred_esz_masks[esz];
4025 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4026
4027
4028 if (likely(pg & 1)) {
4029 return reg_off;
4030 }
4031
4032 if (pg == 0) {
4033 reg_off &= -64;
4034 do {
4035 reg_off += 64;
4036 if (unlikely(reg_off >= reg_max)) {
4037
4038 return reg_max;
4039 }
4040 pg = vg[reg_off >> 6] & pg_mask;
4041 } while (pg == 0);
4042 }
4043 reg_off += ctz64(pg);
4044
4045
4046 tcg_debug_assert(reg_off < reg_max);
4047 return reg_off;
4048}
4049
4050
4051
4052
4053
4054
4055
/* Result of probing one guest page: see sve_probe_page. */
typedef struct {
    void *host;         /* host address for direct access, or NULL */
    int flags;          /* TLB_* flags from probe_access_flags */
    MemTxAttrs attrs;   /* memory attributes of the page */
} SVEHostPage;
4061
/*
 * Probe the page containing @addr + @mem_off for @access_type.  On
 * success fill in @info (host pointer rebased so that info->host +
 * mem_off addresses the element, access flags, memory attributes) and
 * return true.  Returns false only when a non-faulting (@nofault)
 * probe misses the TLB.
 */
static bool sve_probe_page(SVEHostPage *info, bool nofault,
                           CPUARMState *env, target_ulong addr,
                           int mem_off, MMUAccessType access_type,
                           int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * NOTE(review): useronly_clean_ptr presumably strips TBI/MTE tag
     * bits from the top of the address before the probe -- confirm
     * against its definition elsewhere in target/arm.
     */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        /* Only a nofault probe may return an invalid mapping here;
           a faulting probe reports the fault instead of returning. */
        g_assert(nofault);
        return false;
    }

    /* Undo the mem_off adjustment: rebase host to the vector origin. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
#else
    /*
     * Recover the memory attributes from the softmmu iotlb entry that
     * the successful probe has just ensured is valid.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
4120
4121
4122
4123
4124
4125
/* Fault handling mode for a contiguous load/store. */
typedef enum {
    FAULT_NO,       /* non-faulting (LDNF-style) */
    FAULT_FIRST,    /* first-faulting (LDFF-style) */
    FAULT_ALL,      /* normal: every access may fault */
} SVEContFault;

/*
 * Analysis of one contiguous load/store operation, which may span at
 * most two pages.  All fields default to -1 (see the memset in
 * sve_cont_ldst_elements) and are filled in only when applicable.
 */
typedef struct {
    /*
     * Offsets of the first/last active element wholly within each page:
     * index [0] is the first page, [1] the second.  mem_off_* are byte
     * offsets from the base address, reg_off_* byte offsets within the
     * vector register; they differ when inactive elements precede
     * active ones.  The [1] entries remain -1 when the access does not
     * extend onto a second page.
     */
    int16_t mem_off_first[2];
    int16_t reg_off_first[2];
    int16_t reg_off_last[2];

    /*
     * Offsets of an active element that itself crosses the page
     * boundary (only possible when the access is misaligned with
     * respect to the element size); -1 when there is none.
     */
    int16_t mem_off_split;
    int16_t reg_off_split;

    /*
     * Number of bytes of the access that lie on the first page;
     * -1 when the whole operation fits on one page.
     */
    int16_t page_split;

    /* Probe results for the (up to) two pages touched. */
    SVEHostPage page[2];
} SVEContLdSt;
4165
4166
4167
4168
4169
4170
/*
 * Analyze the active elements of a contiguous load/store of @reg_max
 * bytes at @addr, with element size 1 << @esz and @msize bytes moved
 * per element.  Fills in @info; returns false if the predicate is
 * entirely false (no memory is touched).
 */
static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
                                   uint64_t *vg, intptr_t reg_max,
                                   int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element offsets to -1, and the page probes to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Find the first and last active elements in the predicate. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* The entire predicate was false. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    /* Bytes remaining on the page containing addr. */
    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * The last element on the first page (whether active or not);
     * left at -1 when even the first element crosses the boundary.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Does an element straddle the page boundary? */
    if (page_split % msize != 0) {
        /* Record the split element only if it is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The split element is also the last active element. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * Advance to the first active element wholly on the second page;
     * one must exist, since reg_off_last lies beyond the split.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
4257
4258
4259
4260
4261
4262
/*
 * Probe the one or two pages of the operation described by @info.
 * Returns false only when a non-faulting probe of the first page
 * fails; otherwise returns whether any element can be processed.
 */
static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                                CPUARMState *env, target_ulong addr,
                                MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done (nofault probe missed). */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * Otherwise probe the second page, choosing the offset that should
     * be reported as the fault address if it is invalid.
     */
    if (info->mem_off_split >= 0) {
        /*
         * An element crosses the page boundary, so the fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If active elements precede the split element, they can be
         * completed without touching the second page.
         * NOTE(review): FAULT_FIRST (== 1 per the enum above) is being
         * assigned to a bool, i.e. nofault = true regardless of the
         * requested fault mode -- confirm this is intended rather than
         * e.g. (fault != FAULT_ALL) as in the else branch below.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * No element crosses the boundary; the fault address should
         * be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There was at least one active element on the first page, so
         * first-fault need not take a fault on the second page.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
4320
/*
 * Check watchpoints for every active element of the operation in
 * @info, then clear TLB_WATCHPOINT from the recorded page flags so
 * the fast path can proceed.  System emulation only.
 */
static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                                      uint64_t *vg, target_ulong addr,
                                      int esize, int msize, int wp_access,
                                      uintptr_t retaddr)
{
#ifndef CONFIG_USER_ONLY
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    /* Active elements wholly on the first page. */
    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    /* The active element that crosses the page boundary, if any. */
    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    /* Active elements wholly on the second page. */
    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
#endif
}
4384
4385typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4386
4387static inline QEMU_ALWAYS_INLINE
4388void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4389 uint64_t *vg, target_ulong addr, int esize,
4390 int msize, uint32_t mtedesc, uintptr_t ra,
4391 mte_check_fn *check)
4392{
4393 intptr_t mem_off, reg_off, reg_last;
4394
4395
4396 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4397 mem_off = info->mem_off_first[0];
4398 reg_off = info->reg_off_first[0];
4399 reg_last = info->reg_off_split;
4400 if (reg_last < 0) {
4401 reg_last = info->reg_off_last[0];
4402 }
4403
4404 do {
4405 uint64_t pg = vg[reg_off >> 6];
4406 do {
4407 if ((pg >> (reg_off & 63)) & 1) {
4408 check(env, mtedesc, addr, ra);
4409 }
4410 reg_off += esize;
4411 mem_off += msize;
4412 } while (reg_off <= reg_last && (reg_off & 63));
4413 } while (reg_off <= reg_last);
4414 }
4415
4416 mem_off = info->mem_off_first[1];
4417 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4418 reg_off = info->reg_off_first[1];
4419 reg_last = info->reg_off_last[1];
4420
4421 do {
4422 uint64_t pg = vg[reg_off >> 6];
4423 do {
4424 if ((pg >> (reg_off & 63)) & 1) {
4425 check(env, mtedesc, addr, ra);
4426 }
4427 reg_off += esize;
4428 mem_off += msize;
4429 } while (reg_off & 63);
4430 } while (reg_off <= reg_last);
4431 }
4432}
4433
/* Signature shared by the single/multi-access MTE check wrappers. */
typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
                                        uint64_t *vg, target_ulong addr,
                                        int esize, int msize, uint32_t mtedesc,
                                        uintptr_t ra);

/* MTE check with one tag check per element (mte_check1). */
static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
                                     uint64_t *vg, target_ulong addr,
                                     int esize, int msize, uint32_t mtedesc,
                                     uintptr_t ra)
{
    sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
                                mtedesc, ra, mte_check1);
}

/* MTE check covering multiple accesses per element (mte_checkN). */
static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
                                     uint64_t *vg, target_ulong addr,
                                     int esize, int msize, uint32_t mtedesc,
                                     uintptr_t ra)
{
    sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
                                mtedesc, ra, mte_checkN);
}
4456
4457
4458
4459
4460
/*
 * Common helper for all contiguous 1, 2, 3 and 4-register predicated loads.
 * N consecutive registers starting at rd are loaded with interleaved
 * elements; esz is the register element size, msz the memory element size.
 * mte_check_fn, when non-NULL and mtedesc != 0, performs the MTE tag check
 * over all active elements.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn,
               sve_cont_ldst_mte_check_fn *mte_check_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs.  */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mte_check_fn && mtedesc) {
        mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
                     mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end,
         * so that a failure part-way through leaves rd..rd+N-1 untouched.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /*
     * The entire operation is in RAM, on valid pages: no exception
     * can be raised past this point, so clear the destinations now
     * (inactive elements read as zero).
     */
    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    /* Elements residing entirely within the first page, via host memory. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements residing on the second page, via host memory. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
4604
/*
 * MTE entry point for contiguous loads: split the MTE descriptor out of
 * the combined simd descriptor, suppress checking where TBI/TCMA allow,
 * then forward to sve_ldN_r with the appropriate per-element checker.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
              N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
}
4627
/*
 * Expand plain and MTE helpers for one-register contiguous loads whose
 * memory element is one byte (MO_8); ESZ is the register element size,
 * larger than MO_8 for the sign/zero-extending forms.
 */
#define DO_LD1_1(NAME, ESZ)                                             \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
                            target_ulong addr, uint32_t desc)           \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb, NULL);               \
}                                                                       \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
}
4641
/*
 * Expand little- and big-endian, plain and MTE helpers for one-register
 * contiguous loads with a multi-byte memory element (MSZ >= MO_16).
 */
#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL);         \
}                                                                       \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL);         \
}                                                                       \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
}                                                                       \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
}
4667
/* Instantiate LD1 helpers for all element/memory size combinations. */
DO_LD1_1(ld1bb, MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh, MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss, MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd, MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
4690
/* Expand LD[234] helpers for N interleaved registers of byte elements. */
#define DO_LDN_1(N)                                                     \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
              sve_ld1bb_host, sve_ld1bb_tlb, NULL);                     \
}                                                                       \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
}
4704
/*
 * Expand LD[234] helpers for N interleaved registers with multi-byte
 * elements; register and memory element sizes are equal (ESZ).
 */
#define DO_LDN_2(N, SUFF, ESZ)                                          \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL);   \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL);   \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
}
4730
/* Instantiate LD2/LD3/LD4 helpers for each element size. */
DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4769{
4770 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4771
4772 if (i & 63) {
4773 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4774 i = ROUND_UP(i, 64);
4775 }
4776 for (; i < oprsz; i += 64) {
4777 ffr[i / 64] = 0;
4778 }
4779}
4780
4781
4782
4783
4784static inline QEMU_ALWAYS_INLINE
4785void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4786 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4787 const int esz, const int msz, const SVEContFault fault,
4788 sve_ldst1_host_fn *host_fn,
4789 sve_ldst1_tlb_fn *tlb_fn)
4790{
4791 const unsigned rd = simd_data(desc);
4792 void *vd = &env->vfp.zregs[rd];
4793 const intptr_t reg_max = simd_oprsz(desc);
4794 intptr_t reg_off, mem_off, reg_last;
4795 SVEContLdSt info;
4796 int flags;
4797 void *host;
4798
4799
4800 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4801
4802 memset(vd, 0, reg_max);
4803 return;
4804 }
4805 reg_off = info.reg_off_first[0];
4806
4807
4808 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4809
4810 tcg_debug_assert(fault == FAULT_NO);
4811 memset(vd, 0, reg_max);
4812 goto do_fault;
4813 }
4814
4815 mem_off = info.mem_off_first[0];
4816 flags = info.page[0].flags;
4817
4818
4819
4820
4821
4822 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
4823 mtedesc = 0;
4824 }
4825
4826 if (fault == FAULT_FIRST) {
4827
4828 if (mtedesc) {
4829 mte_check1(env, mtedesc, addr + mem_off, retaddr);
4830 }
4831
4832
4833
4834
4835
4836 bool is_split = mem_off == info.mem_off_split;
4837 if (unlikely(flags != 0) || unlikely(is_split)) {
4838
4839
4840
4841
4842 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4843
4844
4845 swap_memzero(vd, reg_off);
4846 reg_off += 1 << esz;
4847 mem_off += 1 << msz;
4848 swap_memzero(vd + reg_off, reg_max - reg_off);
4849
4850 if (is_split) {
4851 goto second_page;
4852 }
4853 } else {
4854 memset(vd, 0, reg_max);
4855 }
4856 } else {
4857 memset(vd, 0, reg_max);
4858 if (unlikely(mem_off == info.mem_off_split)) {
4859
4860 flags |= info.page[1].flags;
4861 if (unlikely(flags & TLB_MMIO)) {
4862
4863 goto do_fault;
4864 }
4865 if (unlikely(flags & TLB_WATCHPOINT) &&
4866 (cpu_watchpoint_address_matches
4867 (env_cpu(env), addr + mem_off, 1 << msz)
4868 & BP_MEM_READ)) {
4869
4870 goto do_fault;
4871 }
4872 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4873 goto do_fault;
4874 }
4875
4876
4877
4878
4879 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4880 goto second_page;
4881 }
4882 }
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905 if (unlikely(flags & TLB_MMIO)) {
4906 goto do_fault;
4907 }
4908
4909 reg_last = info.reg_off_last[0];
4910 host = info.page[0].host;
4911
4912 do {
4913 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4914 do {
4915 if ((pg >> (reg_off & 63)) & 1) {
4916 if (unlikely(flags & TLB_WATCHPOINT) &&
4917 (cpu_watchpoint_address_matches
4918 (env_cpu(env), addr + mem_off, 1 << msz)
4919 & BP_MEM_READ)) {
4920 goto do_fault;
4921 }
4922 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4923 goto do_fault;
4924 }
4925 host_fn(vd, reg_off, host + mem_off);
4926 }
4927 reg_off += 1 << esz;
4928 mem_off += 1 << msz;
4929 } while (reg_off <= reg_last && (reg_off & 63));
4930 } while (reg_off <= reg_last);
4931
4932
4933
4934
4935
4936
4937
4938 reg_off = info.reg_off_split;
4939 if (reg_off >= 0) {
4940 goto do_fault;
4941 }
4942
4943 second_page:
4944 reg_off = info.reg_off_first[1];
4945 if (likely(reg_off < 0)) {
4946
4947 return;
4948 }
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958 do_fault:
4959 record_fault(env, reg_off, reg_max);
4960}
4961
/*
 * MTE entry point for first-fault/no-fault loads: extract the MTE
 * descriptor, apply the TBI/TCMA early-suppression check, and forward
 * to sve_ldnfff1_r.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
                       uint32_t desc, const uintptr_t retaddr,
                       const int esz, const int msz, const SVEContFault fault,
                       sve_ldst1_host_fn *host_fn,
                       sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
                  esz, msz, fault, host_fn, tlb_fn);
}
4984
/*
 * Expand first-fault (ldff1) and no-fault (ldnf1) helpers, plain and MTE,
 * for byte memory elements.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}
5010
/*
 * Expand first-fault and no-fault helpers, both endiannesses, plain and
 * MTE, for multi-byte memory elements.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}
5060
/* Instantiate LDFF1/LDNF1 helpers for all element/memory combinations. */
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
5083
5084
5085
5086
5087
/*
 * Common helper for all contiguous 1, 2, 3 and 4-register predicated stores.
 * Mirrors sve_ldN_r, except that nothing is written for a false predicate
 * and there is no scratch buffer: a fault part-way through an MMIO store
 * leaves the store incomplete, which the architecture permits.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn,
               sve_cont_ldst_mte_check_fn *mte_check_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mte_check_fn && mtedesc) {
        mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
                     mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Elements residing entirely within the first page, via host memory. */
    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    /* Elements residing on the second page, via host memory. */
    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5216
/*
 * MTE entry point for contiguous stores; same descriptor split and
 * TBI/TCMA suppression as sve_ldN_r_mte.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
              N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
}
5239
/* Expand ST[1234] helpers, plain and MTE, for byte memory elements. */
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}
5253
/*
 * Expand ST[1234] helpers, both endiannesses, plain and MTE, for
 * multi-byte memory elements.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL);   \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL);   \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
5279
/* Instantiate ST1..ST4 helpers for all element/memory combinations. */
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
5308
5309
5310
5311
5312
5313
5314
5315
/* Extract one offset element from a vector of gather/scatter indexes. */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5317
5318static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5319{
5320 return *(uint32_t *)(reg + H1_4(reg_ofs));
5321}
5322
5323static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5324{
5325 return *(int32_t *)(reg + H1_4(reg_ofs));
5326}
5327
5328static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5329{
5330 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5331}
5332
5333static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5334{
5335 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5336}
5337
5338static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5339{
5340 return *(uint64_t *)(reg + reg_ofs);
5341}
5342
/*
 * Common helper for all gather loads: each active element loads msize
 * bytes from base + (offset-element << scale).  All loads go through a
 * scratch register so that a fault part-way through leaves the
 * destination unmodified.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                /* Bytes remaining before the end of this guest page. */
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check1(env, mtedesc, addr, retaddr);
                    }
                    host_fn(&scratch, reg_off, info.host);
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check1(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back.  */
    memcpy(vd, &scratch, reg_max);
}
5402
/* MTE entry point for gather loads. */
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): unlike sve_ldN_r_mte, no TBI/TCMA early suppression
     * is performed here -- each element has its own address, so a single
     * gross check against @base would not be valid for all elements.
     * TODO: confirm whether a per-element or base-derived check could
     * still be applied, as the contiguous paths do.
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
5423
/* Expand gather-load helpers for 32-bit vector elements (esize == 4). */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
5437
/* Expand gather-load helpers for 64-bit vector elements (esize == 8). */
#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
5451
/* Instantiate gather loads for every memory type / offset-form pair. */
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
5518
5519
5520
5521
5522
5523
5524
/*
 * Common helper for all gather first-faulting loads.  Only the first
 * active element may trap; any later element that would fault instead
 * terminates the load and records the position in FFR.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults: this load may trap
     * as usual for a first-fault operation.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check1(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults: any condition
     * that would trap instead terminates the load via `goto fault`.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe1(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
5605
/* MTE entry point for gather first-faulting loads. */
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * NOTE(review): as with sve_ld1_z_mte, no TBI/TCMA early suppression
     * is done here -- each element has its own address, so the single
     * bit-55 based check used by the contiguous paths does not apply.
     * TODO: confirm whether per-element suppression is feasible.
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
5627
/* Expand gather first-fault helpers for 32-bit vector elements. */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
5643
/*
 * As DO_LDFF1_ZPZ_S, but for 64-bit (MO_64) vector elements, using the
 * doubleword offset-extraction function off_##OFS##_d.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
5659
/*
 * Instantiate all first-fault gather-load helpers.  The MEM mnemonic
 * encodes memory element / register element sizes and signedness, and
 * the _le/_be suffix selects the endian-specific access routines; the
 * OFS mnemonic (zsu/zss/zd) selects how vector offsets are extracted.
 * Groups below are ordered: byte loads, then half/word/doubleword in
 * both endiannesses.
 */
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
5723
5724
5725
/*
 * Scatter store (store to vector of addresses), common worker.
 *
 * vd is the data register, vg the governing predicate, vm the vector of
 * offsets; base + (offset << scale) gives each element's address.
 * esize/msize are the register-element and memory-element sizes in
 * bytes; off_fn extracts an offset from vm, host_fn/tlb_fn perform the
 * actual store via host pointer or full tlb path respectively.
 * mtedesc is the MTE descriptor, or 0 to skip MTE checks.
 *
 * Structured as two passes so that no memory is modified before every
 * element's faults, watchpoints and MTE checks have been resolved.
 */
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    /* One slot per element at the smallest element size (4 bytes). */
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Pass 1: probe every active element for a host address and flags,
     * raising any faults now, before anything is written.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            /* Bytes remaining in the page containing addr. */
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses a page boundary: probe both pages
                     * but leave host[i] NULL so that pass 2 performs this
                     * store through the tlb path, which handles the split.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    /* Combine flags so checks below cover both pages. */
                    info.flags |= info2.flags;
                }

                /* Report write watchpoints before the data changes. */
                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                /* MTE check only for tagged memory and a valid mtedesc. */
                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check1(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Pass 2: perform all of the stores.  A non-NULL host[i] doubles as
     * the predicate check for the fast path, since only active elements
     * entirely within one page recorded a host address in pass 1; the
     * remaining active elements (page-crossing, or with no usable host
     * pointer) recompute their address and go through the tlb path.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
5806
5807static inline QEMU_ALWAYS_INLINE
5808void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5809 target_ulong base, uint32_t desc, uintptr_t retaddr,
5810 int esize, int msize, zreg_off_fn *off_fn,
5811 sve_ldst1_host_fn *host_fn,
5812 sve_ldst1_tlb_fn *tlb_fn)
5813{
5814 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5815
5816 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5817
5818
5819
5820
5821
5822
5823
5824 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5825 esize, msize, off_fn, host_fn, tlb_fn);
5826}
5827
/*
 * Instantiate the non-MTE and MTE helper pairs for a scatter store with
 * 32-bit register elements (esize 4).  MEM selects the store routines,
 * OFS the offset-extraction function, and 1 << MSZ is the memory
 * element size in bytes.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
5841
/*
 * As DO_ST1_ZPZ_S, but for 64-bit register elements (esize 8), using
 * the doubleword offset-extraction function off_##OFS##_d.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
5855
/*
 * Instantiate all scatter-store helpers, grouped by offset form
 * (zsu/zss/zd); within each group, ordered by memory element size with
 * _le/_be selecting the endian-specific store routines.
 */
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
5894