1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/exec-all.h"
23#include "exec/cpu_ldst.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27
28
29
30
/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup:
 * H1 adjusts a byte index, H1_2 a 16-bit index, H1_4 a 32-bit index
 * (all expressed in bytes); H2/H4 adjust indices into arrays of
 * uint16_t/uint32_t.  On little-endian hosts they are identities.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif

/*
 * Initial NZCV state for the PredTest iterators below: C (bit 0) set,
 * N (bit 31) and the !Z accumulator (bit 1) clear.  Bit 2 is reserved
 * by the iterators to record "first active element already seen".
 */
#define PREDTEST_INIT 1
54
55
56
57
/*
 * One step of the ARM PredTest flag computation, iterating forward
 * (low elements first) over 64-bit predicate words.  D is the result
 * predicate word, G the governing predicate word, FLAGS the state so
 * far (start from PREDTEST_INIT).  Returns the updated flags:
 * N in bit 31, !Z accumulator in bit 1, C in bit 0; bit 2 is internal
 * state marking that the first active G bit has been processed.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from the first D & G bit.
           Use bit 2 to signal the first G bit has been seen.  */
        if (!(flags & 4)) {
            /* g & -g isolates the lowest set (first active) bit of G.  */
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate !Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from the last !(D & G); pow2floor(g) isolates the
           highest set bit of G.  Replace any previous C value.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
76
77
78
79
/*
 * The same as iter_predtest_fwd, but iterating backward (high words
 * first).  C must therefore come from the first word processed that
 * has any G bit set, and N from the last; the roles of the lowest/
 * highest set bit of G are swapped relative to the forward iterator.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from the first (i.e. last-iterated) !(D & G).
           Use bit 2 to signal the first G bit has been seen.  */
        if (!(flags & 4)) {
            /* Set bit 2 and clear the initial C (PREDTEST_INIT) in one add.  */
            flags += 4 - 1;
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate !Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from the last (lowest) D & G; replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
98
99
/* PredTest for a single predicate word.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
104
105
/* PredTest for a multi-word predicate; WORDS is at least 1.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    /* do/while: the first word is always processed, matching words >= 1.  */
    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
118
119
120
121
122
123
124
125
126
127
128
129
/*
 * Expand one byte of a predicate into a 64-bit byte-element mask:
 * bit N of BYTE becomes byte N of the result (0xff if set, 0x00 if
 * clear).  A 256-entry lookup table keeps this hot path to a single
 * load; word[i] is the expansion of i.
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/*
 * Expand one byte of a predicate into a 64-bit halfword-element mask.
 * Only the low bit of each 2-bit predicate field is significant, i.e.
 * bits 0, 2, 4 and 6 of BYTE; bit 2*N selects 0xffff in halfword N.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    uint64_t result = 0;
    int elt;

    for (elt = 0; elt < 4; elt++) {
        if (byte & (1u << (2 * elt))) {
            result |= 0xffffull << (16 * elt);
        }
    }
    return result;
}
251
252
/*
 * Expand one byte of a predicate into a 64-bit word-element mask.
 * Only the low bit of each 4-bit predicate field is significant:
 * bit 0 selects the low 32-bit word, bit 4 the high word.
 */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    uint64_t result = 0;

    if (byte & 0x01) {
        result |= 0x00000000ffffffffull;
    }
    if (byte & 0x10) {
        result |= 0xffffffff00000000ull;
    }
    return result;
}
262
263
/* Swap the two 16-bit halfwords within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return (h << 16) | (h >> 16);
}
268
269
/* Reverse the order of the four 16-bit halfwords within a 64-bit word:
 * first rotate the two 32-bit halves, then swap halfwords within each.  */
static inline uint64_t hswap64(uint64_t h)
{
    const uint64_t even_halfwords = 0x0000ffff0000ffffull;

    h = (h << 32) | (h >> 32);
    return ((h & even_halfwords) << 16) | ((h >> 16) & even_halfwords);
}
276
277
/* Swap the two 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return (h << 32) | (h >> 32);
}
282
/*
 * Define a predicate-logical helper: for each 64-bit word, compute
 * FUNC(n, m, g) where g is the governing predicate word.  The operand
 * size from DESC is in bytes; predicates are processed 64 bits at a
 * time (opr_sz / 8 words).
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}
293
/* Per-bit predicate logical operations, all masked by G except SEL,
 * which merges N and M under control of G.  */
#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
321
322
323
324
325
326
327
328
329
/*
 * Fully general predicated binary operation on vector elements of TYPE.
 * Where the low bit of the element's predicate field is set, store
 * OP(n, m) into VD; inactive elements of VD are left unchanged.
 * H is the host-endian byte-index fixup matching sizeof(TYPE).
 * The vector is walked in 16-byte groups: 16 predicate bits are
 * fetched at once and PG is shifted right by the element size in
 * bytes each step, so the element's predicate bit is always bit 0.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
346
347
/*
 * Similarly, specialized for 64-bit elements: each element has one
 * predicate byte, of which only the low bit is significant, so the
 * loop can index arrays directly with no endian fixup on the data.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}
361
/* Element-wise operations plugged into DO_ZPZZ/DO_ZPZZ_D; signedness
 * comes from the TYPE the macro is instantiated with.  */
#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by zero and
 * signed division of INT_MIN by -1.  Both have architecturally defined
 * results for SVE: divide-by-zero yields 0, and division by -1 is
 * special-cased as negation (correct modulo 2^n for the overflow case).
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
383
/* Predicated bitwise and integer arithmetic for each element size.
 * The H fixup always matches the element size: H1 for _b, H1_2 for _h,
 * H1_4 for _s; 64-bit forms use DO_ZPZZ_D.  */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
444
445
/* High half of an 8x8 multiply.  The int32_t computation type is wide
 * enough for both signed and unsigned 8-bit sources, so this one helper
 * serves smulh and umulh.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    int32_t product = n * m;
    return product >> 8;
}
450
/* High half of a 16x16 multiply; int32_t is wide enough for both
 * signed and unsigned 16-bit sources.  */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    int32_t product = n * m;
    return product >> 16;
}
455
/* High half of a 32x32 multiply; int64_t is wide enough for both
 * signed and unsigned 32-bit sources.  */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    int64_t product = n * m;
    return product >> 32;
}
460
/* High 64 bits of a signed 64x64->128 multiply, via muls64.
 * The uint64_t parameters carry the signed bit patterns unchanged.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}
467
/* High 64 bits of an unsigned 64x64->128 multiply, via mulu64.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
474
/* Predicated multiply, high-multiply and divide.  SVE defines sdiv/udiv
 * only for 32- and 64-bit elements.  */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
495
496
497
/* Shifts by an unbounded count: SVE saturates the count rather than
 * taking it modulo the element width, so clamp ASR to width-1 and let
 * over-wide LSR/LSL produce 0 (avoiding C UB for shifts >= width).  */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
501
502DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
504DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
505
506DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
507DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
509
510DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
511DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
512DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
513
514DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
517
518#undef DO_ZPZZ
519#undef DO_ZPZZ_D
520
521
522
523
524
/*
 * Predicated binary operation where VM holds one wide (TYPEW, 64-bit)
 * value per 8 bytes of VN: each group of narrow elements shares the
 * wide operand, as for the SVE wide-element shift forms.  Eight
 * predicate bits are fetched per group.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}
541
/* Predicated shifts by a 64-bit wide shift count.  */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
555
556
557
/*
 * Fully general predicated unary operation: where the element's
 * predicate bit is set, VD gets OP(VN element); inactive elements of
 * VD are left unchanged.  Structure matches DO_ZPZZ above.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
573
574
/* Similarly, specialized for 64-bit elements (cf. DO_ZPZZ_D).  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}
588
/* Count leading sign/zero bits: the 8/16-bit values are widened into
 * 32-bit registers, so subtract the extra leading bits.  */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element.  */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical not: 1 if the element is zero, else 0.  */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* Float abs/neg operate on the raw bits: clear or toggle the sign bit
 * ((__typeof(N))-1 >> 1 is the all-ones mask minus the sign bit).  */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign/zero extension of the low 8/16/32 bits of an element.  */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Reverse bytes / halfwords / words within an element.  */
DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

/* Reverse the bits within an element.  */
DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
684
685
686
/*
 * Unpredicated binary operation with one wide (64-bit) operand per
 * 8 bytes of narrow elements; the unpredicated counterpart of DO_ZPZW.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}
700
/* Unpredicated shifts by a 64-bit wide shift count.  */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
726
727
728
729
730
731
732
733
734
735
/*
 * Predicated horizontal reduction: fold OP over the active elements of
 * VN, starting from INIT.  TYPERED is the accumulation type (possibly
 * wider than the element, e.g. uint64_t for saddv/uaddv) and TYPERET
 * the type the result is truncated to before being widened into the
 * uint64_t return value.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}
753
/* Similarly, specialized for 64-bit elements.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}
769
/* Horizontal reductions.  The INIT value is the identity for each
 * operation (0 for or/eor/add, all-ones for and and unsigned min,
 * type extrema for signed min/max).  Signed addv accumulates sign-
 * extended elements into uint64_t; the 64-bit case is uaddv only.  */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
816
817
/*
 * Unpredicated binary operation between each element of VN and a
 * scalar S64 (truncated to the element TYPE).
 */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    TYPE s = s64, *d = vd, *n = vn; \
    for (i = 0; i < opr_sz; ++i) { \
        d[i] = OP(n[i], s); \
    } \
}
827
/* Reversed subtract: immediate minus element.  */
#define DO_SUBR(X, Y) (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
872
873
874
875
/*
 * Return the bit index of the last active (predicated true) element
 * for an element size of (1 << esz) bytes, scanning WORDS 64-bit
 * predicate words from the top.  Returns a negative value
 * ((intptr_t)-1 << esz, i.e. minus one element) if none are active.
 */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    /* pred_esz_masks[esz] selects the significant predicate bit of
       each element for this element size.  */
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
889
/*
 * PFIRST: set in VD the first active bit of VG (if VD does not already
 * terminate earlier), modifying VD in place, and return the PredTest
 * flags for the result.  WORDS is at least 1.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G (this_g & -this_g is the
                   lowest set bit); flags bit 2 marks it already done.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
912
/*
 * PNEXT: rewrite VD to contain only the next active element of VG
 * after the last active element currently in VD (or none), and return
 * the PredTest flags for the result.  PRED_DESC packs the word count
 * in the low SIMD_OPRSZ_BITS and the element-size log2 at
 * SIMD_DATA_SHIFT.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* First candidate bit: one element past the last active in VD.  */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        /* Trim off the bits below NEXT in its word, then scan from
           the start of that word.  */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write out a predicate containing only bit NEXT (or nothing, if
       NEXT ran off the end) and compute the flags in the same pass.  */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
956
957
958
959
960
961
962
963
964
965
/* Zero the active (predicated) elements of VD in place, per element
 * size; the expand_pred_* helpers turn predicate bytes into masks.  */
void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        /* Each 64-bit element has one predicate byte; low bit governs.  */
        if (pg[H1(i)] & 1) {
            d[i] = 0;
        }
    }
}
1007
1008
/* Copy the active elements of VN to VD, zeroing the inactive elements,
 * per element size.  */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        /* -(pg & 1) is all-ones when active, zero when inactive.  */
        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
    }
}
1048
1049
1050
/*
 * Predicated unary operation with an immediate taken from
 * simd_data(desc); structure matches DO_ZPZ above.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
1067
1068
/* Similarly, specialized for 64-bit elements.  */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}
1083
/* Shifts by immediate; the translator guarantees the immediate is in
 * range for the element size, so no clamping is needed here.  */
#define DO_SHR(N, M) (N >> M)
#define DO_SHL(N, M) (N << M)

/* Arithmetic shift right for division: add (1 << M) - 1 to a negative
 * dividend before shifting so that the quotient rounds toward zero,
 * as signed division requires.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)

DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
1117
1118
1119
/*
 * Define a predicated three-source helper (accumulator A plus operands
 * N and M) for element TYPE of 1, 2 or 4 bytes.  Predicate layout and
 * merging semantics are the same as DO_ZPZI: 16 vector bytes per
 * uint16_t of predicate, low bit of each element group active.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                     \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,  \
                  void *vg, uint32_t desc)              \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; ) {                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do {                                            \
            if (pg & 1) {                               \
                TYPE nn = *(TYPE *)(vn + H(i));         \
                TYPE mm = *(TYPE *)(vm + H(i));         \
                TYPE aa = *(TYPE *)(va + H(i));         \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);  \
            }                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);     \
        } while (i & 15);                               \
    }                                                   \
}
1138
1139
/* As DO_ZPZZZ, but for 64-bit elements: one predicate byte per element. */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                      \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,  \
                  void *vg, uint32_t desc)              \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;          \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;            \
    uint8_t *pg = vg;                                   \
    for (i = 0; i < opr_sz; i += 1) {                   \
        if (pg[H1(i)] & 1) {                            \
            TYPE aa = a[i], nn = n[i], mm = m[i];       \
            d[i] = OP(aa, nn, mm);                      \
        }                                               \
    }                                                   \
}
1154
/* Multiply-accumulate and multiply-subtract; unsigned wrap is well defined. */
#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1174
1175void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177{
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1182 }
1183}
1184
1185void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187{
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1192 }
1193}
1194
1195void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1197{
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1202 }
1203}
1204
1205void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1207{
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1212 }
1213}
1214
1215void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216{
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223}
1224
1225void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226{
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1232 }
1233}
1234
1235void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236{
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242 }
1243}
1244
1245void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246{
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252 }
1253}
1254
/*
 * FEXPA, half precision.  The low 5 bits of each input element index a
 * table giving the low 10 fraction bits of the result (coefficients of
 * 2**(i/32) per the Arm ARM FEXPA pseudocode -- values not derived here,
 * taken as given); bits 5-9 of the input are placed directly into the
 * float16 exponent field (bit 10 upward).
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);   /* table index */
        uint16_t exp = extract32(nn, 5, 5);   /* exponent field */
        d[i] = coeff[idx] | (exp << 10);
    }
}
1274
/*
 * FEXPA, single precision.  The low 6 bits index a 64-entry table of
 * the low 23 fraction bits (coefficients of 2**(i/64) per the Arm ARM
 * FEXPA pseudocode); bits 6-13 become the float32 exponent (bit 23 up).
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);   /* table index */
        uint32_t exp = extract32(nn, 6, 8);   /* exponent field */
        d[i] = coeff[idx] | (exp << 23);
    }
}
1306
/*
 * FEXPA, double precision.  The low 6 bits index a 64-entry table of
 * the low 52 fraction bits (coefficients of 2**(i/64) per the Arm ARM
 * FEXPA pseudocode); bits 6-16 become the float64 exponent (bit 52 up).
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);    /* table index */
        uint64_t exp = extract32(nn, 6, 11);   /* exponent field */
        d[i] = coeff[idx] | (exp << 52);
    }
}
1344
1345void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1346{
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1354 }
1355 d[i] = nn ^ (mm & 2) << 14;
1356 }
1357}
1358
1359void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1360{
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1368 }
1369 d[i] = nn ^ (mm & 2) << 30;
1370 }
1371}
1372
1373void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1374{
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1382 }
1383 d[i] = nn ^ (mm & 2) << 62;
1384 }
1385}
1386
1387
1388
1389
1390
1391void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392{
1393 intptr_t i, oprsz = simd_oprsz(desc);
1394
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1401 }
1402 *(int8_t *)(d + i) = r;
1403 }
1404}
1405
1406void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407{
1408 intptr_t i, oprsz = simd_oprsz(desc);
1409
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1416 }
1417 *(int16_t *)(d + i) = r;
1418 }
1419}
1420
1421void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422{
1423 intptr_t i, oprsz = simd_oprsz(desc);
1424
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1431 }
1432 *(int32_t *)(d + i) = r;
1433 }
1434}
1435
/*
 * Signed saturating add of immediate B to each signed 64-bit element.
 * No wider type is available, so overflow is detected after the fact
 * with a sign trick.  NOTE(review): the wrapping add itself relies on
 * two's-complement wrap of signed arithmetic (QEMU builds with -fwrapv)
 * -- confirm against the build flags.
 */
void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        /*
         * Overflow iff the addends have the same sign (ai^b non-negative)
         * and the result's sign differs from them (r^ai negative).
         */
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* A wrapped-negative result means positive overflow.  */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}
1450
1451
1452
1453
1454
1455void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456{
1457 intptr_t i, oprsz = simd_oprsz(desc);
1458
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1465 }
1466 *(uint8_t *)(d + i) = r;
1467 }
1468}
1469
1470void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471{
1472 intptr_t i, oprsz = simd_oprsz(desc);
1473
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1480 }
1481 *(uint16_t *)(d + i) = r;
1482 }
1483}
1484
1485void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486{
1487 intptr_t i, oprsz = simd_oprsz(desc);
1488
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1495 }
1496 *(uint32_t *)(d + i) = r;
1497 }
1498}
1499
1500void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1501{
1502 intptr_t i, oprsz = simd_oprsz(desc);
1503
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1506 if (r < b) {
1507 r = UINT64_MAX;
1508 }
1509 *(uint64_t *)(d + i) = r;
1510 }
1511}
1512
1513void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514{
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520 }
1521}
1522
1523
1524
1525
1526void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1528{
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1532
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1538 }
1539}
1540
1541void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1543{
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1547
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1553 }
1554}
1555
1556void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1558{
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1562
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1568 }
1569}
1570
1571void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1573{
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1577
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581 }
1582}
1583
1584void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585{
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1589
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1593 }
1594}
1595
1596void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1597{
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1601
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1605 }
1606}
1607
1608void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1609{
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1613
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1617 }
1618}
1619
1620void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621{
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1625
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1628 }
1629}
1630
1631
1632
1633
/*
 * Move N bytes from VS to VD through the host-endian address swizzles
 * (H1/H1_2/H1_4 XOR the low address bits on big-endian hosts).  On
 * little-endian hosts this degenerates to memmove.  On big-endian
 * hosts, the largest unit size shared by the alignment of source,
 * destination and length is copied unit-at-a-time through the matching
 * H macro; the copy direction is chosen for memmove-safe overlap.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    /* Common alignment of dest, source and length, modulo 8. */
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    /* No swizzle on little-endian hosts; always take the memmove path. */
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* 4-byte units; copy forward unless dest overlaps source tail. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        /* 2-byte units. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Byte units. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
1690
/*
 * EXT: the result is the bytes of N from offset n_ofs upward,
 * followed by the first n_ofs bytes of M.  The three cases order the
 * moves so that an operand aliasing D is never clobbered before it is
 * read; when D, N and M all alias, the M fragment is staged in a
 * temporary.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        /* M is safe: fill from N first, then append M's head. */
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* D == M: move D's head into place before overwriting with N. */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* D == N == M: stage the wrap-around fragment. */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
1711
/*
 * INSR: shift the vector up by one element and insert VAL at element 0.
 * swap_memmove handles the overlapping move and host-endian swizzle.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc)  \
{                                                                   \
    intptr_t opr_sz = simd_oprsz(desc);                             \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));     \
    *(TYPE *)(vd + H(0)) = val;                                     \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR
1726
/*
 * REV, bytes: reverse the order of all byte elements.  Processed as
 * 64-bit words swapped end-for-end, each byte-reversed with bswap64.
 * The middle word (odd word count) is handled when i == j.
 */
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}
1737
/*
 * REV, halfwords: as sve_rev_b, but each word's 16-bit units are
 * reversed with hswap64 instead of full byte reversal.
 */
void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}
1748
/*
 * REV, words: rol64(x, 32) swaps the two 32-bit halves of each 64-bit
 * word, completing the element reversal across the swapped word pair.
 */
void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}
1759
1760void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1761{
1762 intptr_t i, j, opr_sz = simd_oprsz(desc);
1763 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764 uint64_t f = *(uint64_t *)(vn + i);
1765 uint64_t b = *(uint64_t *)(vn + j);
1766 *(uint64_t *)(vd + i) = b;
1767 *(uint64_t *)(vd + j) = f;
1768 }
1769}
1770
1771#define DO_TBL(NAME, TYPE, H) \
1772void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1773{ \
1774 intptr_t i, opr_sz = simd_oprsz(desc); \
1775 uintptr_t elem = opr_sz / sizeof(TYPE); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1777 ARMVectorReg tmp; \
1778 if (unlikely(vd == vn)) { \
1779 n = memcpy(&tmp, vn, opr_sz); \
1780 } \
1781 for (i = 0; i < elem; i++) { \
1782 TYPE j = m[H(i)]; \
1783 d[H(i)] = j < elem ? n[H(j)] : 0; \
1784 } \
1785}
1786
1787DO_TBL(sve_tbl_b, uint8_t, H1)
1788DO_TBL(sve_tbl_h, uint16_t, H2)
1789DO_TBL(sve_tbl_s, uint32_t, H4)
1790DO_TBL(sve_tbl_d, uint64_t, )
1791
1792#undef TBL
1793
/*
 * UNPK: widen the low half of N (opr_sz/2 bytes) into full-width D
 * elements, sign- or zero-extended per TYPED/TYPES.  The source is
 * staged in a temporary when it lies within the destination's range,
 * since D grows faster than N is consumed.
 * NOTE(review): "vn - vd < opr_sz" is a raw pointer difference with no
 * unsigned cast, so a source below the destination is never copied --
 * presumably safe because distinct vector registers never partially
 * overlap; confirm against the register layout.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)  \
{                                                     \
    intptr_t i, opr_sz = simd_oprsz(desc);            \
    TYPED *d = vd;                                    \
    TYPES *n = vn;                                    \
    ARMVectorReg tmp;                                 \
    if (unlikely(vn - vd < opr_sz)) {                 \
        n = memcpy(&tmp, n, opr_sz / 2);              \
    }                                                 \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {    \
        d[HD(i)] = n[HS(i)];                          \
    }                                                 \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

#undef DO_UNPK
1818
1819
1820
1821
1822
/*
 * Mask of the low 2**N bits of each 2**(N+1)-bit unit, for N in [0, 4].
 * Shared by the predicate bit expand/compress/permute routines below.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};

/*
 * Zero-extend units of 2**N bits to units of 2**(N+1) bits, doubling
 * the spacing of the low 32 bits of X across the 64-bit result.
 * Works top-down: each pass overlays a copy of X shifted up by the
 * half-unit size, then masks so each unit keeps only its low half.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    x &= 0xffffffffu;
    for (int i = 4; i >= n; i--) {
        int half = 1 << i;
        x = ((x << half) | x) & even_bit_esz_masks[i];
    }
    return x;
}
1847
1848
1849
1850
1851
1852
/*
 * Inverse of expand_bits: compress units of 2**(N+1) bits down to units
 * of 2**N bits by keeping the low half of each unit and halving the
 * spacing, bottom-up.  Only the low 32 bits of the result are valid.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        /* Keep the low half of each unit, then fold pairs together. */
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
1864
/*
 * ZIP for predicates: interleave units of 2**esz bits from the low
 * (or high, per the flag in pred_desc) halves of N and M.  pred_desc
 * packs oprsz-2 in its low SIMD_OPRSZ_BITS, then esz, then the
 * high-half selector at SIMD_DATA_SHIFT.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* The entire result fits in a single 64-bit word. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;   /* bits in each source half */

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        /* The expanded units are disjoint, so + behaves as |. */
        d[0] = nn + (mm << (1 << esz));
    } else {
        ARMPredicateReg tmp_n, tmp_m;

        /* Output is produced faster than input is consumed, so copy
           any source that overlaps the destination. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if ((vm - vd) < (uintptr_t)oprsz) {
            vm = memcpy(&tmp_m, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;   /* now a byte offset into the sources */
        }

        if ((high & 3) == 0) {
            uint32_t *n = vn, *m = vm;
            high >>= 2;          /* word offset */

            /* Each 32 source bits expand into one 64-bit output word. */
            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn + (mm << (1 << esz));
            }
        } else {
            /* Byte-aligned high offset only: work a byte at a time,
               producing 16 output bits per source byte. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn + (mm << (1 << esz));
            }
        }
    }
}
1925
/*
 * UZP for predicates: concatenate the even (odd=0) or odd units of
 * 2**esz bits from N with those from M.  Each pair of 64-bit source
 * words compresses to one 64-bit output word.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    /* Pre-shift so that ">> odd" selects the odd units when requested. */
    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        /* Each source compresses to 4*oprsz bits; glue the halves. */
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* N's result fills the bottom half of D. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l + (h << 32);
        }

        /*
         * When oprsz is not a multiple of 16 the M results land at a
         * misaligned byte offset, so they are staged in tmp_m and then
         * moved into the top half of D.
         * NOTE(review): the tail reads only n[2*i] / m[2*i], presumably
         * relying on predicate bits beyond oprsz being zero -- confirm
         * against the callers.
         */
        if (oprsz & 15) {
            d[i] = compress_bits(n[2 * i] >> odd, esz);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l + (h << 32);
            }
            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned: write M's results directly after N's half. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l + (h << 32);
            }
        }
    }
}
1982
1983void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1984{
1985 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1986 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1987 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1988 uint64_t *d = vd, *n = vn, *m = vm;
1989 uint64_t mask;
1990 int shr, shl;
1991 intptr_t i;
1992
1993 shl = 1 << esz;
1994 shr = 0;
1995 mask = even_bit_esz_masks[esz];
1996 if (odd) {
1997 mask <<= shl;
1998 shr = shl;
1999 shl = 0;
2000 }
2001
2002 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2003 uint64_t nn = (n[i] & mask) >> shr;
2004 uint64_t mm = (m[i] & mask) << shl;
2005 d[i] = nn + mm;
2006 }
2007}
2008
2009
/*
 * Reverse the order of units of 2**N bits within a 64-bit word:
 * first reverse the bytes wholesale, then swap progressively smaller
 * sub-units within each byte until the unit size 2**N is reached.
 */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
2021
/*
 * Reverse the order of units of 2**N bits within one byte, by swapping
 * progressively smaller halves (nibbles, pairs, single bits) down to
 * the requested unit size.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };

    for (int i = 2; i >= n; i--) {
        int sh = 1 << i;
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
2032
/*
 * REV for predicates: reverse the order of units of 2**esz bits over
 * the whole oprsz-byte predicate.  Small predicates are handled as one
 * shifted 64-bit word; multiples of 16 bytes word-at-a-time; otherwise
 * byte-at-a-time.
 */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Shift the payload to the top so reversal lands it at bit 0. */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Swap mirrored 64-bit words, bit-reversing each. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Swap mirrored bytes, bit-reversing each. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
2062
/*
 * PUNPK: widen predicate bits from the low (or high) half of N,
 * spreading each bit to every second position (expand_bits with n=0).
 * Structure mirrors sve_zip_p's single-word / word-aligned / byte
 * fallback cases.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Whole result fits in one 64-bit word. */
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;   /* bits in each source half */

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* Output is produced faster than input is consumed. */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;   /* byte offset of the high half */
        }

        if ((high & 3) == 0) {
            uint32_t *n = vn;
            high >>= 2;          /* word offset */

            /* 32 source bits expand to one 64-bit output word. */
            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Unaligned offset: one source byte -> 16 output bits. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
2108
2109#define DO_ZIP(NAME, TYPE, H) \
2110void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2111{ \
2112 intptr_t oprsz = simd_oprsz(desc); \
2113 intptr_t i, oprsz_2 = oprsz / 2; \
2114 ARMVectorReg tmp_n, tmp_m; \
2115
2116 \
2117 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2118 vn = memcpy(&tmp_n, vn, oprsz_2); \
2119 } \
2120 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2121 vm = memcpy(&tmp_m, vm, oprsz_2); \
2122 } \
2123 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2124 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2125 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2126 } \
2127}
2128
2129DO_ZIP(sve_zip_b, uint8_t, H1)
2130DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132DO_ZIP(sve_zip_d, uint64_t, )
2133
/*
 * UZP: gather the even (odd_ofs == 0) or odd elements of the N:M
 * concatenation; N's picks fill the low half of D, M's the high half.
 * M is copied when it overlaps D, since D's low half is written before
 * M is read.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t oprsz = simd_oprsz(desc);                                      \
    intptr_t oprsz_2 = oprsz / 2;                                           \
    intptr_t odd_ofs = simd_data(desc);                                     \
    intptr_t i;                                                             \
    ARMVectorReg tmp_m;                                                     \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                           \
        vm = memcpy(&tmp_m, vm, oprsz);                                     \
    }                                                                       \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                           \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));          \
    }                                                                       \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                           \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                       \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )
2157
/*
 * TRN: for each pair of element slots, take the even (odd_ofs == 0) or
 * odd element of N and of M and write them side by side.  Sources are
 * read before the pair is written, so no overlap copy is needed.
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t oprsz = simd_oprsz(desc);                          \
    intptr_t odd_ofs = simd_data(desc);                         \
    intptr_t i;                                                 \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {             \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));               \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));               \
        *(TYPE *)(vd + H(i + 0)) = ae;                          \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;               \
    }                                                           \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
2180
2181void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2182{
2183 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2184 uint32_t *d = vd, *n = vn;
2185 uint8_t *pg = vg;
2186
2187 for (i = j = 0; i < opr_sz; i++) {
2188 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2189 d[H4(j)] = n[H4(i)];
2190 j++;
2191 }
2192 }
2193 for (; j < opr_sz; j++) {
2194 d[H4(j)] = 0;
2195 }
2196}
2197
2198void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2199{
2200 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201 uint64_t *d = vd, *n = vn;
2202 uint8_t *pg = vg;
2203
2204 for (i = j = 0; i < opr_sz; i++) {
2205 if (pg[H1(i)] & 1) {
2206 d[j] = n[i];
2207 j++;
2208 }
2209 }
2210 for (; j < opr_sz; j++) {
2211 d[j] = 0;
2212 }
2213}
2214
2215
2216
2217
2218
/*
 * Return the index of the last active element of the predicate, or a
 * negative value if none; decodes oprsz/esz from pred_desc and
 * delegates to last_active_element() over the 64-bit predicate words.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);

    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
}
2226
/*
 * SPLICE: copy the contiguous span of N from the first through the
 * last active element to the start of D, then fill the remainder from
 * the start of M.  One predicate bit corresponds to one vector byte,
 * so bit indices within the predicate double as byte offsets into the
 * vector.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;   /* predicate bytes / vector words */
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Scan predicate words from the top down, recording the word and
       word contents holding the first and last active elements. */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert to bit positions == vector byte offsets. */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        /* Span length in bytes, including the last element. */
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
2263
2264void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265 void *vg, uint32_t desc)
2266{
2267 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268 uint64_t *d = vd, *n = vn, *m = vm;
2269 uint8_t *pg = vg;
2270
2271 for (i = 0; i < opr_sz; i += 1) {
2272 uint64_t nn = n[i], mm = m[i];
2273 uint64_t pp = expand_pred_b(pg[H1(i)]);
2274 d[i] = (nn & pp) | (mm & ~pp);
2275 }
2276}
2277
2278void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279 void *vg, uint32_t desc)
2280{
2281 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 uint64_t *d = vd, *n = vn, *m = vm;
2283 uint8_t *pg = vg;
2284
2285 for (i = 0; i < opr_sz; i += 1) {
2286 uint64_t nn = n[i], mm = m[i];
2287 uint64_t pp = expand_pred_h(pg[H1(i)]);
2288 d[i] = (nn & pp) | (mm & ~pp);
2289 }
2290}
2291
2292void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293 void *vg, uint32_t desc)
2294{
2295 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296 uint64_t *d = vd, *n = vn, *m = vm;
2297 uint8_t *pg = vg;
2298
2299 for (i = 0; i < opr_sz; i += 1) {
2300 uint64_t nn = n[i], mm = m[i];
2301 uint64_t pp = expand_pred_s(pg[H1(i)]);
2302 d[i] = (nn & pp) | (mm & ~pp);
2303 }
2304}
2305
2306void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2308{
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311 uint8_t *pg = vg;
2312
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2316 }
2317}
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
/*
 * Vector-vs-vector integer compare, writing a predicate result and
 * returning NZCV flags as for PTEST.  Iterates backward, building one
 * 64-bit predicate word per outer iteration; MASK restricts the guard
 * to the bit that begins each element.  iter_predtest_bwd accumulates
 * the flags because iteration runs from the last word to the first.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2361
/*
 * Per-element-size expansions of DO_CMP_PPZZ.  The MASK selects the
 * predicate bit that begins each element: every bit for bytes, every
 * 2nd for halfwords, every 4th for words, every 8th for doublewords.
 */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

/* Signed compares.  */
DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

/* Unsigned compares (HI = higher, HS = higher-or-same).  */
DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
2406
2407
/*
 * Compare of a narrow vector against the low doubleword elements of a
 * wide vector.  Same backward, word-at-a-time structure as DO_CMP_PPZZ;
 * the extra inner loop reloads one 64-bit MM for each group of 8 bytes
 * of narrow elements.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2431
/* Per-element-size expansions of DO_CMP_PPZW (no D form: the wide
 * operand is already a doubleword).  */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
2483
2484
/*
 * Vector-vs-immediate compare.  The immediate is carried in the simd
 * data field of the descriptor and truncated/sign-extended to TYPE.
 * Structure is otherwise identical to DO_CMP_PPZZ.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2506
/* Per-element-size expansions of DO_CMP_PPZI, same masks as PPZZ.  */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
2571
2572
/*
 * Return true if the last (highest-numbered) active element of VD,
 * as governed by VG, is set; false when no element is active.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off;

    /* Walk 64-bit words from the top of the predicate downward.  */
    for (off = QEMU_ALIGN_UP(oprsz, 8) - 8; off >= 0; off -= 8) {
        uint64_t guard = *(uint64_t *)(vg + off);
        if (guard != 0) {
            /* pow2floor isolates the highest guard bit in this word.  */
            uint64_t last_bit = pow2floor(guard);
            return (*(uint64_t *)(vd + off) & last_bit) != 0;
        }
    }
    return false;
}
2585
2586
2587
2588
2589
/*
 * Compute a BRKA/BRKB break mask for one 64-bit predicate word.
 * N is the source predicate and G the governing predicate; BRK is true
 * if a break was already found in an earlier word; AFTER selects
 * break-after (BRKA) versus break-before (BRKB) semantics.
 * The resulting mask is stored in *RETB; returns the new BRK state.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        /* A break occurred in an earlier word: everything is inactive.  */
        b = 0;
    } else if ((g & n) == 0) {
        /* No break in this word: all governed elements pass through.  */
        b = g;
    } else {
        /* The break lies in this word: locate it.  */
        b = g & n;
        b = b & -b;          /* isolate the lowest set bit (the break) */
        if (after) {
            b = b | (b - 1); /* mask includes the break element */
        } else {
            b = b - 1;       /* mask excludes the break element */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
2615
2616
/* BRKA/BRKB with zeroing predication: inactive elements become zero.  */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b;

        brk = compute_brk(&b, n[i], g[i], brk, after);
        d[i] = b & g[i];
    }
}
2630
2631
2632static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2633 intptr_t oprsz, bool after)
2634{
2635 uint32_t flags = PREDTEST_INIT;
2636 bool brk = false;
2637 intptr_t i;
2638
2639 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640 uint64_t this_b, this_d, this_g = g[i];
2641
2642 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643 d[i] = this_d = this_b & this_g;
2644 flags = iter_predtest_fwd(this_d, this_g, flags);
2645 }
2646 return flags;
2647}
2648
2649
/* BRKA/BRKB with merging predication: inactive elements keep D's value.  */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b, guard = g[i];

        brk = compute_brk(&b, n[i], guard, brk, after);
        d[i] = (b & guard) | (d[i] & ~guard);
    }
}
2663
2664
2665static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666 intptr_t oprsz, bool after)
2667{
2668 uint32_t flags = PREDTEST_INIT;
2669 bool brk = false;
2670 intptr_t i;
2671
2672 for (i = 0; i < oprsz / 8; ++i) {
2673 uint64_t this_b, this_d = d[i], this_g = g[i];
2674
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677 flags = iter_predtest_fwd(this_d, this_g, flags);
2678 }
2679 return flags;
2680}
2681
/*
 * Clear the destination predicate register and return the PTEST flags
 * for an all-false result.  The whole register is cleared rather than
 * only OPRSZ bytes -- presumably quicker than a length-dependent loop;
 * NOTE(review): this relies on bytes beyond OPRSZ being dead storage,
 * confirm against how predicate registers are read elsewhere.
 */
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
2690
2691void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2692 uint32_t pred_desc)
2693{
2694 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695 if (last_active_pred(vn, vg, oprsz)) {
2696 compute_brk_z(vd, vm, vg, oprsz, true);
2697 } else {
2698 do_zero(vd, oprsz);
2699 }
2700}
2701
2702uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2703 uint32_t pred_desc)
2704{
2705 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706 if (last_active_pred(vn, vg, oprsz)) {
2707 return compute_brks_z(vd, vm, vg, oprsz, true);
2708 } else {
2709 return do_zero(vd, oprsz);
2710 }
2711}
2712
2713void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2714 uint32_t pred_desc)
2715{
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 if (last_active_pred(vn, vg, oprsz)) {
2718 compute_brk_z(vd, vm, vg, oprsz, false);
2719 } else {
2720 do_zero(vd, oprsz);
2721 }
2722}
2723
2724uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2725 uint32_t pred_desc)
2726{
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 if (last_active_pred(vn, vg, oprsz)) {
2729 return compute_brks_z(vd, vm, vg, oprsz, false);
2730 } else {
2731 return do_zero(vd, oprsz);
2732 }
2733}
2734
2735void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2736{
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 compute_brk_z(vd, vn, vg, oprsz, true);
2739}
2740
2741uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2742{
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 return compute_brks_z(vd, vn, vg, oprsz, true);
2745}
2746
2747void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2748{
2749 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750 compute_brk_z(vd, vn, vg, oprsz, false);
2751}
2752
2753uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2754{
2755 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756 return compute_brks_z(vd, vn, vg, oprsz, false);
2757}
2758
2759void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2760{
2761 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762 compute_brk_m(vd, vn, vg, oprsz, true);
2763}
2764
2765uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2766{
2767 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768 return compute_brks_m(vd, vn, vg, oprsz, true);
2769}
2770
2771void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2772{
2773 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774 compute_brk_m(vd, vn, vg, oprsz, false);
2775}
2776
2777uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778{
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 return compute_brks_m(vd, vn, vg, oprsz, false);
2781}
2782
2783void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784{
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786
2787 if (!last_active_pred(vn, vg, oprsz)) {
2788 do_zero(vd, oprsz);
2789 }
2790}
2791
2792
/*
 * Compute NZCV flags as if PTEST were executed on D with an all-true
 * governing predicate, restricted to the element-begin bits in ESZ_MASK.
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        /* Trailing partial word: keep only the valid low bytes of the
         * guard mask.
         */
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
2808
2809uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2810{
2811 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2812
2813 if (last_active_pred(vn, vg, oprsz)) {
2814 return predtest_ones(vd, oprsz, -1);
2815 } else {
2816 return do_zero(vd, oprsz);
2817 }
2818}
2819
2820uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2821{
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2825 intptr_t i;
2826
2827 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828 uint64_t t = n[i] & g[i] & mask;
2829 sum += ctpop64(t);
2830 }
2831 return sum;
2832}
2833
/*
 * WHILE: set the first COUNT predicate bits of VD and return the NZCV
 * flags from a PTEST of the result.  NOTE(review): COUNT appears to
 * arrive already scaled to bits and bounded by the predicate length
 * (the loop treats it as a bit count) -- confirm against the caller
 * in translate-sve.c.
 */
uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
{
    uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits; esz_mask keeps only the bit that
     * begins each element.
     */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
2859
2860
2861
2862
2863
2864
2865
2866
2867
/*
 * Recursive pairwise floating-point reduction over the vector.
 * The first pass copies active elements into a scratch array, filling
 * inactive and trailing positions with IDENT (the operation's identity
 * value), so the reduction tree always operates on a power-of-two-ish
 * maxsz worth of elements.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{ \
    if (n == 1) { \
        return *data; \
    } else { \
        uintptr_t half = n / 2; \
        TYPE lo = NAME##_reduce(data, status, half); \
        TYPE hi = NAME##_reduce(data + half, status, half); \
        return TYPE##_##FUNC(lo, hi, status); \
    } \
} \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
{ \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
    for (; i < maxsz; i += sizeof(TYPE)) { \
        *(TYPE *)((void *)data + i) = IDENT; \
    } \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
}
2897
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)

/* The min/max-number identities below are the format's default NaN,
 * spelled as a constant to avoid a function call in the expansion.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)

/* min identity is +inf, max identity is -inf.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))

#undef DO_REDUCE
2920
/*
 * FADDA (fp16): strictly-ordered serial reduction.  Add each active
 * fp16 element of VM into the running sum NN, in element order.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        /* Load 16 predicate bits covering the next 16 vector bytes.  */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) { /* bit 0 governs the current element */
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2940
/*
 * FADDA (fp32): strictly-ordered serial reduction.  Add each active
 * fp32 element of VM into the running sum NN, in element order.
 */
uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        /* Load 16 predicate bits covering the next 16 vector bytes.  */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) { /* bit 0 governs the current element */
                /* NOTE(review): H1_2 on a 4-byte load looks suspicious;
                 * H1_4 would be expected for float32 on big-endian hosts.
                 * Harmless on little-endian (both are the identity) --
                 * verify against upstream before changing.  */
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2960
2961uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2962 void *status, uint32_t desc)
2963{
2964 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2965 uint64_t *m = vm;
2966 uint8_t *pg = vg;
2967
2968 for (i = 0; i < opr_sz; i++) {
2969 if (pg[H1(i)] & 1) {
2970 nn = float64_add(nn, m[i], status);
2971 }
2972 }
2973
2974 return nn;
2975}
2976
2977
2978
2979
/*
 * Predicated two-operand floating-point operation.  Iterates backward;
 * one 64-bit predicate word covers 64 vector bytes, and the bit at
 * position (i & 63) -- the byte index of the element -- governs it.
 * Inactive elements of VD are left unmodified (merging).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
2998
/* Predicated FP arithmetic: add/sub/mul/div and min/max variants.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3030
/* FABD: absolute difference, |a - b|, per precision.  */
static inline float16 abd_h(float16 a, float16 b, float_status *s)
{
    return float16_abs(float16_sub(a, b, s));
}

static inline float32 abd_s(float32 a, float32 b, float_status *s)
{
    return float32_abs(float32_sub(a, b, s));
}

static inline float64 abd_d(float64 a, float64 b, float_status *s)
{
    return float64_abs(float64_sub(a, b, s));
}

DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3049
/* FSCALE (64-bit): the int64 exponent must be clamped to the int range
 * that float64_scalbn accepts; the clamp cannot change the result since
 * any |b| beyond INT_MAX already saturates to inf/zero.  */
static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
{
    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
    return float64_scalbn(a, b_int, s);
}

DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)

/* FMULX reuses the AdvSIMD extended-multiply helpers.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
3065
3066
3067
3068
/*
 * Predicated vector-with-scalar floating-point operation.  Same
 * backward iteration and merging predication as DO_ZPZZ_FP, with the
 * second operand a fixed scalar truncated to TYPE.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    TYPE mm = scalar; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3087
/* Predicated FP-with-immediate: add, subtract, multiply.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3099
/* FSUBR: reversed subtraction, b - a, per precision.  */
static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}

static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}

static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}

DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3134
3135
3136
3137
/*
 * Predicated one-operand floating-point operation (conversions,
 * rounding, sqrt, ...).  Same backward iteration and merging
 * predication as DO_ZPZZ_FP.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3154
3155
3156
3157
3158
/*
 * Widen fp16 to fp32 with input flush-to-zero temporarily disabled,
 * so the half-precision input is converted per IEEE regardless of the
 * current flushing control; the previous setting is restored after.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    flag save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
3169
/* Widen fp16 to fp64; see sve_f16_to_f32 for the flushing rationale.  */
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    flag save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}
3180
/*
 * Narrow fp32 to fp16 with output flush-to-zero temporarily disabled,
 * so the half-precision result is produced per IEEE; the previous
 * setting is restored after.
 */
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    flag save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
3191
/* Narrow fp64 to fp16; see sve_f32_to_f16 for the flushing rationale.  */
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    flag save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
3202
3203static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3204{
3205 if (float16_is_any_nan(f)) {
3206 float_raise(float_flag_invalid, s);
3207 return 0;
3208 }
3209 return float16_to_int16_round_to_zero(f, s);
3210}
3211
3212static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3213{
3214 if (float16_is_any_nan(f)) {
3215 float_raise(float_flag_invalid, s);
3216 return 0;
3217 }
3218 return float16_to_int64_round_to_zero(f, s);
3219}
3220
3221static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3222{
3223 if (float32_is_any_nan(f)) {
3224 float_raise(float_flag_invalid, s);
3225 return 0;
3226 }
3227 return float32_to_int64_round_to_zero(f, s);
3228}
3229
3230static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3231{
3232 if (float64_is_any_nan(f)) {
3233 float_raise(float_flag_invalid, s);
3234 return 0;
3235 }
3236 return float64_to_int64_round_to_zero(f, s);
3237}
3238
3239static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3240{
3241 if (float16_is_any_nan(f)) {
3242 float_raise(float_flag_invalid, s);
3243 return 0;
3244 }
3245 return float16_to_uint16_round_to_zero(f, s);
3246}
3247
3248static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3249{
3250 if (float16_is_any_nan(f)) {
3251 float_raise(float_flag_invalid, s);
3252 return 0;
3253 }
3254 return float16_to_uint64_round_to_zero(f, s);
3255}
3256
3257static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3258{
3259 if (float32_is_any_nan(f)) {
3260 float_raise(float_flag_invalid, s);
3261 return 0;
3262 }
3263 return float32_to_uint64_round_to_zero(f, s);
3264}
3265
3266static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3267{
3268 if (float64_is_any_nan(f)) {
3269 float_raise(float_flag_invalid, s);
3270 return 0;
3271 }
3272 return float64_to_uint64_round_to_zero(f, s);
3273}
3274
/* FCVT: precision conversions (suffix: source then destination size).  */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)

/* FCVTZS: float to signed integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)

/* FCVTZU: float to unsigned integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)

/* FRINTI (current rounding mode) and FRINTX (raise Inexact).  */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)

DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)

/* SCVTF: signed integer to float.  */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)

/* UCVTF: unsigned integer to float.  */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)

#undef DO_ZPZ_FP
3331
3332
3333
3334
/* Four register operands (rd/rn/rm/ra), 5 bits each, must fit in desc.  */
QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);

/*
 * Predicated fused multiply-add for 16-bit elements, walking the vector
 * from the top down, one 64-byte predicate word at a time.
 * NEG1 and NEG3 are XORed into operands 1 (multiplicand) and 3 (addend);
 * passing the sign-bit mask 0x8000 flips the corresponding sign, so the
 * same loop implements FMLA/FMLS/FNMLA/FNMLS (see the wrappers below).
 */
static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];  /* predicate bits: one per vector byte */
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

/* FMLA: d = n * m + a */
void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0, 0);
}

/* FMLS: d = -n * m + a */
void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
}

/* FNMLA: d = -n * m - a */
void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
}

/* FNMLS: d = n * m - a */
void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
}
3387
/*
 * Predicated fused multiply-add for 32-bit elements; see do_fmla_zpzzz_h
 * for the loop structure.  NEG1/NEG3 are sign-bit masks XORed into the
 * multiplicand and addend respectively.
 */
static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];  /* predicate bits: one per vector byte */
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

/* FMLA: d = n * m + a */
void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0, 0);
}

/* FMLS: d = -n * m + a */
void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
}

/* FNMLA: d = -n * m - a */
void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
}

/* FNMLS: d = n * m - a */
void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
}
3438
/*
 * Predicated fused multiply-add for 64-bit elements; see do_fmla_zpzzz_h
 * for the loop structure.  No host-endian swizzle is needed at 64 bits.
 */
static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];  /* predicate bits: one per vector byte */
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

/* FMLA: d = n * m + a */
void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, 0, 0);
}

/* FMLS: d = -n * m + a (INT64_MIN is the f64 sign-bit mask) */
void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
}

/* FNMLA: d = -n * m - a */
void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: d = n * m - a */
void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
}
3489
3490
3491
3492
3493
3494
/*
 * Predicated FP compare of two vectors, producing a predicate result.
 * Walks the vector from the top down; each element's boolean result is
 * shifted into OUT so it lands at the bit corresponding to the element's
 * first byte, and one 64-bit predicate word is stored per 64 vector bytes.
 * Inactive elements produce 0 bits.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
 void *status, uint32_t desc) \
{ \
 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
 uint64_t *d = vd, *g = vg; \
 do { \
 uint64_t out = 0, pg = g[j]; \
 do { \
 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
 if (likely((pg >> (i & 63)) & 1)) { \
 TYPE nn = *(TYPE *)(vn + H(i)); \
 TYPE mm = *(TYPE *)(vm + H(i)); \
 out |= OP(TYPE, nn, mm, status); \
 } \
 } while (i & 63); \
 d[j--] = out; \
 } while (i > 0); \
}

#define DO_FPCMP_PPZZ_H(NAME, OP) \
 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
 DO_FPCMP_PPZZ_H(NAME, OP) \
 DO_FPCMP_PPZZ_S(NAME, OP) \
 DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * Comparison operators.  GE/GT/LE/LT and the absolute-value compares use
 * the signalling compare (raises Invalid for quiet NaNs); EQ/NE/UO use
 * the quiet compare.  GE/GT are written with swapped operands so that
 * X >= Y becomes compare(Y, X) <= 0.
 */
#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST) \
 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST) \
 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST) \
 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
3553
3554
3555
3556
/*
 * Predicated FP compare of a vector against zero, producing a predicate
 * result; same output layout as DO_FPCMP_PPZZ above, with the second
 * comparison operand fixed at 0.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, \
 void *status, uint32_t desc) \
{ \
 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
 uint64_t *d = vd, *g = vg; \
 do { \
 uint64_t out = 0, pg = g[j]; \
 do { \
 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
 if ((pg >> (i & 63)) & 1) { \
 TYPE nn = *(TYPE *)(vn + H(i)); \
 out |= OP(TYPE, nn, 0, status); \
 } \
 } while (i & 63); \
 d[j--] = out; \
 } while (i > 0); \
}

#define DO_FPCMP_PPZ0_H(NAME, OP) \
 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
 DO_FPCMP_PPZ0_H(NAME, OP) \
 DO_FPCMP_PPZ0_S(NAME, OP) \
 DO_FPCMP_PPZ0_D(NAME, OP)

DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3594
3595
3596
3597void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3598{
3599 static const float16 coeff[16] = {
3600 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3601 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3602 };
3603 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3604 intptr_t x = simd_data(desc);
3605 float16 *d = vd, *n = vn, *m = vm;
3606 for (i = 0; i < opr_sz; i++) {
3607 float16 mm = m[i];
3608 intptr_t xx = x;
3609 if (float16_is_neg(mm)) {
3610 mm = float16_abs(mm);
3611 xx += 8;
3612 }
3613 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3614 }
3615}
3616
3617void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3618{
3619 static const float32 coeff[16] = {
3620 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3621 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3622 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3623 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3624 };
3625 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3626 intptr_t x = simd_data(desc);
3627 float32 *d = vd, *n = vn, *m = vm;
3628 for (i = 0; i < opr_sz; i++) {
3629 float32 mm = m[i];
3630 intptr_t xx = x;
3631 if (float32_is_neg(mm)) {
3632 mm = float32_abs(mm);
3633 xx += 8;
3634 }
3635 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3636 }
3637}
3638
3639void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3640{
3641 static const float64 coeff[16] = {
3642 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3643 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3644 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3645 0x3de5d8408868552full, 0x0000000000000000ull,
3646 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3647 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3648 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3649 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3650 };
3651 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3652 intptr_t x = simd_data(desc);
3653 float64 *d = vd, *n = vn, *m = vm;
3654 for (i = 0; i < opr_sz; i++) {
3655 float64 mm = m[i];
3656 intptr_t xx = x;
3657 if (float64_is_neg(mm)) {
3658 mm = float64_abs(mm);
3659 xx += 8;
3660 }
3661 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3662 }
3663}
3664
3665
3666
3667
3668
/*
 * FP complex add with rotate, 16-bit elements.  Element pairs are
 * (real, imag); simd_data(desc) selects the rotation, which determines
 * which of the two additions is negated (implemented by XORing the
 * sign-bit masks neg_real/neg_imag into the m operands).
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            /* Each half of the pair is predicated independently.  */
            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
3700
3701void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3702 void *vs, uint32_t desc)
3703{
3704 intptr_t j, i = simd_oprsz(desc);
3705 uint64_t *g = vg;
3706 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3707 float32 neg_real = float32_chs(neg_imag);
3708
3709 do {
3710 uint64_t pg = g[(i - 1) >> 6];
3711 do {
3712 float32 e0, e1, e2, e3;
3713
3714
3715 j = i - sizeof(float32);
3716 i -= 2 * sizeof(float32);
3717
3718 e0 = *(float32 *)(vn + H1_2(i));
3719 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3720 e2 = *(float32 *)(vn + H1_2(j));
3721 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3722
3723 if (likely((pg >> (i & 63)) & 1)) {
3724 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3725 }
3726 if (likely((pg >> (j & 63)) & 1)) {
3727 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3728 }
3729 } while (i & 63);
3730 } while (i != 0);
3731}
3732
3733void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3734 void *vs, uint32_t desc)
3735{
3736 intptr_t j, i = simd_oprsz(desc);
3737 uint64_t *g = vg;
3738 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3739 float64 neg_real = float64_chs(neg_imag);
3740
3741 do {
3742 uint64_t pg = g[(i - 1) >> 6];
3743 do {
3744 float64 e0, e1, e2, e3;
3745
3746
3747 j = i - sizeof(float64);
3748 i -= 2 * sizeof(float64);
3749
3750 e0 = *(float64 *)(vn + H1_2(i));
3751 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3752 e2 = *(float64 *)(vn + H1_2(j));
3753 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3754
3755 if (likely((pg >> (i & 63)) & 1)) {
3756 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3757 }
3758 if (likely((pg >> (j & 63)) & 1)) {
3759 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3760 }
3761 } while (i & 63);
3762 } while (i != 0);
3763}
3764
3765
3766
3767
3768
/* FCMLA needs rd/rn/rm/ra (5 bits each) plus 2 bits of rotation in desc.  */
QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);

/*
 * FP complex multiply-add, 16-bit elements.  ROT bit 0 (flip) swaps
 * which of the real/imag n and m parts feed each product; neg_real is
 * set for rot 1 and 2, neg_imag for rot 2 and 3.  Each half of the
 * (real, imag) pair is predicated independently.
 */
void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
3822
3823void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3824{
3825 intptr_t j, i = simd_oprsz(desc);
3826 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3827 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3828 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3829 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3830 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3831 bool flip = rot & 1;
3832 float32 neg_imag, neg_real;
3833 void *vd = &env->vfp.zregs[rd];
3834 void *vn = &env->vfp.zregs[rn];
3835 void *vm = &env->vfp.zregs[rm];
3836 void *va = &env->vfp.zregs[ra];
3837 uint64_t *g = vg;
3838
3839 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3840 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3841
3842 do {
3843 uint64_t pg = g[(i - 1) >> 6];
3844 do {
3845 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3846
3847
3848 j = i - sizeof(float32);
3849 i -= 2 * sizeof(float32);
3850
3851 nr = *(float32 *)(vn + H1_2(i));
3852 ni = *(float32 *)(vn + H1_2(j));
3853 mr = *(float32 *)(vm + H1_2(i));
3854 mi = *(float32 *)(vm + H1_2(j));
3855
3856 e2 = (flip ? ni : nr);
3857 e1 = (flip ? mi : mr) ^ neg_real;
3858 e4 = e2;
3859 e3 = (flip ? mr : mi) ^ neg_imag;
3860
3861 if (likely((pg >> (i & 63)) & 1)) {
3862 d = *(float32 *)(va + H1_2(i));
3863 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3864 *(float32 *)(vd + H1_2(i)) = d;
3865 }
3866 if (likely((pg >> (j & 63)) & 1)) {
3867 d = *(float32 *)(va + H1_2(j));
3868 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3869 *(float32 *)(vd + H1_2(j)) = d;
3870 }
3871 } while (i & 63);
3872 } while (i != 0);
3873}
3874
3875void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3876{
3877 intptr_t j, i = simd_oprsz(desc);
3878 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3879 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3880 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3881 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3882 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3883 bool flip = rot & 1;
3884 float64 neg_imag, neg_real;
3885 void *vd = &env->vfp.zregs[rd];
3886 void *vn = &env->vfp.zregs[rn];
3887 void *vm = &env->vfp.zregs[rm];
3888 void *va = &env->vfp.zregs[ra];
3889 uint64_t *g = vg;
3890
3891 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3892 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3893
3894 do {
3895 uint64_t pg = g[(i - 1) >> 6];
3896 do {
3897 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3898
3899
3900 j = i - sizeof(float64);
3901 i -= 2 * sizeof(float64);
3902
3903 nr = *(float64 *)(vn + H1_2(i));
3904 ni = *(float64 *)(vn + H1_2(j));
3905 mr = *(float64 *)(vm + H1_2(i));
3906 mi = *(float64 *)(vm + H1_2(j));
3907
3908 e2 = (flip ? ni : nr);
3909 e1 = (flip ? mi : mr) ^ neg_real;
3910 e4 = e2;
3911 e3 = (flip ? mr : mi) ^ neg_imag;
3912
3913 if (likely((pg >> (i & 63)) & 1)) {
3914 d = *(float64 *)(va + H1_2(i));
3915 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3916 *(float64 *)(vd + H1_2(i)) = d;
3917 }
3918 if (likely((pg >> (j & 63)) & 1)) {
3919 d = *(float64 *)(va + H1_2(j));
3920 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3921 *(float64 *)(vd + H1_2(j)) = d;
3922 }
3923 } while (i & 63);
3924 } while (i != 0);
3925}
3926
3927
3928
3929
/*
 * Contiguous predicated load of one register.  Each memory element
 * (TYPEM) is loaded via FN (a cpu_ld*_data_ra accessor, which may
 * sign/zero-extend) into a possibly wider vector element (TYPEE).
 * Predicate bits are consumed 16 at a time (one bit per vector byte,
 * stride sizeof(TYPEE)); inactive elements are zeroed in the
 * destination.  H is the host-endian byte swizzle for TYPEE.
 */
#define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
 target_ulong addr, intptr_t oprsz, \
 uintptr_t ra) \
{ \
 intptr_t i = 0; \
 do { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 TYPEM m = 0; \
 if (pg & 1) { \
 m = FN(env, addr, ra); \
 } \
 *(TYPEE *)(vd + H(i)) = m; \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += sizeof(TYPEM); \
 } while (i & 15); \
 } while (i < oprsz); \
} \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
 addr, simd_oprsz(desc), GETPC()); \
}
3955
/*
 * Contiguous predicated load of two interleaved registers (LD2):
 * memory holds element pairs; the pair is de-interleaved into
 * consecutive registers rd and rd+1 (mod 32).  A single predicate bit
 * governs the whole pair; inactive elements are zeroed.
 */
#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *d1 = &env->vfp.zregs[rd]; \
 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 TYPEM m1 = 0, m2 = 0; \
 if (pg & 1) { \
 m1 = FN(env, addr, ra); \
 m2 = FN(env, addr + sizeof(TYPEM), ra); \
 } \
 *(TYPEE *)(d1 + H(i)) = m1; \
 *(TYPEE *)(d2 + H(i)) = m2; \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += 2 * sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
3980
/*
 * Contiguous predicated load of three interleaved registers (LD3);
 * see DO_LD2 for the interleave and predication scheme.
 */
#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *d1 = &env->vfp.zregs[rd]; \
 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 TYPEM m1 = 0, m2 = 0, m3 = 0; \
 if (pg & 1) { \
 m1 = FN(env, addr, ra); \
 m2 = FN(env, addr + sizeof(TYPEM), ra); \
 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
 } \
 *(TYPEE *)(d1 + H(i)) = m1; \
 *(TYPEE *)(d2 + H(i)) = m2; \
 *(TYPEE *)(d3 + H(i)) = m3; \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += 3 * sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
4008
/*
 * Contiguous predicated load of four interleaved registers (LD4);
 * see DO_LD2 for the interleave and predication scheme.
 */
#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *d1 = &env->vfp.zregs[rd]; \
 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
 if (pg & 1) { \
 m1 = FN(env, addr, ra); \
 m2 = FN(env, addr + sizeof(TYPEM), ra); \
 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
 } \
 *(TYPEE *)(d1 + H(i)) = m1; \
 *(TYPEE *)(d2 + H(i)) = m2; \
 *(TYPEE *)(d3 + H(i)) = m3; \
 *(TYPEE *)(d4 + H(i)) = m4; \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += 4 * sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
4039
/* Extending loads: the suffix gives memory then register element size,
 * with 'u'/'s' marking zero/sign extension. */
DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )

DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )

DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )

/* Same-size loads, including the multi-register LD2/LD3/LD4 forms. */
DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)

DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)

DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)

DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )

#undef DO_LD1
#undef DO_LD2
#undef DO_LD3
#undef DO_LD4
4079
4080
4081
4082
4083
4084#ifdef CONFIG_USER_ONLY
4085
4086
4087
4088
4089
/*
 * Clear the first-fault register (FFR) from element-bit index I upward,
 * marking elements at and beyond the faulting one as not loaded.
 * The partially-used 64-bit word keeps only its low (i & 63) bits;
 * all subsequent words up to OPRSZ bits are cleared entirely.
 */
static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
/*
 * First-fault contiguous load (user-only).  The fast path: if the whole
 * transfer is readable, use the normal do_sve_ld1 loop.  Otherwise, in
 * the slow path, the FIRST active element is always loaded normally
 * (and may fault architecturally); for subsequent active elements a
 * non-zero return from page_check_range (page not readable) stops the
 * load and records the fault position in FFR via record_fault instead
 * of faulting.  FIRST=false gives no-fault (LDNF1) behavior.
 * mmap_lock protects the page table walk against concurrent mmap.
 */
#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
 target_ulong addr, intptr_t oprsz, \
 bool first, uintptr_t ra) \
{ \
 intptr_t i = 0; \
 do { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 TYPEM m = 0; \
 if (pg & 1) { \
 if (!first && \
 unlikely(page_check_range(addr, sizeof(TYPEM), \
 PAGE_READ))) { \
 record_fault(env, i, oprsz); \
 return; \
 } \
 m = FN(env, addr, ra); \
 first = false; \
 } \
 *(TYPEE *)(vd + H(i)) = m; \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += sizeof(TYPEM); \
 } while (i & 15); \
 } while (i < oprsz); \
} \
void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t oprsz = simd_oprsz(desc); \
 unsigned rd = simd_data(desc); \
 void *vd = &env->vfp.zregs[rd]; \
 mmap_lock(); \
 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
 } else { \
 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
 } \
 mmap_unlock(); \
}
4152
4153
4154
4155
/*
 * No-fault contiguous load (user-only): like first-fault, but the first
 * element is not allowed to fault either (FIRST=false in the slow path),
 * so an unreadable first element also just clears FFR.
 */
#define DO_LDNF1(PART) \
void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t oprsz = simd_oprsz(desc); \
 unsigned rd = simd_data(desc); \
 void *vd = &env->vfp.zregs[rd]; \
 mmap_lock(); \
 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
 } else { \
 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
 } \
 mmap_unlock(); \
}
4171
4172#else
4173
4174
4175
4176
/* System-mode first-fault/no-fault loads are not implemented here;
 * reaching these stubs is a programming error. */
#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 g_assert_not_reached(); \
}

#define DO_LDNF1(PART) \
void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 g_assert_not_reached(); \
}
4190
4191#endif
4192
4193DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4194DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4195DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4196DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4197DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4198DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4199DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4200
4201DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4202DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4203DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4)
4204DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4205DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4206
4207DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4208DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4209DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4210
4211DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4212
4213#undef DO_LDFF1
4214
/* No-fault load expansions, mirroring the first-fault set above. */
DO_LDNF1(bb_r)
DO_LDNF1(bhu_r)
DO_LDNF1(bhs_r)
DO_LDNF1(bsu_r)
DO_LDNF1(bss_r)
DO_LDNF1(bdu_r)
DO_LDNF1(bds_r)

DO_LDNF1(hh_r)
DO_LDNF1(hsu_r)
DO_LDNF1(hss_r)
DO_LDNF1(hdu_r)
DO_LDNF1(hds_r)

DO_LDNF1(ss_r)
DO_LDNF1(sdu_r)
DO_LDNF1(sds_r)

DO_LDNF1(dd_r)

#undef DO_LDNF1
4236
4237
4238
4239
/*
 * Contiguous predicated store of one register, narrowing each vector
 * element (TYPEE) to the memory element size (TYPEM).  Predicate bits
 * are consumed 16 at a time, one per vector byte; inactive elements
 * are skipped but the address still advances.
 */
#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *vd = &env->vfp.zregs[rd]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 if (pg & 1) { \
 TYPEM m = *(TYPEE *)(vd + H(i)); \
 FN(env, addr, m, ra); \
 } \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
4260
/*
 * Contiguous predicated narrowing store from 64-bit elements.  With
 * 8-byte elements the predicate can be read one byte per element
 * (pg[H1(i)], testing bit 0), which simplifies the loop.
 */
#define DO_ST1_D(NAME, FN, TYPEM) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
 uint8_t *pg = vg; \
 for (i = 0; i < oprsz; i += 1) { \
 if (pg[H1(i)] & 1) { \
 FN(env, addr, d[i], ra); \
 } \
 addr += sizeof(TYPEM); \
 } \
}
4277
/*
 * Contiguous predicated store of two interleaved registers (ST2):
 * elements from rd and rd+1 (mod 32) are interleaved into memory pairs;
 * one predicate bit governs the whole pair.
 */
#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *d1 = &env->vfp.zregs[rd]; \
 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 if (pg & 1) { \
 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
 FN(env, addr, m1, ra); \
 FN(env, addr + sizeof(TYPEM), m2, ra); \
 } \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += 2 * sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
4301
/*
 * Contiguous predicated store of three interleaved registers (ST3);
 * see DO_ST2 for the interleave and predication scheme.
 */
#define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
 target_ulong addr, uint32_t desc) \
{ \
 intptr_t i, oprsz = simd_oprsz(desc); \
 intptr_t ra = GETPC(); \
 unsigned rd = simd_data(desc); \
 void *d1 = &env->vfp.zregs[rd]; \
 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
 for (i = 0; i < oprsz; ) { \
 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
 do { \
 if (pg & 1) { \
 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
 FN(env, addr, m1, ra); \
 FN(env, addr + sizeof(TYPEM), m2, ra); \
 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
 } \
 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
 addr += 3 * sizeof(TYPEM); \
 } while (i & 15); \
 } \
}
4328
/*
 * Expand a predicated 4-register interleaved store (ST4).  As DO_ST2,
 * but each active element stores from registers rd .. (rd + 3) % 32 to
 * four consecutive TYPEM slots; addr advances by 4 * sizeof(TYPEM)
 * per element whether active or not.
 */
#define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
void HELPER(NAME)(CPUARMState *env, void *vg, \
                  target_ulong addr, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    intptr_t ra = GETPC(); \
    unsigned rd = simd_data(desc); \
    void *d1 = &env->vfp.zregs[rd]; \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
    void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
                TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
                TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
                TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
                FN(env, addr, m1, ra); \
                FN(env, addr + sizeof(TYPEM), m2, ra); \
                FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
                FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
            } \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
            addr += 4 * sizeof(TYPEM); \
        } while (i & 15); \
    } \
}
4358
/* Truncating contiguous stores: vector element wider than memory size.
   Naming: st1<mem-size><elt-size>, e.g. st1bh = store bytes from
   halfword elements. */
DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)

DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)

DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)

/* Same-size byte stores, including 2/3/4-register interleaved forms. */
DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)

/* Same-size halfword stores. */
DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)

/* Same-size word stores. */
DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)

/* Doubleword ST1; the 2/3/4-register doubleword forms are open-coded
   below because 64-bit elements need no host-endian byte swizzling. */
DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4384
4385void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4386 target_ulong addr, uint32_t desc)
4387{
4388 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4389 intptr_t ra = GETPC();
4390 unsigned rd = simd_data(desc);
4391 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4392 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4393 uint8_t *pg = vg;
4394
4395 for (i = 0; i < oprsz; i += 1) {
4396 if (pg[H1(i)] & 1) {
4397 cpu_stq_data_ra(env, addr, d1[i], ra);
4398 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4399 }
4400 addr += 2 * 8;
4401 }
4402}
4403
4404void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4405 target_ulong addr, uint32_t desc)
4406{
4407 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4408 intptr_t ra = GETPC();
4409 unsigned rd = simd_data(desc);
4410 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4411 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4412 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4413 uint8_t *pg = vg;
4414
4415 for (i = 0; i < oprsz; i += 1) {
4416 if (pg[H1(i)] & 1) {
4417 cpu_stq_data_ra(env, addr, d1[i], ra);
4418 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4419 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4420 }
4421 addr += 3 * 8;
4422 }
4423}
4424
4425void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4426 target_ulong addr, uint32_t desc)
4427{
4428 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4429 intptr_t ra = GETPC();
4430 unsigned rd = simd_data(desc);
4431 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4432 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4433 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4434 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4435 uint8_t *pg = vg;
4436
4437 for (i = 0; i < oprsz; i += 1) {
4438 if (pg[H1(i)] & 1) {
4439 cpu_stq_data_ra(env, addr, d1[i], ra);
4440 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4441 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4442 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4443 }
4444 addr += 4 * 8;
4445 }
4446}
4447
4448
4449
/*
 * Expand a gather-load helper for 32-bit elements: each active element
 * loads a TYPEM value (zero- or sign-extended per TYPEM's signedness)
 * from base + (offset << scale), where the TYPEI offset comes from the
 * index vector vm.  simd_data(desc) carries the left-shift (scale)
 * applied to each offset.  Inactive elements are written as zero.
 * The predicate is consumed 16 bits per 16 bytes of vector, one bit
 * per byte, so each 32-bit element consumes 4 predicate bits.
 */
#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    unsigned scale = simd_data(desc); \
    uintptr_t ra = GETPC(); \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPEM m = 0; \
            if (pg & 1) { \
                target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
                m = FN(env, base + (off << scale), ra); \
            } \
            *(uint32_t *)(vd + H1_4(i)) = m; \
            i += 4, pg >>= 4; \
        } while (i & 15); \
    } \
}
4470
/*
 * Expand a gather-load helper for 64-bit elements.  As DO_LD1_ZPZ_S,
 * but the offset vector holds 64-bit values which are narrowed and
 * extended via the (TYPEI) cast, the predicate is tested one byte per
 * element, and no host-endian swizzle is needed for doublewords.
 * Inactive elements are written as zero.
 */
#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc) / 8; \
    unsigned scale = simd_data(desc); \
    uintptr_t ra = GETPC(); \
    uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
    for (i = 0; i < oprsz; i++) { \
        TYPEM mm = 0; \
        if (pg[H1(i)] & 1) { \
            target_ulong off = (TYPEI)m[i]; \
            mm = FN(env, base + (off << scale), ra); \
        } \
        d[i] = mm; \
    } \
}
4488
/* 32-bit gather loads, 32-bit unsigned (zsu) offsets.
   Naming: ld<mem-size><elt-size><u|s> = unsigned/signed extension. */
DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)

/* 32-bit gather loads, 32-bit signed (zss) offsets. */
DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)

/* 64-bit gather loads, 32-bit unsigned offsets (unpacked in 64-bit
   slots and narrowed by the TYPEI cast). */
DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)

/* 64-bit gather loads, 32-bit signed offsets. */
DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)

/* 64-bit gather loads, full 64-bit (zd) offsets. */
DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4524
4525
4526
#ifdef CONFIG_USER_ONLY

/*
 * First-fault gather load, user-only implementation.  The first active
 * element is loaded directly and may fault normally; every subsequent
 * active element is first probed with page_check_range() and, if its
 * page is not readable, we call record_fault() -- defined earlier in
 * this file; presumably it updates the first-fault register state to
 * mark the surviving elements (NOTE(review): confirm) -- and stop
 * without raising an exception.  Inactive elements are written as
 * zero.  mmap_lock() is held across the loop so the page table cannot
 * change between the probe and the load.
 */
#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    unsigned scale = simd_data(desc); \
    uintptr_t ra = GETPC(); \
    bool first = true; \
    mmap_lock(); \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPEM m = 0; \
            if (pg & 1) { \
                target_ulong off = *(TYPEI *)(vm + H(i)); \
                target_ulong addr = base + (off << scale); \
                if (!first && \
                    page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
                    record_fault(env, i, oprsz); \
                    goto exit; \
                } \
                m = FN(env, addr, ra); \
                first = false; \
            } \
            *(TYPEE *)(vd + H(i)) = m; \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
        } while (i & 15); \
    } \
 exit: \
    mmap_unlock(); \
}

#else

/*
 * No system-emulation implementation of first-fault gathers yet; the
 * translator must not emit calls to these helpers in that
 * configuration (NOTE(review): verify at the translate-sve level).
 */
#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    g_assert_not_reached(); \
}

#endif

/* 32-bit elements use H1_4 for host-endian addressing; 64-bit elements
   need no adjustment, so H is expanded to nothing and H(i) reduces to
   the plain (i). */
#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
    DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
    DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4576
/* First-fault 32-bit gathers, 32-bit unsigned offsets. */
DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)

/* First-fault 32-bit gathers, 32-bit signed offsets. */
DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)

/* First-fault 64-bit gathers, 32-bit unsigned offsets. */
DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)

/* First-fault 64-bit gathers, 32-bit signed offsets. */
DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)

/* First-fault 64-bit gathers, 64-bit offsets. */
DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4612
4613
4614
/*
 * Expand a scatter-store helper for 32-bit elements: each active
 * element's 32-bit value from vd is stored (possibly truncated by FN)
 * to base + (offset << scale), with the TYPEI offset taken from the
 * index vector vm and the shift amount from simd_data(desc).
 * Inactive elements store nothing; the predicate is consumed 4 bits
 * per 32-bit element.
 */
#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    unsigned scale = simd_data(desc); \
    uintptr_t ra = GETPC(); \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (likely(pg & 1)) { \
                target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
                uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
                FN(env, base + (off << scale), d, ra); \
            } \
            i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
        } while (i & 15); \
    } \
}
4634
/*
 * Expand a scatter-store helper for 64-bit elements.  Each active
 * element's 64-bit value from vd is stored (possibly truncated by FN)
 * to base + ((TYPEI)offset << scale), the offset taken from the 64-bit
 * index vector vm and the shift amount from simd_data(desc).  The
 * predicate is tested one byte per element; inactive elements store
 * nothing.
 */
#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
                  target_ulong base, uint32_t desc) \
{ \
    unsigned scale = simd_data(desc); \
    uintptr_t retaddr = GETPC(); \
    intptr_t elt, elements = simd_oprsz(desc) / 8; \
    uint64_t *data = vd; \
    uint64_t *index = vm; \
    uint8_t *pred = vg; \
    for (elt = 0; elt < elements; elt++) { \
        if (likely(pred[H1(elt)] & 1)) { \
            target_ulong ea = \
                base + ((target_ulong)(TYPEI)index[elt] << scale); \
            FN(env, ea, data[elt], retaddr); \
        } \
    } \
}
4650
/* 32-bit scatter stores, 32-bit unsigned offsets. */
DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)

/* 32-bit scatter stores, 32-bit signed offsets. */
DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)

/* 64-bit scatter stores, 32-bit unsigned offsets. */
DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)

/* 64-bit scatter stores, 32-bit signed offsets. */
DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)

/* 64-bit scatter stores, 64-bit offsets. */
DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)
4673