1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/exec-all.h"
23#include "exec/cpu_ldst.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27
28
29
30
31#ifdef HOST_WORDS_BIGENDIAN
32#define H1(x) ((x) ^ 7)
33#define H1_2(x) ((x) ^ 6)
34#define H1_4(x) ((x) ^ 4)
35#define H2(x) ((x) ^ 3)
36#define H4(x) ((x) ^ 1)
37#else
38#define H1(x) (x)
39#define H1_2(x) (x)
40#define H1_4(x) (x)
41#define H2(x) (x)
42#define H4(x) (x)
43#endif
44
45
46
47
48
49
50
51
52
53#define PREDTEST_INIT 1
54
55
56
57
/*
 * Fold one 64-bit chunk of a predicate D, under governing predicate G,
 * into the NZCV accumulator FLAGS (NZCV in bits 31, 1, 0; bit 2 is a
 * private "first active G bit seen" marker).  Chunks are visited from
 * lowest to highest; start with FLAGS = PREDTEST_INIT.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
76
77
78
79
/*
 * As iter_predtest_fwd, but the chunks are visited from highest to
 * lowest, so the roles of the "first" (N) and "last" (C) bits swap.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
98
99
/* PTEST over a single predicate word: D under governing predicate G.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
104
105
/* PTEST over WORDS predicate words, accumulating NZCV forward.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
118
119
120
121
122
123
124
125
126
127
128
129
130static inline uint64_t expand_pred_b(uint8_t byte)
131{
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221}
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237static inline uint64_t expand_pred_h(uint8_t byte)
238{
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250}
251
252
253static inline uint64_t expand_pred_s(uint8_t byte)
254{
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261}
262
263
264static inline uint32_t hswap32(uint32_t h)
265{
266 return rol32(h, 16);
267}
268
269
270static inline uint64_t hswap64(uint64_t h)
271{
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275}
276
277
278static inline uint64_t wswap64(uint64_t h)
279{
280 return rol64(h, 32);
281}
282
/*
 * Expand a bitwise predicate operation: d = FUNC(n, m, g), applied
 * 64 predicate bits at a time across the whole predicate register.
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}
293
294#define DO_AND(N, M, G) (((N) & (M)) & (G))
295#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297#define DO_ORR(N, M, G) (((N) | (M)) & (G))
298#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303LOGICAL_PPPP(sve_and_pppp, DO_AND)
304LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312#undef DO_AND
313#undef DO_BIC
314#undef DO_EOR
315#undef DO_ORR
316#undef DO_ORN
317#undef DO_NOR
318#undef DO_NAND
319#undef DO_SEL
320#undef LOGICAL_PPPP
321
322
323
324
325
326
327
328
329
/*
 * Predicated two-operand element-wise operation:
 * d[e] = OP(n[e], m[e]) for each active element e; inactive elements
 * of d are left unchanged.  TYPE is the element type and H the
 * host-endian index adjustment for that element size.  The predicate
 * is consumed 16 bytes of vector per outer iteration (one uint16_t of
 * predicate bits); bit 0 of PG governs the current element, and PG is
 * shifted by sizeof(TYPE) as I advances.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
346
347
/*
 * As DO_ZPZZ, specialized for 64-bit elements: one predicate byte per
 * element, of which only bit 0 is significant; no endian adjustment
 * is needed for full-word accesses.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}
361
362#define DO_AND(N, M) (N & M)
363#define DO_EOR(N, M) (N ^ M)
364#define DO_ORR(N, M) (N | M)
365#define DO_BIC(N, M) (N & ~M)
366#define DO_ADD(N, M) (N + M)
367#define DO_SUB(N, M) (N - M)
368#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371#define DO_MUL(N, M) (N * M)
372
373
374
375
376
377
378
379
380
381#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
383
384DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
388
389DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
393
394DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
398
399DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
403
404DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
408
409DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
413
414DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
418
419DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
423
424DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
428
429DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
433
434DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
438
439DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
443
444
445
446static inline uint8_t do_mulh_b(int32_t n, int32_t m)
447{
448 return (n * m) >> 8;
449}
450
451static inline uint16_t do_mulh_h(int32_t n, int32_t m)
452{
453 return (n * m) >> 16;
454}
455
456static inline uint32_t do_mulh_s(int64_t n, int64_t m)
457{
458 return (n * m) >> 32;
459}
460
/* Signed 64x64->128 multiply via muls64; return the high 64 bits.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}
467
/* Unsigned 64x64->128 multiply via mulu64; return the high 64 bits.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
474
475DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
479
480DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
484
485DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
489
490DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
492
493DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
495
496
497
498#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
501
502DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
504DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
505
506DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
507DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
509
510DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
511DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
512DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
513
514DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
517
518#undef DO_ZPZZ
519#undef DO_ZPZZ_D
520
521
522
523
524
/*
 * Predicated shift of narrow elements by a wide (64-bit) element:
 * all TYPE elements sharing one 64-bit lane of M are shifted by that
 * lane's value MM.  The inner loop covers 8 bytes of vector (one
 * predicate byte) per wide element.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}
541
542DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
545
546DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
549
550DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
553
554#undef DO_ZPZW
555
556
557
/*
 * Predicated unary operation: d[e] = OP(n[e]) for each active element;
 * inactive elements of d are unchanged.  Predicate handling is as in
 * DO_ZPZZ: 16 vector bytes per uint16_t of predicate bits.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
573
574
/*
 * As DO_ZPZ, specialized for 64-bit elements: one predicate byte per
 * element, bit 0 significant.
 */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}
588
589#define DO_CLS_B(N) (clrsb32(N) - 24)
590#define DO_CLS_H(N) (clrsb32(N) - 16)
591
592DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
596
597#define DO_CLZ_B(N) (clz32(N) - 24)
598#define DO_CLZ_H(N) (clz32(N) - 16)
599
600DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
604
605DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
609
610#define DO_CNOT(N) (N == 0)
611
612DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
616
617#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
618
619DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
622
623#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
624
625DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
628
629#define DO_NOT(N) (~N)
630
631DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
635
636#define DO_SXTB(N) ((int8_t)N)
637#define DO_SXTH(N) ((int16_t)N)
638#define DO_SXTS(N) ((int32_t)N)
639#define DO_UXTB(N) ((uint8_t)N)
640#define DO_UXTH(N) ((uint16_t)N)
641#define DO_UXTS(N) ((uint32_t)N)
642
643DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
649
650DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
656
657#define DO_ABS(N) (N < 0 ? -N : N)
658
659DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
663
664#define DO_NEG(N) (-N)
665
666DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
670
671DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
674
675DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
677
678DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
679
680DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
684
685
686
/*
 * Unpredicated shift of narrow elements by a wide (64-bit) element:
 * as DO_ZPZW but every element is written.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}
700
701DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
704
705DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
708
709DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
712
713#undef DO_ZZW
714
715#undef DO_CLS_B
716#undef DO_CLS_H
717#undef DO_CLZ_B
718#undef DO_CLZ_H
719#undef DO_CNOT
720#undef DO_FABS
721#undef DO_FNEG
722#undef DO_ABS
723#undef DO_NEG
724#undef DO_ZPZ
725#undef DO_ZPZ_D
726
727
728
729
730
731
732
733
734
735
/*
 * Predicated horizontal reduction: fold OP over the active elements,
 * starting from INIT.  TYPEELT is the element type, TYPERED the
 * accumulator type, TYPERET the type the result is truncated to
 * before widening to the uint64_t return.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}
753
/*
 * As DO_VPZ, specialized for 64-bit elements: one predicate byte per
 * element, bit 0 significant.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}
769
770DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
774
775DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
779
780DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
784
785DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
788
789DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
793
794DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
798
799DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
803
804DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
808
809DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
813
814#undef DO_VPZ
815#undef DO_VPZ_D
816
817
/*
 * Unpredicated element-wise operation with an immediate:
 * d[e] = OP(n[e], s) for every element, with the 64-bit immediate
 * truncated to the element type.
 */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    TYPE s = s64, *d = vd, *n = vn; \
    for (i = 0; i < opr_sz; ++i) { \
        d[i] = OP(n[i], s); \
    } \
}
827
828#define DO_SUBR(X, Y) (Y - X)
829
830DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
834
835DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
839
840DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843DO_ZZI(sve_smini_d, int64_t, DO_MIN)
844
845DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
849
850DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
854
855#undef DO_ZZI
856
857#undef DO_AND
858#undef DO_ORR
859#undef DO_EOR
860#undef DO_BIC
861#undef DO_ADD
862#undef DO_SUB
863#undef DO_MAX
864#undef DO_MIN
865#undef DO_ABD
866#undef DO_MUL
867#undef DO_DIV
868#undef DO_ASR
869#undef DO_LSR
870#undef DO_LSL
871#undef DO_SUBR
872
873
874
875
/*
 * Return the bit index of the last active element of the predicate G
 * (WORDS 64-bit words), considering only bits valid for element size
 * 1 << ESZ.  If no element is active, return -(1 << esz), so that the
 * caller's "+ (1 << esz)" yields 0.
 */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
889
/*
 * PFIRST: OR into D the lowest set bit of the first non-zero word of
 * the governing predicate G (bit 2 of FLAGS tracks whether that first
 * word has been seen), and accumulate PTEST flags over the result.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
912
/*
 * PNEXT: find the next active governing-predicate bit after the last
 * active element of D, write D with only that bit set (or all zeros
 * if none), and return the PTEST flags for the result.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
956
957
958
959
960
961
962
963
964
965
/* Zero the active byte elements of D.  */
void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_b(pg[H1(i)]);
    }
}
975
/* Zero the active halfword elements of D.  */
void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_h(pg[H1(i)]);
    }
}
985
/* Zero the active word elements of D.  */
void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_s(pg[H1(i)]);
    }
}
995
/* Zero the active doubleword elements of D (one predicate byte per
   element, bit 0 significant).  */
void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        if (pg[H1(i)] & 1) {
            d[i] = 0;
        }
    }
}
1007
1008
/* Copy the active byte elements of N to D; inactive elements become 0.  */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
    }
}
1018
/* Copy the active halfword elements of N to D; inactive elements become 0.  */
void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
    }
}
1028
/* Copy the active word elements of N to D; inactive elements become 0.  */
void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
    }
}
1038
1039void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1040{
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[1] & -(uint64_t)(pg[H1(i)] & 1);
1046 }
1047}
1048
1049
1050
/*
 * Predicated operation with an immediate taken from simd_data(desc):
 * d[e] = OP(n[e], imm) for each active element.  Predicate handling
 * as in DO_ZPZZ.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
1067
1068
/*
 * As DO_ZPZI, specialized for 64-bit elements: one predicate byte
 * per element, bit 0 significant.
 */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}
1083
#define DO_SHR(N, M) (N >> M)
#define DO_SHL(N, M) (N << M)

/*
 * Arithmetic shift right for divide (ASRD): bias negative values by
 * (2**M - 1) before shifting so that the result rounds toward zero
 * instead of toward negative infinity.
 */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)

DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
1117
1118
1119
/*
 * Predicated three-operand op (multiply-accumulate family):
 * D[i] = OP(A[i], N[i], M[i]) for each active element.  The predicate
 * is consumed 16 bits at a time; bit 0 of each per-element slice
 * gates the operation.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                TYPE aa = *(TYPE *)(va + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
1138
1139
/*
 * As DO_ZPZZZ, but specialized for 64-bit elements: one predicate
 * byte per element, tested via its low bit.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *a = va, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE aa = a[i], nn = n[i], mm = m[i]; \
            d[i] = OP(aa, nn, mm); \
        } \
    } \
}
1154
/* Multiply-add and multiply-subtract kernels for DO_ZPZZZ above. */
#define DO_MLA(A, N, M) (A + N * M)
#define DO_MLS(A, N, M) (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1174
1175void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177{
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1182 }
1183}
1184
1185void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187{
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1192 }
1193}
1194
1195void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1197{
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1202 }
1203}
1204
1205void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1207{
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1212 }
1213}
1214
1215void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216{
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223}
1224
1225void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226{
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1232 }
1233}
1234
1235void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236{
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242 }
1243}
1244
1245void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246{
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252 }
1253}
1254
/*
 * FEXPA, half precision: build an fp16 value from a table-looked-up
 * fraction and an exponent taken directly from the input bits.
 */
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction table indexed by the low 5 input bits; entries appear
       to follow the FEXPA pseudocode (fraction of 2^(i/32)) — see the
       Arm ARM for the authoritative values.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);   /* fraction table index */
        uint16_t exp = extract32(nn, 5, 5);   /* exponent bits */
        d[i] = coeff[idx] | (exp << 10);      /* fp16 exponent at bit 10 */
    }
}
1274
/*
 * FEXPA, single precision: as the half-precision version, with a
 * 64-entry fraction table and 8 exponent bits.
 */
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction table indexed by the low 6 input bits (Arm ARM FEXPA). */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);   /* fraction table index */
        uint32_t exp = extract32(nn, 6, 8);   /* exponent bits */
        d[i] = coeff[idx] | (exp << 23);      /* fp32 exponent at bit 23 */
    }
}
1306
/*
 * FEXPA, double precision: as above, with a 64-entry fraction table
 * and 11 exponent bits placed at the fp64 exponent position.
 */
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* Fraction table indexed by the low 6 input bits (Arm ARM FEXPA). */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);   /* fraction table index */
        uint64_t exp = extract32(nn, 6, 11);  /* exponent bits */
        d[i] = coeff[idx] | (exp << 52);      /* fp64 exponent at bit 52 */
    }
}
1344
/*
 * FTSSEL, half precision: when bit 0 of M is set, substitute 1.0 for
 * the corresponding element of N; then flip the sign bit (fp16 bit 15)
 * when bit 1 of M is set.
 */
void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        /* (mm & 2) << 14 places M bit 1 at the fp16 sign bit. */
        d[i] = nn ^ (mm & 2) << 14;
    }
}
1358
/*
 * FTSSEL, single precision: when bit 0 of M is set, substitute 1.0 for
 * the corresponding element of N; then flip the sign bit (fp32 bit 31)
 * when bit 1 of M is set.
 */
void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        /* (mm & 2) << 30 places M bit 1 at the fp32 sign bit. */
        d[i] = nn ^ (mm & 2) << 30;
    }
}
1372
/*
 * FTSSEL, double precision: when bit 0 of M is set, substitute 1.0 for
 * the corresponding element of N; then flip the sign bit (fp64 bit 63)
 * when bit 1 of M is set.
 */
void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        /* (mm & 2) << 62 places M bit 1 at the fp64 sign bit. */
        d[i] = nn ^ (mm & 2) << 62;
    }
}
1386
1387
1388
1389
1390
1391void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392{
1393 intptr_t i, oprsz = simd_oprsz(desc);
1394
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1401 }
1402 *(int8_t *)(d + i) = r;
1403 }
1404}
1405
1406void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407{
1408 intptr_t i, oprsz = simd_oprsz(desc);
1409
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1416 }
1417 *(int16_t *)(d + i) = r;
1418 }
1419}
1420
1421void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422{
1423 intptr_t i, oprsz = simd_oprsz(desc);
1424
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1431 }
1432 *(int32_t *)(d + i) = r;
1433 }
1434}
1435
/* Saturating add of a signed immediate to each signed 64-bit element. */
void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        /* Overflow iff operands have the same sign and the result's
           sign differs: ((r ^ ai) & ~(ai ^ b)) has its top bit set.  */
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* r wrapped, so its sign is opposite the saturation bound. */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}
1450
1451
1452
1453
1454
1455void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456{
1457 intptr_t i, oprsz = simd_oprsz(desc);
1458
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1465 }
1466 *(uint8_t *)(d + i) = r;
1467 }
1468}
1469
1470void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471{
1472 intptr_t i, oprsz = simd_oprsz(desc);
1473
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1480 }
1481 *(uint16_t *)(d + i) = r;
1482 }
1483}
1484
1485void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486{
1487 intptr_t i, oprsz = simd_oprsz(desc);
1488
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1495 }
1496 *(uint32_t *)(d + i) = r;
1497 }
1498}
1499
/* Saturating add of an unsigned immediate to each unsigned 64-bit element. */
void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t r = *(uint64_t *)(a + i) + b;
        /* Unsigned wrap-around iff the sum is smaller than an addend. */
        if (r < b) {
            r = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = r;
    }
}
1512
1513void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514{
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520 }
1521}
1522
1523
1524
1525
/*
 * CPY (merging), 8-bit elements: active elements of D take the
 * replicated immediate mm; inactive elements keep N.  Operates on
 * 64-bit words via an expanded predicate mask.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
1540
/* CPY (merging), 16-bit elements: as sve_cpy_m_b for halfwords. */
void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
1555
/* CPY (merging), 32-bit elements: as sve_cpy_m_b for words. */
void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}
1570
1571void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1573{
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1577
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581 }
1582}
1583
/*
 * CPY (zeroing), 8-bit elements: active elements take the replicated
 * immediate; inactive elements become zero.
 */
void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}
1595
/* CPY (zeroing), 16-bit elements: as sve_cpy_z_b for halfwords. */
void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}
1607
/* CPY (zeroing), 32-bit elements: as sve_cpy_z_b for words. */
void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}
1619
1620void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621{
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1625
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1628 }
1629}
1630
1631
1632
1633
/*
 * memmove for data laid out with the host-endian-adjusting H* macros.
 * On little-endian hosts the layout is identity, so plain memmove
 * suffices.  On big-endian hosts, when pointers or length are not
 * 8-aligned, copy in the largest unit that divides all three,
 * choosing copy direction to be safe for overlapping regions.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    /* Smallest alignment shared by both pointers and the length. */
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        /* Forward copy when non-overlapping or dest below src. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            /* Backward copy for overlapping dest above src. */
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        /* Odd alignment: byte-by-byte. */
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
1690
/*
 * EXT: D = N[n_ofs .. opr_sz) : M[0 .. n_ofs).  The three branches
 * handle the possible aliasing of D with N and/or M so that source
 * bytes are never clobbered before they are read.
 */
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        /* vd == vm: write M's bytes first, then N's. */
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm: stash the wrap-around bytes in a temporary. */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
1711
/*
 * INSR: shift the vector up by one element and insert VAL as the
 * new element 0.
 */
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
    *(TYPE *)(vd + H(0)) = val; \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR
1726
/*
 * REV, 8-bit elements: reverse the byte order of the whole vector by
 * exchanging byte-swapped 64-bit words from both ends.
 */
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}
1737
/*
 * REV, 16-bit elements: exchange halfword-swapped 64-bit words from
 * both ends of the vector.
 */
void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}
1748
/*
 * REV, 32-bit elements: exchange 64-bit words from both ends, with a
 * 32-bit rotate to swap the two words within each 64-bit chunk.
 */
void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}
1759
1760void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1761{
1762 intptr_t i, j, opr_sz = simd_oprsz(desc);
1763 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764 uint64_t f = *(uint64_t *)(vn + i);
1765 uint64_t b = *(uint64_t *)(vn + j);
1766 *(uint64_t *)(vd + i) = b;
1767 *(uint64_t *)(vd + j) = f;
1768 }
1769}
1770
1771#define DO_TBL(NAME, TYPE, H) \
1772void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1773{ \
1774 intptr_t i, opr_sz = simd_oprsz(desc); \
1775 uintptr_t elem = opr_sz / sizeof(TYPE); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1777 ARMVectorReg tmp; \
1778 if (unlikely(vd == vn)) { \
1779 n = memcpy(&tmp, vn, opr_sz); \
1780 } \
1781 for (i = 0; i < elem; i++) { \
1782 TYPE j = m[H(i)]; \
1783 d[H(i)] = j < elem ? n[H(j)] : 0; \
1784 } \
1785}
1786
1787DO_TBL(sve_tbl_b, uint8_t, H1)
1788DO_TBL(sve_tbl_h, uint16_t, H2)
1789DO_TBL(sve_tbl_s, uint32_t, H4)
1790DO_TBL(sve_tbl_d, uint64_t, )
1791
1792#undef TBL
1793
/*
 * UNPK: widen each narrow element of N (half the output width) into
 * the corresponding wide element of D, sign- or zero-extending per
 * the TYPED/TYPES pair.  The narrow input is copied to a temporary
 * when it overlaps the (larger) output region.
 */
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd; \
    TYPES *n = vn; \
    ARMVectorReg tmp; \
    if (unlikely(vn - vd < opr_sz)) { \
        n = memcpy(&tmp, n, opr_sz / 2); \
    } \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
        d[HD(i)] = n[HS(i)]; \
    } \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

#undef DO_UNPK
1818
1819
1820
1821
1822
/* Mask of the even-numbered groups of 2**n bits, for n = 0..4. */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
1830
1831
1832
1833
1834
1835
/*
 * Expand units of 2**n bits, taken from the low 32 bits of X, into
 * units of 2**(n+1) bits, with the new high half of each pair zero.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    /* At each level, interleave a zero unit above every data unit. */
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
1847
1848
1849
1850
1851
1852
/*
 * Inverse of expand_bits: compress units of 2**(n+1) bits into units
 * of 2**n bits, keeping the low unit of each pair; the result fits in
 * the low 32 bits.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
1864
/*
 * ZIP (predicate): interleave the per-element predicate bits of the
 * selected (low/high) halves of N and M.  esz selects the element
 * size encoded in pred_desc; HIGH selects which half to consume.
 */
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        /* Whole predicate fits one word: do it with bit arithmetic. */
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn + (mm << (1 << esz));
    } else {
        ARMPredicateReg tmp_n, tmp_m;

        /* Output is produced faster than input is consumed:
           copy sources that overlap the destination.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if ((vm - vd) < (uintptr_t)oprsz) {
            vm = memcpy(&tmp_m, vm, oprsz);
        }
        /* Convert the half-selector into a byte offset. */
        if (high) {
            high = oprsz >> 1;
        }

        if ((high & 3) == 0) {
            /* Offset is word-aligned: consume 32 bits at a time. */
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn + (mm << (1 << esz));
            }
        } else {
            /* Unaligned offset: consume 8 bits at a time. */
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn + (mm << (1 << esz));
            }
        }
    }
}
1925
/*
 * UZP (predicate): concatenate the even (odd=0) or odd (odd=1)
 * per-element predicate bits of N followed by M, compressing each
 * 128 input bits to 64 output bits.
 */
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    /* Bit shift selecting even/odd elements of size 2**esz. */
    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        /* Compress N into the low half of D. */
        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l + (h << 32);
        }

        /* For a size that is not a multiple of 16 bytes, M's half does
           not start on a 64-bit boundary; build it in tmp_m (reused as
           scratch) and move it into place.  */
        if (oprsz & 15) {
            d[i] = compress_bits(n[2 * i] >> odd, esz);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l + (h << 32);
            }
            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            /* Aligned: compress M directly into the high half of D. */
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l + (h << 32);
            }
        }
    }
}
1982
/*
 * TRN (predicate): interleave the even (odd=0) or odd (odd=1) units
 * of 2**esz predicate bits of N and M: even units come from N, the
 * units above them from M.
 */
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        /* Select the odd units instead: shift them down from N and
           leave M's in place.  */
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        d[i] = nn + mm;
    }
}
2008
2009
/* Reverse the order of units of 2**n bits within a 64-bit word. */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    /* Byte-reverse first, then fix up within bytes down to level n. */
    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}
2021
/* Reverse the order of units of 2**n bits within a single byte. */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int lvl;

    /* Swap progressively smaller halves: nibbles, pairs, single bits. */
    for (lvl = 2; lvl >= n; lvl--) {
        int sh = 1 << lvl;
        uint8_t m = mask[lvl];
        x = ((x & m) << sh) | ((x >> sh) & m);
    }
    return x;
}
2032
/*
 * REV (predicate): reverse the order of the per-element predicate
 * units (2**esz bits each) across the whole predicate.
 */
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        /* Shift the valid bits to the top so one 64-bit reversal
           lands them at the bottom.  */
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        /* Size is a multiple of 16: exchange 64-bit words end-to-end. */
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        /* Odd sizes: exchange byte-by-byte. */
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
2062
/*
 * PUNPK: widen each predicate bit of the selected (low/high) half of
 * N to two bits, the upper of each pair zero.
 */
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* Output is produced faster than input is consumed:
           copy the source if it overlaps the destination.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        /* Convert the half-selector into a byte offset. */
        if (high) {
            high = oprsz >> 1;
        }

        if ((high & 3) == 0) {
            /* Offset is word-aligned: consume 32 bits at a time. */
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            /* Unaligned offset: consume 8 bits at a time. */
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
2108
2109#define DO_ZIP(NAME, TYPE, H) \
2110void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2111{ \
2112 intptr_t oprsz = simd_oprsz(desc); \
2113 intptr_t i, oprsz_2 = oprsz / 2; \
2114 ARMVectorReg tmp_n, tmp_m; \
2115
2116 \
2117 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2118 vn = memcpy(&tmp_n, vn, oprsz_2); \
2119 } \
2120 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2121 vm = memcpy(&tmp_m, vm, oprsz_2); \
2122 } \
2123 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2124 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2125 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2126 } \
2127}
2128
2129DO_ZIP(sve_zip_b, uint8_t, H1)
2130DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132DO_ZIP(sve_zip_d, uint64_t, )
2133
/*
 * UZP: concatenate the even (odd_ofs == 0) or odd elements of N
 * followed by those of M.  M is copied to a temporary when it
 * overlaps D, since D's low half is written before M is read.
 */
#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t oprsz_2 = oprsz / 2; \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i; \
    ARMVectorReg tmp_m; \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
        vm = memcpy(&tmp_m, vm, oprsz); \
    } \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
    } \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    } \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )
2157
/*
 * TRN: for each pair of elements, take the even (odd_ofs == 0) or odd
 * element of N and M respectively: D[2i] = N[2i+off], D[2i+1] = M[2i+off].
 */
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc); \
    intptr_t odd_ofs = simd_data(desc); \
    intptr_t i; \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
        *(TYPE *)(vd + H(i + 0)) = ae; \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
    } \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
2180
/*
 * COMPACT, 32-bit elements: pack the active elements of N into the
 * low elements of D, zero-filling the remainder.
 */
void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        /* Two 32-bit elements share a predicate byte: bits 0 and 4. */
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}
2197
2198void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2199{
2200 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201 uint64_t *d = vd, *n = vn;
2202 uint8_t *pg = vg;
2203
2204 for (i = j = 0; i < opr_sz; i++) {
2205 if (pg[H1(i)] & 1) {
2206 d[j] = n[i];
2207 j++;
2208 }
2209 }
2210 for (; j < opr_sz; j++) {
2211 d[j] = 0;
2212 }
2213}
2214
2215
2216
2217
2218
/*
 * Return the index of the last active element of VG, or -1 if none,
 * decoding the operation size and element size from pred_desc.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);

    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
}
2226
/*
 * SPLICE: copy the segment of N from the first through the last
 * active element (inclusive) to the front of D, then fill the rest
 * of D from the start of M.
 */
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Scan the predicate backward: the last nonzero word seen is the
       first active word, the first nonzero word is the last.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        /* Convert word indices to bit offsets, then to a byte length
           covering whole elements of size 2**esz bytes.  */
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vd == vm) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
2263
2264void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265 void *vg, uint32_t desc)
2266{
2267 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268 uint64_t *d = vd, *n = vn, *m = vm;
2269 uint8_t *pg = vg;
2270
2271 for (i = 0; i < opr_sz; i += 1) {
2272 uint64_t nn = n[i], mm = m[i];
2273 uint64_t pp = expand_pred_b(pg[H1(i)]);
2274 d[i] = (nn & pp) | (mm & ~pp);
2275 }
2276}
2277
2278void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279 void *vg, uint32_t desc)
2280{
2281 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 uint64_t *d = vd, *n = vn, *m = vm;
2283 uint8_t *pg = vg;
2284
2285 for (i = 0; i < opr_sz; i += 1) {
2286 uint64_t nn = n[i], mm = m[i];
2287 uint64_t pp = expand_pred_h(pg[H1(i)]);
2288 d[i] = (nn & pp) | (mm & ~pp);
2289 }
2290}
2291
2292void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293 void *vg, uint32_t desc)
2294{
2295 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296 uint64_t *d = vd, *n = vn, *m = vm;
2297 uint8_t *pg = vg;
2298
2299 for (i = 0; i < opr_sz; i += 1) {
2300 uint64_t nn = n[i], mm = m[i];
2301 uint64_t pp = expand_pred_s(pg[H1(i)]);
2302 d[i] = (nn & pp) | (mm & ~pp);
2303 }
2304}
2305
2306void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2308{
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311 uint8_t *pg = vg;
2312
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2316 }
2317}
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
/*
 * Two-operand predicated comparison, producing a predicate result in
 * VD and returning the predicate-test (NZCV) flags.  The vector is
 * walked backwards, 64 predicate bits (one uint64_t of the result) at
 * a time: the inner loop accumulates one compare bit per element into
 * OUT, shifted so bit positions match the predicate layout; OUT is
 * then masked by the governing predicate, stored, and folded into the
 * flags with iter_predtest_bwd().  MASK keeps only the one predicate
 * bit per element that is architecturally significant for this
 * element size.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            TYPE mm = *(TYPE *)(vm + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2361
/*
 * Per-element-size expansions.  MASK retains only the architecturally
 * significant predicate bit per element: every bit for bytes, every
 * 2nd for halfwords, every 4th for words, every 8th for doublewords.
 */
#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

/* Signed compares (GT/GE): note the signed element types.  */
DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

/* Unsigned compares (HI/HS).  */
DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ
2406
2407
/*
 * Predicated comparison of narrow elements against the 64-bit "wide"
 * element of the same group.  As DO_CMP_PPZZ above, except the middle
 * loop loads MM once per 8-byte group and the innermost loop compares
 * each narrow element of that group against it.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            TYPEW mm = *(TYPEW *)(vm + i - 8); \
            do { \
                i -= sizeof(TYPE), out <<= sizeof(TYPE); \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                out |= nn OP mm; \
            } while (i & 7); \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2431
/*
 * Expansions for byte/halfword/word elements against a doubleword
 * operand; the MASK values match those of DO_CMP_PPZZ_* above.
 */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)

/* Signed compares.  */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

/* Unsigned compares.  */
DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/* LT/LE/LO/LS only exist in the wide form.  */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
2483
2484
/*
 * Predicated comparison against an immediate.  The immediate arrives
 * in simd_data(desc); assigning it to TYPE gives it the right width
 * and signedness for the comparison.  Otherwise as DO_CMP_PPZZ above.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    uint32_t flags = PREDTEST_INIT; \
    TYPE mm = simd_data(desc); \
    intptr_t i = opr_sz; \
    do { \
        uint64_t out = 0, pg; \
        do { \
            i -= sizeof(TYPE), out <<= sizeof(TYPE); \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            out |= nn OP mm; \
        } while (i & 63); \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
        out &= pg; \
        *(uint64_t *)(vd + (i >> 3)) = out; \
        flags = iter_predtest_bwd(out, pg, flags); \
    } while (i > 0); \
    return flags; \
}
2506
/*
 * Per-element-size expansions; MASK values as for DO_CMP_PPZZ_*.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed compares.  */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned compares.  */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
2571
2572
/*
 * Test whether the last (highest-numbered) active element of the
 * governing predicate VG is also set in VD.  Scan downward from the
 * top 64-bit word; the first non-zero governing word contains the
 * last active bit, isolated with pow2floor().
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i = QEMU_ALIGN_UP(oprsz, 8);

    while (i > 0) {
        uint64_t g;

        i -= 8;
        g = *(uint64_t *)(vg + i);
        if (g != 0) {
            uint64_t d = *(uint64_t *)(vd + i);
            return (d & pow2floor(g)) != 0;
        }
    }
    return false;
}
2585
2586
2587
2588
2589
/*
 * Compute one 64-bit word of a break mask for BRKA/BRKB.
 * N is the source predicate word, G the governing predicate word,
 * and BRK is true if a break was already found in an earlier word.
 * AFTER selects break-after (the triggering element stays active)
 * versus break-before semantics.  The mask is stored in *RETB and
 * the updated break state is returned.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        /* A break was found earlier: nothing here survives.  */
        b = 0;
    } else if ((g & n) == 0) {
        /* No break in this word: everything governed stays active.  */
        b = g;
    } else {
        /* Break somewhere in this word.  */
        b = g & n;            /* guard true, pred true */
        b = b & -b;           /* isolate the lowest such bit */
        if (after) {
            b = b | (b - 1);  /* break-after: keep it and all below */
        } else {
            b = b - 1;        /* break-before: keep only bits below */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
2615
2616
/* BRKA/BRKB with zeroing: elements past the break become inactive.  */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b;

        brk = compute_brk(&b, n[i], g[i], brk, after);
        d[i] = b & g[i];
    }
}
2630
2631
/*
 * As compute_brk_z, but also accumulate the predicate-test flags over
 * the result, as required by BRKAS/BRKBS with zeroing.
 */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}
2648
2649
/*
 * BRKA/BRKB, merging: elements outside the governing predicate keep
 * their previous value in D.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t i, words = DIV_ROUND_UP(oprsz, 8);
    bool brk = false;

    for (i = 0; i < words; ++i) {
        uint64_t b, guard = g[i];

        brk = compute_brk(&b, n[i], guard, brk, after);
        d[i] = (b & guard) | (d[i] & ~guard);
    }
}
2663
2664
2665static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666 intptr_t oprsz, bool after)
2667{
2668 uint32_t flags = PREDTEST_INIT;
2669 bool brk = false;
2670 intptr_t i;
2671
2672 for (i = 0; i < oprsz / 8; ++i) {
2673 uint64_t this_b, this_d = d[i], this_g = g[i];
2674
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677 flags = iter_predtest_fwd(this_d, this_g, flags);
2678 }
2679 return flags;
2680}
2681
/*
 * Zero the predicate register D and return the predicate-test flags
 * for an all-false result (PREDTEST_INIT).  The whole register is
 * cleared, not just OPRSZ bytes, so the bytes beyond the vector
 * length are in a known state; OPRSZ is otherwise unused.
 */
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
2690
2691void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2692 uint32_t pred_desc)
2693{
2694 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695 if (last_active_pred(vn, vg, oprsz)) {
2696 compute_brk_z(vd, vm, vg, oprsz, true);
2697 } else {
2698 do_zero(vd, oprsz);
2699 }
2700}
2701
2702uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2703 uint32_t pred_desc)
2704{
2705 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706 if (last_active_pred(vn, vg, oprsz)) {
2707 return compute_brks_z(vd, vm, vg, oprsz, true);
2708 } else {
2709 return do_zero(vd, oprsz);
2710 }
2711}
2712
2713void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2714 uint32_t pred_desc)
2715{
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 if (last_active_pred(vn, vg, oprsz)) {
2718 compute_brk_z(vd, vm, vg, oprsz, false);
2719 } else {
2720 do_zero(vd, oprsz);
2721 }
2722}
2723
2724uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2725 uint32_t pred_desc)
2726{
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 if (last_active_pred(vn, vg, oprsz)) {
2729 return compute_brks_z(vd, vm, vg, oprsz, false);
2730 } else {
2731 return do_zero(vd, oprsz);
2732 }
2733}
2734
2735void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2736{
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 compute_brk_z(vd, vn, vg, oprsz, true);
2739}
2740
2741uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2742{
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 return compute_brks_z(vd, vn, vg, oprsz, true);
2745}
2746
2747void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2748{
2749 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750 compute_brk_z(vd, vn, vg, oprsz, false);
2751}
2752
2753uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2754{
2755 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756 return compute_brks_z(vd, vn, vg, oprsz, false);
2757}
2758
2759void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2760{
2761 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762 compute_brk_m(vd, vn, vg, oprsz, true);
2763}
2764
2765uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2766{
2767 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768 return compute_brks_m(vd, vn, vg, oprsz, true);
2769}
2770
2771void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2772{
2773 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774 compute_brk_m(vd, vn, vg, oprsz, false);
2775}
2776
2777uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778{
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 return compute_brks_m(vd, vn, vg, oprsz, false);
2781}
2782
2783void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784{
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786
2787 if (!last_active_pred(vn, vg, oprsz)) {
2788 do_zero(vd, oprsz);
2789 }
2790}
2791
2792
/*
 * Compute the predicate-test flags for D, assuming every element
 * selected by ESZ_MASK is governed (an all-true guard).  Handles a
 * trailing partial 64-bit word; note the final iteration reuses the
 * value of 'i' as it falls out of the loop.
 */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        /* Restrict the guard to the valid bytes of the final word.  */
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
2808
2809uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2810{
2811 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2812
2813 if (last_active_pred(vn, vg, oprsz)) {
2814 return predtest_ones(vd, oprsz, -1);
2815 } else {
2816 return do_zero(vd, oprsz);
2817 }
2818}
2819
2820uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2821{
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2825 intptr_t i;
2826
2827 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828 uint64_t t = n[i] & g[i] & mask;
2829 sum += ctpop64(t);
2830 }
2831 return sum;
2832}
2833
/*
 * WHILE*: set the low COUNT elements of the predicate VD active (for
 * the element size in PRED_DESC) and return the predicate-test flags
 * for the result.
 */
uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
{
    uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;


    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }


    /* Scale from predicate element count to bits.  */
    count <<= esz;

    /* Bound to the bits in the predicate.  */
    count = MIN(count, oprsz * 8);


    /* Set all of the requested bits, whole words first.  */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        /* Partial final word.  */
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
2864
2865
2866
2867
2868
2869
2870
2871
2872
/*
 * Floating-point horizontal reduction: gather the active elements
 * (inactive lanes replaced by IDENT, the identity value for FUNC)
 * into a scratch array padded to maxsz with IDENT, then reduce
 * pairwise and recursively for a deterministic tree-shaped
 * evaluation order.
 * NOTE(review): the recursion assumes the padded element count
 * (maxsz / sizeof(TYPE)) is a power of two -- confirm this holds for
 * all SVE maxsz values.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{ \
    if (n == 1) { \
        return *data; \
    } else { \
        uintptr_t half = n / 2; \
        TYPE lo = NAME##_reduce(data, status, half); \
        TYPE hi = NAME##_reduce(data + half, status, half); \
        return TYPE##_##FUNC(lo, hi, status); \
    } \
} \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
{ \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
    for (i = 0; i < oprsz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
    for (; i < maxsz; i += sizeof(TYPE)) { \
        *(TYPE *)((void *)data + i) = IDENT; \
    } \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
}
2902
/* FADDV: identity is +0.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)

/*
 * FMINNMV/FMAXNMV: the identity is a quiet NaN (these constants are
 * the default NaN bit patterns), which min/maxnum ignore in favor of
 * any numeric operand.
 */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)

/* FMINV/FMAXV: identities are +inf and -inf respectively.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))

#undef DO_REDUCE
2925
/*
 * FADDA (fp16): strictly ordered accumulation of the active elements
 * of VM into the scalar NN, lowest element first.  The predicate is
 * consumed 16 bits at a time (covering 16 bytes, i.e. 8 fp16 lanes),
 * shifting one element's worth of bits per iteration.
 */
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2945
/* FADDA (fp32): as sve_fadda_h, for single-precision elements.  */
uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
2965
2966uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2967 void *status, uint32_t desc)
2968{
2969 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2970 uint64_t *m = vm;
2971 uint8_t *pg = vg;
2972
2973 for (i = 0; i < opr_sz; i++) {
2974 if (pg[H1(i)] & 1) {
2975 nn = float64_add(nn, m[i], status);
2976 }
2977 }
2978
2979 return nn;
2980}
2981
2982
2983
2984
/*
 * Fully general two-operand predicated FP expander, with an extra
 * float_status argument.  Walks backwards in 64-predicate-bit chunks,
 * testing one predicate bit per byte lane; inactive elements leave
 * VD unchanged (merging).
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3003
/* FADD/FSUB/FMUL/FDIV/FMIN/FMAX/FMINNUM/FMAXNUM, predicated.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3035
3036static inline float16 abd_h(float16 a, float16 b, float_status *s)
3037{
3038 return float16_abs(float16_sub(a, b, s));
3039}
3040
3041static inline float32 abd_s(float32 a, float32 b, float_status *s)
3042{
3043 return float32_abs(float32_sub(a, b, s));
3044}
3045
3046static inline float64 abd_d(float64 a, float64 b, float_status *s)
3047{
3048 return float64_abs(float64_sub(a, b, s));
3049}
3050
/* FABD, predicated.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3054
3055static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3056{
3057 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3058 return float64_scalbn(a, b_int, s);
3059}
3060
/* FSCALE and FMULX, predicated.  */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)

DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
3070
3071
3072
3073
/*
 * Predicated FP operation between a vector and a scalar operand
 * (passed by value in SCALAR); otherwise as DO_ZPZZ_FP above,
 * merging into VD where the predicate is false.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
                  void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    TYPE mm = scalar; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3092
/* FADD/FSUB/FMUL with immediate operand, predicated.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3104
3105static inline float16 subr_h(float16 a, float16 b, float_status *s)
3106{
3107 return float16_sub(b, a, s);
3108}
3109
3110static inline float32 subr_s(float32 a, float32 b, float_status *s)
3111{
3112 return float32_sub(b, a, s);
3113}
3114
3115static inline float64 subr_d(float64 a, float64 b, float_status *s)
3116{
3117 return float64_sub(b, a, s);
3118}
3119
/* FSUBR and FMAXNM/FMINNM/FMAX/FMIN with immediate, predicated.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3141
3142
/*
 * Fully general unary predicated FP expander, with an extra
 * float_status argument; merging into VD where the predicate is
 * false, as the two-operand forms above.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{ \
    intptr_t i = simd_oprsz(desc); \
    uint64_t *g = vg; \
    do { \
        uint64_t pg = g[(i - 1) >> 6]; \
        do { \
            i -= sizeof(TYPE); \
            if (likely((pg >> (i & 63)) & 1)) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, status); \
            } \
        } while (i & 63); \
    } while (i != 0); \
}
3159
3160
3161
3162
3163
/*
 * Conversions between fp16 and wider formats.  Widening conversions
 * disable flush-inputs-to-zero, and narrowing conversions disable
 * flush-to-zero, around the softfloat call, restoring the previous
 * setting afterwards; the 'true' argument selects IEEE (not AHP)
 * half-precision format.
 * NOTE(review): presumably this implements the architectural rule
 * that FZ/FZ16 do not apply across FCVT between half-precision and
 * wider formats -- confirm against the Arm ARM.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    flag save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    flag save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    flag save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    flag save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
3207
/*
 * Round-to-zero FP-to-integer conversions with explicit NaN handling:
 * for a NaN input, raise Invalid and return 0 (the Arm-required
 * result) rather than whatever the underlying softfloat conversion
 * would produce.
 */
static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int16_round_to_zero(f, s);
}

static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_int64_round_to_zero(f, s);
}

static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint16_round_to_zero(f, s);
}

static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_uint64_round_to_zero(f, s);
}
3279
/* FCVT: precision changes between fp16/fp32/fp64.  */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)

/* FCVTZS: FP to signed integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)

/* FCVTZU: FP to unsigned integer, round toward zero.  */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)

/* FRINT* round-to-integral variants.  */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)

/* FRECPX and FSQRT.  */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)

/* SCVTF: signed integer to FP.  */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)

/* UCVTF: unsigned integer to FP.  */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)

#undef DO_ZPZ_FP
3336
3337
3338
3339
3340QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3341
3342static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3343 uint16_t neg1, uint16_t neg3)
3344{
3345 intptr_t i = simd_oprsz(desc);
3346 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3347 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3348 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3349 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3350 void *vd = &env->vfp.zregs[rd];
3351 void *vn = &env->vfp.zregs[rn];
3352 void *vm = &env->vfp.zregs[rm];
3353 void *va = &env->vfp.zregs[ra];
3354 uint64_t *g = vg;
3355
3356 do {
3357 uint64_t pg = g[(i - 1) >> 6];
3358 do {
3359 i -= 2;
3360 if (likely((pg >> (i & 63)) & 1)) {
3361 float16 e1, e2, e3, r;
3362
3363 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3364 e2 = *(uint16_t *)(vm + H1_2(i));
3365 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3366 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3367 *(uint16_t *)(vd + H1_2(i)) = r;
3368 }
3369 } while (i & 63);
3370 } while (i != 0);
3371}
3372
/* FMLA: d = a + n * m.  */
void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0, 0);
}

/* FMLS: d = a - n * m, via negating n (0x8000 flips the f16 sign bit).  */
void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
}

/* FNMLA: d = -(a + n * m), via negating both n and a.  */
void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
}

/* FNMLS: d = n * m - a, via negating a.  */
void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
}
3392
3393static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3394 uint32_t neg1, uint32_t neg3)
3395{
3396 intptr_t i = simd_oprsz(desc);
3397 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3398 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3399 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3400 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3401 void *vd = &env->vfp.zregs[rd];
3402 void *vn = &env->vfp.zregs[rn];
3403 void *vm = &env->vfp.zregs[rm];
3404 void *va = &env->vfp.zregs[ra];
3405 uint64_t *g = vg;
3406
3407 do {
3408 uint64_t pg = g[(i - 1) >> 6];
3409 do {
3410 i -= 4;
3411 if (likely((pg >> (i & 63)) & 1)) {
3412 float32 e1, e2, e3, r;
3413
3414 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3415 e2 = *(uint32_t *)(vm + H1_4(i));
3416 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3417 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3418 *(uint32_t *)(vd + H1_4(i)) = r;
3419 }
3420 } while (i & 63);
3421 } while (i != 0);
3422}
3423
/* FMLA: d = a + n * m.  */
void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0, 0);
}

/* FMLS: d = a - n * m, via negating n (0x80000000 is the f32 sign bit).  */
void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
}

/* FNMLA: d = -(a + n * m), via negating both n and a.  */
void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
}

/* FNMLS: d = n * m - a, via negating a.  */
void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
}
3443
3444static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3445 uint64_t neg1, uint64_t neg3)
3446{
3447 intptr_t i = simd_oprsz(desc);
3448 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3449 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3450 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3451 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3452 void *vd = &env->vfp.zregs[rd];
3453 void *vn = &env->vfp.zregs[rn];
3454 void *vm = &env->vfp.zregs[rm];
3455 void *va = &env->vfp.zregs[ra];
3456 uint64_t *g = vg;
3457
3458 do {
3459 uint64_t pg = g[(i - 1) >> 6];
3460 do {
3461 i -= 8;
3462 if (likely((pg >> (i & 63)) & 1)) {
3463 float64 e1, e2, e3, r;
3464
3465 e1 = *(uint64_t *)(vn + i) ^ neg1;
3466 e2 = *(uint64_t *)(vm + i);
3467 e3 = *(uint64_t *)(va + i) ^ neg3;
3468 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3469 *(uint64_t *)(vd + i) = r;
3470 }
3471 } while (i & 63);
3472 } while (i != 0);
3473}
3474
/* FMLA: d = a + n * m.  */
void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, 0, 0);
}

/* FMLS: d = a - n * m; INT64_MIN is the f64 sign-bit mask.  */
void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
}

/* FNMLA: d = -(a + n * m), via negating both n and a.  */
void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
}

/* FNMLS: d = n * m - a, via negating a.  */
void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
{
    do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
}
3494
3495
3496
3497
3498
3499
/*
 * Two-operand floating-point comparison producing a predicate result.
 * The loop walks backward, one 64-bit predicate word at a time,
 * accumulating one result bit per element byte into OUT; inactive
 * elements produce 0.  OP yields 0 or 1.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}

/* Per-element-size expansions of the comparison loop.  */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)       \
    DO_FPCMP_PPZZ_S(NAME, OP)       \
    DO_FPCMP_PPZZ_D(NAME, OP)

/*
 * Comparison operators.  Ordered comparisons use the signaling
 * compare; (in)equality and unordered use the quiet compare.
 * GE/GT are written as reversed LE/LT so that an unordered result
 * (compare == float_relation_unordered, a positive value > 0)
 * yields false.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0

DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
3558
3559
3560
3561
/*
 * One-operand floating-point comparison against zero, producing a
 * predicate result.  Same loop structure as DO_FPCMP_PPZZ, with the
 * second operand fixed to (TYPE)0.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg,                         \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if ((pg >> (i & 63)) & 1) {                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                out |= OP(TYPE, nn, 0, status);                         \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}

/* Per-element-size expansions.  */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)       \
    DO_FPCMP_PPZ0_S(NAME, OP)       \
    DO_FPCMP_PPZ0_D(NAME, OP)

DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3599
3600
3601
/*
 * FTMAD: trig multiply-add, half precision, unpredicated.
 * The immediate X selects one of 8 coefficients; a negative
 * multiplicand selects the second bank of 8 (presumably the sin vs
 * cos polynomial banks of the FTMAD algorithm — see the Arm ARM).
 * The multiplicand itself is used as its absolute value.
 */
void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;   /* select the second coefficient bank */
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
3621
3622void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3623{
3624 static const float32 coeff[16] = {
3625 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3626 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3627 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3628 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3629 };
3630 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3631 intptr_t x = simd_data(desc);
3632 float32 *d = vd, *n = vn, *m = vm;
3633 for (i = 0; i < opr_sz; i++) {
3634 float32 mm = m[i];
3635 intptr_t xx = x;
3636 if (float32_is_neg(mm)) {
3637 mm = float32_abs(mm);
3638 xx += 8;
3639 }
3640 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3641 }
3642}
3643
3644void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3645{
3646 static const float64 coeff[16] = {
3647 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3648 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3649 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3650 0x3de5d8408868552full, 0x0000000000000000ull,
3651 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3652 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3653 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3654 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3655 };
3656 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3657 intptr_t x = simd_data(desc);
3658 float64 *d = vd, *n = vn, *m = vm;
3659 for (i = 0; i < opr_sz; i++) {
3660 float64 mm = m[i];
3661 intptr_t xx = x;
3662 if (float64_is_neg(mm)) {
3663 mm = float64_abs(mm);
3664 xx += 8;
3665 }
3666 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3667 }
3668}
3669
3670
3671
3672
3673
/*
 * FP complex add with rotate, half precision, predicated.
 * Elements are (real, imag) pairs; simd_data carries the rotation
 * selector, which determines which operand halves have their sign
 * flipped before the add.  Real and imag lanes are predicated
 * independently.
 */
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
3705
3706void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3707 void *vs, uint32_t desc)
3708{
3709 intptr_t j, i = simd_oprsz(desc);
3710 uint64_t *g = vg;
3711 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3712 float32 neg_real = float32_chs(neg_imag);
3713
3714 do {
3715 uint64_t pg = g[(i - 1) >> 6];
3716 do {
3717 float32 e0, e1, e2, e3;
3718
3719
3720 j = i - sizeof(float32);
3721 i -= 2 * sizeof(float32);
3722
3723 e0 = *(float32 *)(vn + H1_2(i));
3724 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3725 e2 = *(float32 *)(vn + H1_2(j));
3726 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3727
3728 if (likely((pg >> (i & 63)) & 1)) {
3729 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3730 }
3731 if (likely((pg >> (j & 63)) & 1)) {
3732 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3733 }
3734 } while (i & 63);
3735 } while (i != 0);
3736}
3737
3738void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3739 void *vs, uint32_t desc)
3740{
3741 intptr_t j, i = simd_oprsz(desc);
3742 uint64_t *g = vg;
3743 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3744 float64 neg_real = float64_chs(neg_imag);
3745
3746 do {
3747 uint64_t pg = g[(i - 1) >> 6];
3748 do {
3749 float64 e0, e1, e2, e3;
3750
3751
3752 j = i - sizeof(float64);
3753 i -= 2 * sizeof(float64);
3754
3755 e0 = *(float64 *)(vn + H1_2(i));
3756 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3757 e2 = *(float64 *)(vn + H1_2(j));
3758 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3759
3760 if (likely((pg >> (i & 63)) & 1)) {
3761 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3762 }
3763 if (likely((pg >> (j & 63)) & 1)) {
3764 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3765 }
3766 } while (i & 63);
3767 } while (i != 0);
3768}
3769
3770
3771
3772
3773
3774QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3775
/*
 * FP complex multiply-add with rotate, half precision, predicated.
 * Elements are (real, imag) pairs.  The 2-bit rot field selects which
 * operand halves are multiplied (flip) and which products are negated
 * (neg_real/neg_imag), covering the four 90-degree rotations.
 * Register numbers are packed into the simd_data field of DESC.
 */
void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    void *vd = &env->vfp.zregs[rd];
    void *vn = &env->vfp.zregs[rn];
    void *vm = &env->vfp.zregs[rm];
    void *va = &env->vfp.zregs[ra];
    uint64_t *g = vg;

    /* Sign masks XORed into the multiplicands below.  */
    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            /* Real result element; f16 ops use the f16 float status.  */
            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            /* Imaginary result element.  */
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
3827
3828void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3829{
3830 intptr_t j, i = simd_oprsz(desc);
3831 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3832 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3833 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3834 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3835 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3836 bool flip = rot & 1;
3837 float32 neg_imag, neg_real;
3838 void *vd = &env->vfp.zregs[rd];
3839 void *vn = &env->vfp.zregs[rn];
3840 void *vm = &env->vfp.zregs[rm];
3841 void *va = &env->vfp.zregs[ra];
3842 uint64_t *g = vg;
3843
3844 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3845 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3846
3847 do {
3848 uint64_t pg = g[(i - 1) >> 6];
3849 do {
3850 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3851
3852
3853 j = i - sizeof(float32);
3854 i -= 2 * sizeof(float32);
3855
3856 nr = *(float32 *)(vn + H1_2(i));
3857 ni = *(float32 *)(vn + H1_2(j));
3858 mr = *(float32 *)(vm + H1_2(i));
3859 mi = *(float32 *)(vm + H1_2(j));
3860
3861 e2 = (flip ? ni : nr);
3862 e1 = (flip ? mi : mr) ^ neg_real;
3863 e4 = e2;
3864 e3 = (flip ? mr : mi) ^ neg_imag;
3865
3866 if (likely((pg >> (i & 63)) & 1)) {
3867 d = *(float32 *)(va + H1_2(i));
3868 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3869 *(float32 *)(vd + H1_2(i)) = d;
3870 }
3871 if (likely((pg >> (j & 63)) & 1)) {
3872 d = *(float32 *)(va + H1_2(j));
3873 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3874 *(float32 *)(vd + H1_2(j)) = d;
3875 }
3876 } while (i & 63);
3877 } while (i != 0);
3878}
3879
3880void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3881{
3882 intptr_t j, i = simd_oprsz(desc);
3883 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3884 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3885 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3886 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3887 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3888 bool flip = rot & 1;
3889 float64 neg_imag, neg_real;
3890 void *vd = &env->vfp.zregs[rd];
3891 void *vn = &env->vfp.zregs[rn];
3892 void *vm = &env->vfp.zregs[rm];
3893 void *va = &env->vfp.zregs[ra];
3894 uint64_t *g = vg;
3895
3896 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3897 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3898
3899 do {
3900 uint64_t pg = g[(i - 1) >> 6];
3901 do {
3902 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3903
3904
3905 j = i - sizeof(float64);
3906 i -= 2 * sizeof(float64);
3907
3908 nr = *(float64 *)(vn + H1_2(i));
3909 ni = *(float64 *)(vn + H1_2(j));
3910 mr = *(float64 *)(vm + H1_2(i));
3911 mi = *(float64 *)(vm + H1_2(j));
3912
3913 e2 = (flip ? ni : nr);
3914 e1 = (flip ? mi : mr) ^ neg_real;
3915 e4 = e2;
3916 e3 = (flip ? mr : mi) ^ neg_imag;
3917
3918 if (likely((pg >> (i & 63)) & 1)) {
3919 d = *(float64 *)(va + H1_2(i));
3920 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3921 *(float64 *)(vd + H1_2(i)) = d;
3922 }
3923 if (likely((pg >> (j & 63)) & 1)) {
3924 d = *(float64 *)(va + H1_2(j));
3925 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3926 *(float64 *)(vd + H1_2(j)) = d;
3927 }
3928 } while (i & 63);
3929 } while (i != 0);
3930}
3931
3932
3933
3934
/*
 * Predicated contiguous load, one register.  TYPEE is the in-register
 * element type (and determines the predicate stride), TYPEM the
 * in-memory type (and determines the address stride); H is the host
 * byte-swizzle for TYPEE-sized accesses.  The predicate is consumed
 * 16 bits at a time, one bit per element byte; inactive elements are
 * zeroed in the destination.
 */
#define DO_LD1(NAME, FN, TYPEE, TYPEM, H)                  \
static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
                      target_ulong addr, intptr_t oprsz,   \
                      uintptr_t ra)                        \
{                                                          \
    intptr_t i = 0;                                        \
    do {                                                   \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            TYPEM m = 0;                                   \
            if (pg & 1) {                                  \
                m = FN(env, addr, ra);                     \
            }                                              \
            *(TYPEE *)(vd + H(i)) = m;                     \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += sizeof(TYPEM);                         \
        } while (i & 15);                                  \
    } while (i < oprsz);                                   \
} \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg,   \
              addr, simd_oprsz(desc), GETPC());            \
}

/*
 * Predicated contiguous load, two interleaved registers (LD2).
 * Register numbers wrap modulo 32.
 */
#define DO_LD2(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            TYPEM m1 = 0, m2 = 0;                          \
            if (pg & 1) {                                  \
                m1 = FN(env, addr, ra);                    \
                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
            }                                              \
            *(TYPEE *)(d1 + H(i)) = m1;                    \
            *(TYPEE *)(d2 + H(i)) = m2;                    \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 2 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
}

/* Predicated contiguous load, three interleaved registers (LD3).  */
#define DO_LD3(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            TYPEM m1 = 0, m2 = 0, m3 = 0;                  \
            if (pg & 1) {                                  \
                m1 = FN(env, addr, ra);                    \
                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
            }                                              \
            *(TYPEE *)(d1 + H(i)) = m1;                    \
            *(TYPEE *)(d2 + H(i)) = m2;                    \
            *(TYPEE *)(d3 + H(i)) = m3;                    \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 3 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
}

/* Predicated contiguous load, four interleaved registers (LD4).  */
#define DO_LD4(NAME, FN, TYPEE, TYPEM, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vg,              \
                  target_ulong addr, uint32_t desc)        \
{                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                  \
    intptr_t ra = GETPC();                                 \
    unsigned rd = simd_data(desc);                         \
    void *d1 = &env->vfp.zregs[rd];                        \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
    void *d4 = &env->vfp.zregs[(rd + 3) & 31];             \
    for (i = 0; i < oprsz; ) {                             \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0;          \
            if (pg & 1) {                                  \
                m1 = FN(env, addr, ra);                    \
                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
                m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
            }                                              \
            *(TYPEE *)(d1 + H(i)) = m1;                    \
            *(TYPEE *)(d2 + H(i)) = m2;                    \
            *(TYPEE *)(d3 + H(i)) = m3;                    \
            *(TYPEE *)(d4 + H(i)) = m4;                    \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
            addr += 4 * sizeof(TYPEM);                     \
        } while (i & 15);                                  \
    }                                                      \
}
4044
4045DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4046DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4047DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4048DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4049DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4050DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4051
4052DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4053DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4)
4054DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4055DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4056
4057DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4058DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4059
4060DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4061DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4062DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4063DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4064
4065DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4066DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4067DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4068DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4069
4070DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4071DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4072DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4073DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4074
4075DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4076DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4077DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4078DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4079
4080#undef DO_LD1
4081#undef DO_LD2
4082#undef DO_LD3
4083#undef DO_LD4
4084
4085
4086
4087
4088
4089#ifdef CONFIG_USER_ONLY
4090
4091
4092
4093
4094
4095static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4096{
4097 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4098
4099 if (i & 63) {
4100 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4101 i = ROUND_UP(i, 64);
4102 }
4103 for (; i < oprsz; i += 64) {
4104 ffr[i / 64] = 0;
4105 }
4106}
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
/*
 * First-fault load, user-only slow path.  The first active element is
 * always loaded normally (and may trap).  For subsequent active
 * elements, an unreadable page terminates the load early and records
 * the fault position in FFR via record_fault instead of trapping.
 * The fast path (whole range readable) reuses the normal ld1 loop.
 */
#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H)                             \
static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg,    \
                               target_ulong addr, intptr_t oprsz,       \
                               bool first, uintptr_t ra)                \
{                                                                       \
    intptr_t i = 0;                                                     \
    do {                                                                \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPEM m = 0;                                                \
            if (pg & 1) {                                               \
                if (!first &&                                           \
                    unlikely(page_check_range(addr, sizeof(TYPEM),      \
                                              PAGE_READ))) {            \
                    record_fault(env, i, oprsz);                        \
                    return;                                             \
                }                                                       \
                m = FN(env, addr, ra);                                  \
                first = false;                                          \
            }                                                           \
            *(TYPEE *)(vd + H(i)) = m;                                  \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);                   \
            addr += sizeof(TYPEM);                                      \
        } while (i & 15);                                               \
    } while (i < oprsz);                                                \
} \
void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    unsigned rd = simd_data(desc);                                      \
    void *vd = &env->vfp.zregs[rd];                                     \
    mmap_lock();                                                        \
    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) {        \
        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC());            \
    } else {                                                            \
        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC());    \
    }                                                                   \
    mmap_unlock();                                                      \
}

/*
 * No-fault load: identical to first-fault except FIRST == false on
 * entry, so even the first element is permission-checked rather than
 * allowed to trap.
 */
#define DO_LDNF1(PART)                                                  \
void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    unsigned rd = simd_data(desc);                                      \
    void *vd = &env->vfp.zregs[rd];                                     \
    mmap_lock();                                                        \
    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) {        \
        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC());            \
    } else {                                                            \
        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC());   \
    }                                                                   \
    mmap_unlock();                                                      \
}
4176
4177#else
4178
4179
4180
4181
/*
 * System-emulation stubs: first-fault/no-fault loads are only
 * implemented for user-only builds; reaching these helpers in a
 * system build aborts.
 */
#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H)                     \
void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg,        \
                             target_ulong addr, uint32_t desc)  \
{                                                               \
    g_assert_not_reached();                                     \
}

#define DO_LDNF1(PART)                                          \
void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg,        \
                             target_ulong addr, uint32_t desc)  \
{                                                               \
    g_assert_not_reached();                                     \
}
4195
4196#endif
4197
4198DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4199DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4200DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4201DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4202DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4203DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4204DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4205
4206DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4207DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4208DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4)
4209DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4210DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4211
4212DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4213DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4214DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4215
4216DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4217
4218#undef DO_LDFF1
4219
4220DO_LDNF1(bb_r)
4221DO_LDNF1(bhu_r)
4222DO_LDNF1(bhs_r)
4223DO_LDNF1(bsu_r)
4224DO_LDNF1(bss_r)
4225DO_LDNF1(bdu_r)
4226DO_LDNF1(bds_r)
4227
4228DO_LDNF1(hh_r)
4229DO_LDNF1(hsu_r)
4230DO_LDNF1(hss_r)
4231DO_LDNF1(hdu_r)
4232DO_LDNF1(hds_r)
4233
4234DO_LDNF1(ss_r)
4235DO_LDNF1(sdu_r)
4236DO_LDNF1(sds_r)
4237
4238DO_LDNF1(dd_r)
4239
4240#undef DO_LDNF1
4241
4242
4243
4244
4245#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
4246void HELPER(NAME)(CPUARMState *env, void *vg, \
4247 target_ulong addr, uint32_t desc) \
4248{ \
4249 intptr_t i, oprsz = simd_oprsz(desc); \
4250 intptr_t ra = GETPC(); \
4251 unsigned rd = simd_data(desc); \
4252 void *vd = &env->vfp.zregs[rd]; \
4253 for (i = 0; i < oprsz; ) { \
4254 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4255 do { \
4256 if (pg & 1) { \
4257 TYPEM m = *(TYPEE *)(vd + H(i)); \
4258 FN(env, addr, m, ra); \
4259 } \
4260 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4261 addr += sizeof(TYPEM); \
4262 } while (i & 15); \
4263 } \
4264}
4265
4266#define DO_ST1_D(NAME, FN, TYPEM) \
4267void HELPER(NAME)(CPUARMState *env, void *vg, \
4268 target_ulong addr, uint32_t desc) \
4269{ \
4270 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4271 intptr_t ra = GETPC(); \
4272 unsigned rd = simd_data(desc); \
4273 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
4274 uint8_t *pg = vg; \
4275 for (i = 0; i < oprsz; i += 1) { \
4276 if (pg[H1(i)] & 1) { \
4277 FN(env, addr, d[i], ra); \
4278 } \
4279 addr += sizeof(TYPEM); \
4280 } \
4281}
4282
4283#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
4284void HELPER(NAME)(CPUARMState *env, void *vg, \
4285 target_ulong addr, uint32_t desc) \
4286{ \
4287 intptr_t i, oprsz = simd_oprsz(desc); \
4288 intptr_t ra = GETPC(); \
4289 unsigned rd = simd_data(desc); \
4290 void *d1 = &env->vfp.zregs[rd]; \
4291 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4292 for (i = 0; i < oprsz; ) { \
4293 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4294 do { \
4295 if (pg & 1) { \
4296 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4297 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4298 FN(env, addr, m1, ra); \
4299 FN(env, addr + sizeof(TYPEM), m2, ra); \
4300 } \
4301 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4302 addr += 2 * sizeof(TYPEM); \
4303 } while (i & 15); \
4304 } \
4305}
4306
/*
 * Define a helper for a three-register interleaved store (SVE ST3).
 * Identical structure to DO_ST2, but each active element writes three
 * consecutive memory slots from zregs[rd], zregs[rd+1], zregs[rd+2]
 * (register numbers wrapping modulo 32), and addr advances by
 * 3 * sizeof(TYPEM) per element regardless of the predicate.
 */
#define DO_ST3(NAME, FN, TYPEE, TYPEM, H)                       \
void HELPER(NAME)(CPUARMState *env, void *vg,                   \
                  target_ulong addr, uint32_t desc)             \
{                                                               \
    intptr_t i, oprsz = simd_oprsz(desc);                       \
    intptr_t ra = GETPC();                                      \
    unsigned rd = simd_data(desc);                              \
    void *d1 = &env->vfp.zregs[rd];                             \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];                  \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];                  \
    for (i = 0; i < oprsz; ) {                                  \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPEM m1 = *(TYPEE *)(d1 + H(i));               \
                TYPEM m2 = *(TYPEE *)(d2 + H(i));               \
                TYPEM m3 = *(TYPEE *)(d3 + H(i));               \
                FN(env, addr, m1, ra);                          \
                FN(env, addr + sizeof(TYPEM), m2, ra);          \
                FN(env, addr + 2 * sizeof(TYPEM), m3, ra);      \
            }                                                   \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);           \
            addr += 3 * sizeof(TYPEM);                          \
        } while (i & 15);                                       \
    }                                                           \
}
4333
/*
 * Define a helper for a four-register interleaved store (SVE ST4).
 * Identical structure to DO_ST2/DO_ST3, but each active element writes
 * four consecutive memory slots from zregs[rd..rd+3] (wrapping modulo
 * 32), and addr advances by 4 * sizeof(TYPEM) per element regardless of
 * the predicate.
 */
#define DO_ST4(NAME, FN, TYPEE, TYPEM, H)                       \
void HELPER(NAME)(CPUARMState *env, void *vg,                   \
                  target_ulong addr, uint32_t desc)             \
{                                                               \
    intptr_t i, oprsz = simd_oprsz(desc);                       \
    intptr_t ra = GETPC();                                      \
    unsigned rd = simd_data(desc);                              \
    void *d1 = &env->vfp.zregs[rd];                             \
    void *d2 = &env->vfp.zregs[(rd + 1) & 31];                  \
    void *d3 = &env->vfp.zregs[(rd + 2) & 31];                  \
    void *d4 = &env->vfp.zregs[(rd + 3) & 31];                  \
    for (i = 0; i < oprsz; ) {                                  \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPEM m1 = *(TYPEE *)(d1 + H(i));               \
                TYPEM m2 = *(TYPEE *)(d2 + H(i));               \
                TYPEM m3 = *(TYPEE *)(d3 + H(i));               \
                TYPEM m4 = *(TYPEE *)(d4 + H(i));               \
                FN(env, addr, m1, ra);                          \
                FN(env, addr + sizeof(TYPEM), m2, ra);          \
                FN(env, addr + 2 * sizeof(TYPEM), m3, ra);      \
                FN(env, addr + 3 * sizeof(TYPEM), m4, ra);      \
            }                                                   \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);           \
            addr += 4 * sizeof(TYPEM);                          \
        } while (i & 15);                                       \
    }                                                           \
}
4363
/*
 * Truncating contiguous stores: the element type (TYPEE) is wider than
 * the memory type (TYPEM), so only the low bytes of each element are
 * written.  Helper naming: st1<msz><esz>, with b/h/s/d = byte/half/
 * single/double for the memory and element sizes respectively.
 */
DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)

DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)

DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)

/* Same-size stores: byte elements, single and multi-register.  */
DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)

/* Same-size stores: halfword elements.  */
DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)

/* Same-size stores: word elements.  */
DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)

/* Doubleword ST1; ST2/ST3/ST4 for doublewords are open-coded below.  */
DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4389
4390void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4391 target_ulong addr, uint32_t desc)
4392{
4393 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4394 intptr_t ra = GETPC();
4395 unsigned rd = simd_data(desc);
4396 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4397 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4398 uint8_t *pg = vg;
4399
4400 for (i = 0; i < oprsz; i += 1) {
4401 if (pg[H1(i)] & 1) {
4402 cpu_stq_data_ra(env, addr, d1[i], ra);
4403 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4404 }
4405 addr += 2 * 8;
4406 }
4407}
4408
4409void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4410 target_ulong addr, uint32_t desc)
4411{
4412 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4413 intptr_t ra = GETPC();
4414 unsigned rd = simd_data(desc);
4415 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4416 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4417 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4418 uint8_t *pg = vg;
4419
4420 for (i = 0; i < oprsz; i += 1) {
4421 if (pg[H1(i)] & 1) {
4422 cpu_stq_data_ra(env, addr, d1[i], ra);
4423 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4424 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4425 }
4426 addr += 3 * 8;
4427 }
4428}
4429
4430void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4431 target_ulong addr, uint32_t desc)
4432{
4433 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4434 intptr_t ra = GETPC();
4435 unsigned rd = simd_data(desc);
4436 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4437 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4438 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4439 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4440 uint8_t *pg = vg;
4441
4442 for (i = 0; i < oprsz; i += 1) {
4443 if (pg[H1(i)] & 1) {
4444 cpu_stq_data_ra(env, addr, d1[i], ra);
4445 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4446 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4447 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4448 }
4449 addr += 4 * 8;
4450 }
4451}
4452
4453
4454
/*
 * Define a gather load into a vector of 32-bit elements.  For each
 * active element, the offset vector (vm) supplies a TYPEI index which is
 * scaled by "scale" and added to "base".  TYPEM is the memory access
 * type: assigning the loaded value to a TYPEM local and then storing it
 * as uint32_t performs the zero-extension (unsigned TYPEM) or
 * sign-extension (signed TYPEM) into the element.  Inactive elements are
 * written with zero.  The predicate is consumed 16 bits per quadword,
 * 4 bits (one per element byte) per 32-bit element.
 */
#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN)                            \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    unsigned scale = simd_data(desc);                                   \
    uintptr_t ra = GETPC();                                             \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPEM m = 0;                                                \
            if (pg & 1) {                                               \
                target_ulong off = *(TYPEI *)(vm + H1_4(i));            \
                m = FN(env, base + (off << scale), ra);                 \
            }                                                           \
            *(uint32_t *)(vd + H1_4(i)) = m;                            \
            i += 4, pg >>= 4;                                           \
        } while (i & 15);                                               \
    }                                                                   \
}
4475
/*
 * Define a gather load into a vector of 64-bit elements.  One predicate
 * byte (low bit) governs each element.  The (TYPEI) cast of m[i] selects
 * how the 64-bit offset lane is interpreted: uint32_t/int32_t take the
 * low half zero- or sign-extended, uint64_t uses the full lane.  As in
 * DO_LD1_ZPZ_S, the TYPEM intermediate provides the zero- or
 * sign-extension of the loaded value; inactive elements are zeroed.
 */
#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN)                            \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc) / 8;                           \
    unsigned scale = simd_data(desc);                                   \
    uintptr_t ra = GETPC();                                             \
    uint64_t *d = vd, *m = vm; uint8_t *pg = vg;                        \
    for (i = 0; i < oprsz; i++) {                                       \
        TYPEM mm = 0;                                                   \
        if (pg[H1(i)] & 1) {                                            \
            target_ulong off = (TYPEI)m[i];                             \
            mm = FN(env, base + (off << scale), ra);                    \
        }                                                               \
        d[i] = mm;                                                      \
    }                                                                   \
}
4493
/*
 * Gather load helper naming: ld<msz><esz><u|s>_<offsets>, where msz/esz
 * are the memory/element sizes (b/h/s/d), u/s selects zero- vs
 * sign-extension of the loaded value, and the suffix gives the offset
 * form: _zsu = 32-bit unsigned offsets, _zss = 32-bit signed offsets,
 * _zd = 64-bit offsets (see the TYPEI argument in each line).
 */
DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)

DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)

DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)

DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)

DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4529
4530
4531
/*
 * First-fault gather loads.  The first active element is loaded
 * unconditionally, so a fault there is delivered as a normal exception.
 * For every subsequent active element the page is first probed with
 * page_check_range; a nonzero return (page not readable) stops the load
 * early via record_fault() -- which presumably marks the faulting
 * element range in FFR; it is defined elsewhere in this file (TODO
 * confirm).  mmap_lock() keeps the user-mode page flags stable while we
 * alternate between probing and loading.
 */
#ifdef CONFIG_USER_ONLY

#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    unsigned scale = simd_data(desc);                                   \
    uintptr_t ra = GETPC();                                             \
    bool first = true;                                                  \
    mmap_lock();                                                        \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPEM m = 0;                                                \
            if (pg & 1) {                                               \
                target_ulong off = *(TYPEI *)(vm + H(i));               \
                target_ulong addr = base + (off << scale);              \
                if (!first &&                                           \
                    page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
                    record_fault(env, i, oprsz);                        \
                    goto exit;                                          \
                }                                                       \
                m = FN(env, addr, ra);                                  \
                first = false;                                          \
            }                                                           \
            *(TYPEE *)(vd + H(i)) = m;                                  \
            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);                   \
        } while (i & 15);                                               \
    }                                                                   \
 exit:                                                                  \
    mmap_unlock();                                                      \
}

#else

/* System-mode first-fault gathers are not implemented here.  */
#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H)                  \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    g_assert_not_reached();                                             \
}

#endif

/* 32-bit elements use the H1_4 swizzle; 64-bit elements need none.  */
#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
    DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
    DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4581
/*
 * First-fault gather instantiations; same naming scheme as the
 * non-faulting gathers above (ldff<msz><esz><u|s>_<offset form>).
 */
DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)

DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)

DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)

DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)

DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4617
4618
4619
/*
 * Define a scatter store from a vector of 32-bit elements.  For each
 * active element, the TYPEI offset from vm is scaled and added to base,
 * and FN stores the (possibly truncated) element value there.  The
 * predicate is consumed 16 bits per quadword, 4 bits per element.
 */
#define DO_ST1_ZPZ_S(NAME, TYPEI, FN)                                   \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    unsigned scale = simd_data(desc);                                   \
    uintptr_t ra = GETPC();                                             \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (likely(pg & 1)) {                                       \
                target_ulong off = *(TYPEI *)(vm + H1_4(i));            \
                uint32_t d = *(uint32_t *)(vd + H1_4(i));               \
                FN(env, base + (off << scale), d, ra);                  \
            }                                                           \
            i += sizeof(uint32_t), pg >>= sizeof(uint32_t);             \
        } while (i & 15);                                               \
    }                                                                   \
}
4639
/*
 * Define a scatter store from a vector of 64-bit elements.  One
 * predicate byte (low bit) governs each element; the (TYPEI) cast of
 * m[i] selects zero-/sign-extended 32-bit offsets or full 64-bit
 * offsets, as in DO_LD1_ZPZ_D.
 */
#define DO_ST1_ZPZ_D(NAME, TYPEI, FN)                                   \
void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
                  target_ulong base, uint32_t desc)                     \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc) / 8;                           \
    unsigned scale = simd_data(desc);                                   \
    uintptr_t ra = GETPC();                                             \
    uint64_t *d = vd, *m = vm; uint8_t *pg = vg;                        \
    for (i = 0; i < oprsz; i++) {                                       \
        if (likely(pg[H1(i)] & 1)) {                                    \
            target_ulong off = (target_ulong)(TYPEI)m[i] << scale;      \
            FN(env, base + off, d[i], ra);                              \
        }                                                               \
    }                                                                   \
}
4655
/*
 * Scatter store instantiations: st<msz><esz>_<offset form>, with
 * _zsu/_zss/_zd selecting 32-bit unsigned, 32-bit signed, or 64-bit
 * offsets via the TYPEI argument.
 */
DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)

DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)

DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)

DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)

DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)
4678