1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "tcg/tcg-gvec-desc.h"
24#include "exec/helper-proto.h"
25#include "exec/cpu_ldst.h"
26#include "exec/exec-all.h"
27#include "qemu/int128.h"
28#include "fpu/softfloat.h"
29#include "vec_internal.h"
30#include "sve_ldst_internal.h"
31
/*
 * TCG helper entry point that simply forwards its arguments,
 * unchanged, to aarch64_set_svcr().
 */
void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}
36
37void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
38{
39 uint32_t i;
40
41
42
43
44
45
46 if (imm == 0xff) {
47 memset(env->zarray, 0, sizeof(env->zarray));
48 return;
49 }
50
51
52
53
54
55 for (i = 0; i < svl; i++) {
56 if (imm & (1 << (i % 8))) {
57 memset(&env->zarray[i], 0, svl);
58 }
59 }
60}
61
62
63
64
65
66
67
68
69
70
71
72
73
/*
 * Array index, for use with a typed pointer into the ZA storage, of
 * element (i) of a tile vertical slice: successive slice elements are
 * sizeof(ARMVectorReg) array elements apart, whatever the element type.
 * (Background, not shown in this file: this relies on the interleaved
 * layout of SME tiles within the ZA storage.)
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/*
 * Byte offset, for use with void-pointer arithmetic on the ZA storage,
 * of the data that lies (byteoff) bytes into a tile vertical slice.
 * Callers in this file always pass a multiple of the element size.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
93
94
95
96
97
/*
 * DO_MOVA_C(NAME, TYPE, H) defines HELPER(NAME), which moves elements of
 * type TYPE from the vector @vn into a vertical slice of the ZA storage @za.
 *
 * The loop walks simd_oprsz(desc) bytes of @vn.  For each group of 16
 * bytes it loads a 16-bit chunk of the predicate @vg (one predicate bit
 * per vector byte) and tests only the lowest bit of each element.
 * An active element at byte offset i is stored to
 * za + tile_vslice_offset(i); inactive elements leave ZA untouched.
 * H adjusts the byte offset used to read @vn (presumably the usual
 * host-endianness fixup from vec_internal.h -- confirm).
 */
#define DO_MOVA_C(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Byte, halfword and word variants; the 64- and 128-bit ones follow. */
DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
117
118void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
119{
120 int i, oprsz = simd_oprsz(desc) / 8;
121 uint8_t *pg = vg;
122 uint64_t *n = vn;
123 uint64_t *a = za;
124
125 for (i = 0; i < oprsz; i++) {
126 if (pg[H1(i)] & 1) {
127 a[tile_vslice_index(i)] = n[i];
128 }
129 }
130}
131
132void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
133{
134 int i, oprsz = simd_oprsz(desc) / 16;
135 uint16_t *pg = vg;
136 Int128 *n = vn;
137 Int128 *a = za;
138
139
140
141
142
143 for (i = 0; i < oprsz; i++) {
144 if (pg[H2(i)] & 1) {
145 a[tile_vslice_index(i)] = n[i];
146 }
147 }
148}
149
150#undef DO_MOVA_C
151
152
153
154
/*
 * DO_MOVA_Z(NAME, TYPE, H) defines HELPER(NAME), which moves elements of
 * type TYPE from a vertical slice of the ZA storage @za into vector @vd.
 *
 * This mirrors the vector-to-ZA helpers: simd_oprsz(desc) bytes are
 * walked, a 16-bit predicate chunk of @vg is loaded per 16 bytes, and
 * only the lowest predicate bit of each element is tested.  An active
 * element is read from za + tile_vslice_offset(i) and written at
 * vd + H(i); inactive elements of @vd are left unchanged.
 */
#define DO_MOVA_Z(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Byte, halfword and word variants; the 64- and 128-bit ones follow. */
DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
174
175void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
176{
177 int i, oprsz = simd_oprsz(desc) / 8;
178 uint8_t *pg = vg;
179 uint64_t *d = vd;
180 uint64_t *a = za;
181
182 for (i = 0; i < oprsz; i++) {
183 if (pg[H1(i)] & 1) {
184 d[i] = a[tile_vslice_index(i)];
185 }
186 }
187}
188
189void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
190{
191 int i, oprsz = simd_oprsz(desc) / 16;
192 uint16_t *pg = vg;
193 Int128 *d = vd;
194 Int128 *a = za;
195
196
197
198
199
200 for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
201 if (pg[H2(i)] & 1) {
202 d[i] = a[tile_vslice_index(i)];
203 }
204 }
205}
206
207#undef DO_MOVA_Z
208
209
210
211
212
/* Signature shared by the routines that zero @len bytes' worth of a slice. */
typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slices are contiguous: zero @len bytes starting at @ptr + @off. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    unsigned char *first = (unsigned char *)ptr + off;

    memset(first, 0, len);
}
219
/*
 * Zero @len byte elements of a vertical slice, beginning @off bytes
 * into the slice.
 */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    const size_t end = off + len;
    size_t pos;

    for (pos = off; pos < end; pos++) {
        uint8_t *elt = vptr + tile_vslice_offset(pos);

        *elt = 0;
    }
}
226
/*
 * Zero the 16-bit elements covering @len bytes of a vertical slice,
 * beginning @off bytes into the slice.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    const size_t end = off + len;
    size_t pos;

    for (pos = off; pos < end; pos += 2) {
        uint16_t *elt = vptr + tile_vslice_offset(pos);

        *elt = 0;
    }
}
233
/*
 * Zero the 32-bit elements covering @len bytes of a vertical slice,
 * beginning @off bytes into the slice.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    const size_t end = off + len;
    size_t pos;

    for (pos = off; pos < end; pos += 4) {
        uint32_t *elt = vptr + tile_vslice_offset(pos);

        *elt = 0;
    }
}
240
/*
 * Zero the 64-bit elements covering @len bytes of a vertical slice,
 * beginning @off bytes into the slice.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    const size_t end = off + len;
    size_t pos;

    for (pos = off; pos < end; pos += 8) {
        uint64_t *elt = vptr + tile_vslice_offset(pos);

        *elt = 0;
    }
}
247
/*
 * Zero the 128-bit elements covering @len bytes of a vertical slice,
 * beginning @off bytes into the slice.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    const size_t end = off + len;
    size_t pos;

    for (pos = off; pos < end; pos += 16) {
        void *elt = vptr + tile_vslice_offset(pos);

        memset(elt, 0, 16);
    }
}
254
255
256
257
258
/*
 * Signature shared by the routines that copy @len bytes of contiguous
 * source data into a ZA slice.
 */
typedef void CopyFn(void *dst, const void *src, size_t len);

/* Horizontal slices are contiguous, so a plain memcpy() suffices. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}
265
/*
 * Scatter @len contiguous bytes from @vsrc into a vertical slice
 * starting at @vdst, one byte element per slice position.
 */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *in = vsrc;
    uint8_t *out = vdst;

    for (size_t n = 0; n < len; n++) {
        out[tile_vslice_index(n)] = in[n];
    }
}
276
/*
 * Scatter @len contiguous bytes from @vsrc, taken as 16-bit elements,
 * into a vertical slice starting at @vdst.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *in = vsrc;
    uint16_t *out = vdst;
    const size_t count = len / 2;

    for (size_t n = 0; n < count; n++) {
        out[tile_vslice_index(n)] = in[n];
    }
}
287
/*
 * Scatter @len contiguous bytes from @vsrc, taken as 32-bit elements,
 * into a vertical slice starting at @vdst.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *in = vsrc;
    uint32_t *out = vdst;
    const size_t count = len / 4;

    for (size_t n = 0; n < count; n++) {
        out[tile_vslice_index(n)] = in[n];
    }
}
298
/*
 * Scatter @len contiguous bytes from @vsrc, taken as 64-bit elements,
 * into a vertical slice starting at @vdst.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *in = vsrc;
    uint64_t *out = vdst;
    const size_t count = len / 8;

    for (size_t n = 0; n < count; n++) {
        out[tile_vslice_index(n)] = in[n];
    }
}
309
/*
 * Scatter @len contiguous bytes from @vsrc, in 16-byte elements,
 * into a vertical slice starting at @vdst.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t pos = 0;

    while (pos < len) {
        void *to = vdst + tile_vslice_offset(pos);
        const void *from = vsrc + pos;

        memcpy(to, from, 16);
        pos += 16;
    }
}
316
317
318
319
320
/*
 * DO_LD(NAME, TYPE, HOST, TLB) defines the per-element load primitives
 * for vertical slices:
 *   sme_<NAME>_v_host: read one TYPE from host memory with HOST(host);
 *   sme_<NAME>_v_tlb:  read one TYPE from guest address @addr with
 *                      TLB(env, useronly_clean_ptr(addr), ra).
 * In both cases @off is the byte offset within the slice and the value
 * is stored at za + tile_vslice_offset(off).
 */
#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}
333
/*
 * DO_ST(NAME, TYPE, HOST, TLB) defines the per-element store primitives
 * for vertical slices:
 *   sme_<NAME>_v_host: write one TYPE to host memory with HOST(host, val);
 *   sme_<NAME>_v_tlb:  write one TYPE to guest address @addr with
 *                      TLB(env, useronly_clean_ptr(addr), val, ra).
 * In both cases @off is the byte offset within the slice and the value
 * is fetched from za + tile_vslice_offset(off).
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}
346
347
348
349
350
351
/*
 * DO_LDQ(HNAME, VNAME, BE, HOST, TLB) defines the 128-bit load primitives.
 * Each element is read as two 64-bit halves, from @host / @host + 8 or
 * from @addr / @addr + 8, and stored as ptr[0], ptr[1]; when BE is set
 * the two halves are swapped so that the first one read lands in ptr[1].
 *
 *   HNAME_host / HNAME_tlb:     horizontal form, data written at za + off.
 *   VNAME_v_host / VNAME_v_tlb: vertical form, implemented by calling the
 *                               horizontal one with tile_vslice_offset(off).
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
376
/*
 * DO_STQ(HNAME, VNAME, BE, HOST, TLB) defines the 128-bit store primitives.
 * Each element is written as two 64-bit halves: ptr[BE] goes to the lower
 * address and ptr[!BE] to the address 8 bytes above it.
 *
 *   HNAME_host / HNAME_tlb:     horizontal form, data taken from za + off.
 *   VNAME_v_host / VNAME_v_tlb: vertical form, implemented by calling the
 *                               horizontal one with tile_vslice_offset(off).
 *
 * The host variant must place the second half at host + 8, matching both
 * the TLB variant (addr + 8) and the 128-bit load primitive; an offset of
 * 1 would overlap the first store and leave bytes 9..15 unwritten.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
400
/* Vertical-slice load primitives for 8/16/32/64-bit elements. */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/* 128-bit loads: horizontal sve_ld1qq_* and vertical sme_ld1q_*_v. */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/* Vertical-slice store primitives for 8/16/32/64-bit elements. */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/* 128-bit stores: horizontal sve_st1qq_* and vertical sme_st1q_*_v. */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
422
423#undef DO_LD
424#undef DO_ST
425#undef DO_LDQ
426#undef DO_STQ
427
428
429
430
431
/*
 * Common implementation of a predicated contiguous load into one ZA slice.
 *
 * @za:       base of the destination slice.
 * @vg:       governing predicate; bit (reg_off) is tested for the element
 *            at byte offset reg_off.
 * @addr:     guest base address; element reg_off is read from addr + reg_off.
 * @desc:     simd descriptor; simd_oprsz(desc) is the slice size in bytes.
 * @ra:       host return address passed to the memory/fault helpers.
 * @esz:      log2 of the element size in bytes.
 * @mtedesc:  if non-zero, MTE checks are performed with this descriptor.
 * @vertical: true when @za addresses a vertical slice, in which case
 *            inactive elements are cleared one by one via @clr_fn.
 * @host_fn:  loads one element directly from host memory.
 * @tlb_fn:   loads one element through the guest memory (TLB) path.
 * @clr_fn:   zeroes a byte range of the slice.
 * @cpy_fn:   copies a contiguous buffer into the slice.
 *
 * Inactive elements of the destination end up zero.
 */
static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The helper reported nothing to load: just zero the whole slice. */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s) touched by the access (FAULT_ALL mode). */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle read watchpoints for the elements described by info. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Perform MTE checks only when the caller supplied a non-zero
     * descriptor; the *_mte wrapper zeroes it to suppress checking.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page has special flags, so every element must go
         * through tlb_fn (presumably MMIO or similar -- confirm against
         * sve_cont_ldst_pages).  Load into a zeroed scratch buffer first
         * so that the destination is only updated once all element loads
         * have completed, then copy the result into the slice.
         *
         * NOTE(review): the vertical helpers pass a tlb_fn that scales the
         * offset with tile_vslice_offset(); with &scratch as the base that
         * would index beyond a single ARMVectorReg -- confirm.
         */
        ARMVectorReg scratch = { };

        /* Last active offset: page 1, else the split element, else page 0. */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            /* One 64-bit predicate word covers 64 bytes of the slice. */
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* Fast path: no special page flags, so use direct host accesses. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /*
     * Horizontal slices are contiguous, so zero everything up front.
     * For vertical slices clear only the leading inactive part here;
     * later inactive elements are cleared individually in the loops.
     */
    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    /* Elements that lie wholly within the first page. */
    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element recorded in reg_off_split (presumably one straddling
     * the page boundary) is loaded through the slow tlb_fn path.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    /* Elements that lie wholly within the second page, if any. */
    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
559
560static inline QEMU_ALWAYS_INLINE
561void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
562 target_ulong addr, uint32_t desc, uintptr_t ra,
563 const int esz, bool vertical,
564 sve_ldst1_host_fn *host_fn,
565 sve_ldst1_tlb_fn *tlb_fn,
566 ClearFn *clr_fn,
567 CopyFn *cpy_fn)
568{
569 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
570 int bit55 = extract64(addr, 55, 1);
571
572
573 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
574
575
576 if (!tbi_check(desc, bit55) ||
577 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
578 mtedesc = 0;
579 }
580
581 sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
582 host_fn, tlb_fn, clr_fn, cpy_fn);
583}
584
/*
 * DO_LD(L, END, ESZ) defines the four TCG load helpers for one element
 * size letter L, endianness suffix END and element size ESZ:
 *   sme_ld1<L><END>_h      horizontal slice, no MTE
 *   sme_ld1<L><END>_v      vertical slice, no MTE
 *   sme_ld1<L><END>_h_mte  horizontal slice, MTE descriptor in desc
 *   sme_ld1<L><END>_v_mte  vertical slice, MTE descriptor in desc
 * Horizontal helpers use the sve_ld1<L><L><END>_{host,tlb} primitives with
 * clear_horizontal/copy_horizontal; vertical helpers use the
 * sme_ld1<L><END>_v_{host,tlb} primitives with the matching
 * clear_vertical_<L>/copy_vertical_<L>.
 */
#define DO_LD(L, END, ESZ)                                                 \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
            clear_horizontal, copy_horizontal);                            \
}                                                                          \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
            clear_vertical_##L, copy_vertical_##L);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
                clear_horizontal, copy_horizontal);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
                clear_vertical_##L, copy_vertical_##L);                    \
}

/* Byte loads have no endianness suffix; wider ones come in BE/LE pairs. */
DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)
624
625#undef DO_LD
626
627
628
629
630
/*
 * Common implementation of a predicated contiguous store from one ZA slice.
 *
 * @za:       base of the source slice.
 * @vg:       governing predicate; bit (reg_off) is tested for the element
 *            at byte offset reg_off.
 * @addr:     guest base address; element reg_off is written to
 *            addr + reg_off.
 * @desc:     simd descriptor; simd_oprsz(desc) is the slice size in bytes.
 * @ra:       host return address passed to the memory/fault helpers.
 * @esz:      log2 of the element size in bytes.
 * @mtedesc:  if non-zero, MTE checks are performed with this descriptor.
 * @vertical: not consulted in this function; the slice orientation is
 *            encoded in the @host_fn/@tlb_fn chosen by the caller.
 * @host_fn:  stores one element directly to host memory.
 * @tlb_fn:   stores one element through the guest memory (TLB) path.
 */
static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The helper reported nothing to store. */
        return;
    }

    /* Probe the page(s) touched by the access (FAULT_ALL mode). */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle write watchpoints for the elements described by info. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Perform MTE checks only when the caller supplied a non-zero
     * descriptor; the *_mte wrapper zeroes it to suppress checking.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page has special flags, so every element must go
         * through tlb_fn (presumably MMIO or similar -- confirm against
         * sve_cont_ldst_pages).
         */
        /* Last active offset: page 1, else the split element, else page 0. */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            /* One 64-bit predicate word covers 64 bytes of the slice. */
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: no special page flags, so use direct host accesses. */
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /* Elements that lie wholly within the first page. */
    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            reg_off += 1 << esz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * An element recorded in reg_off_split (presumably one straddling
     * the page boundary) is stored through the slow tlb_fn path.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    /* Elements that lie wholly within the second page, if any. */
    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += 1 << esz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
739
740static inline QEMU_ALWAYS_INLINE
741void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
742 uint32_t desc, uintptr_t ra, int esz, bool vertical,
743 sve_ldst1_host_fn *host_fn,
744 sve_ldst1_tlb_fn *tlb_fn)
745{
746 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
747 int bit55 = extract64(addr, 55, 1);
748
749
750 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
751
752
753 if (!tbi_check(desc, bit55) ||
754 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
755 mtedesc = 0;
756 }
757
758 sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
759 vertical, host_fn, tlb_fn);
760}
761
/*
 * DO_ST(L, END, ESZ) defines the four TCG store helpers for one element
 * size letter L, endianness suffix END and element size ESZ:
 *   sme_st1<L><END>_h      horizontal slice, no MTE
 *   sme_st1<L><END>_v      vertical slice, no MTE
 *   sme_st1<L><END>_h_mte  horizontal slice, MTE descriptor in desc
 *   sme_st1<L><END>_v_mte  vertical slice, MTE descriptor in desc
 * Horizontal helpers use the sve_st1<L><L><END>_{host,tlb} primitives;
 * vertical helpers use the sme_st1<L><END>_v_{host,tlb} primitives.
 */
#define DO_ST(L, END, ESZ)                                                 \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
}                                                                          \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
}                                                                          \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
}                                                                          \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
}

/* Byte stores have no endianness suffix; wider ones come in BE/LE pairs. */
DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)
797
798#undef DO_ST
799
/*
 * 32-bit ADDHA: for every active (row, col) pair, add zn[col] to the
 * tile element at (row, col); i.e. the vector @vzn is added to each
 * active row of the tile @vzda.
 *
 * @vpn governs rows and @vpm governs columns.  The predicates are read
 * as 64-bit words: each 32-bit element owns 4 predicate bits (only the
 * lowest is tested), so one word covers 16 elements.
 */
void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        /* Predicate word for the next 16 rows. */
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                for (col = 0; col < oprsz; ) {
                    /* Predicate word for the next 16 columns. */
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}
825
826void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
827 void *vpm, uint32_t desc)
828{
829 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
830 uint8_t *pn = vpn, *pm = vpm;
831 uint64_t *zda = vzda, *zn = vzn;
832
833 for (row = 0; row < oprsz; ++row) {
834 if (pn[H1(row)] & 1) {
835 for (col = 0; col < oprsz; ++col) {
836 if (pm[H1(col)] & 1) {
837 zda[tile_vslice_index(row) + col] += zn[col];
838 }
839 }
840 }
841 }
842}
843
/*
 * 32-bit ADDVA: for every active (row, col) pair, add zn[row] to the
 * tile element at (row, col); i.e. the vector @vzn is added to each
 * active column of the tile @vzda.
 *
 * @vpn governs rows and @vpm governs columns.  The predicates are read
 * as 64-bit words: each 32-bit element owns 4 predicate bits (only the
 * lowest is tested), so one word covers 16 elements.
 */
void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
                         void *vpm, uint32_t desc)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    uint64_t *pn = vpn, *pm = vpm;
    uint32_t *zda = vzda, *zn = vzn;

    for (row = 0; row < oprsz; ) {
        /* Predicate word for the next 16 rows. */
        uint64_t pa = pn[row >> 4];
        do {
            if (pa & 1) {
                /* The same source element is added along the whole row. */
                uint32_t zn_row = zn[H4(row)];
                for (col = 0; col < oprsz; ) {
                    /* Predicate word for the next 16 columns. */
                    uint64_t pb = pm[col >> 4];
                    do {
                        if (pb & 1) {
                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
                        }
                        pb >>= 4;
                    } while (++col & 15);
                }
            }
            pa >>= 4;
        } while (++row & 15);
    }
}
870
871void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
872 void *vpm, uint32_t desc)
873{
874 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
875 uint8_t *pn = vpn, *pm = vpm;
876 uint64_t *zda = vzda, *zn = vzn;
877
878 for (row = 0; row < oprsz; ++row) {
879 if (pn[H1(row)] & 1) {
880 uint64_t zn_row = zn[row];
881 for (col = 0; col < oprsz; ++col) {
882 if (pm[H1(col)] & 1) {
883 zda[tile_vslice_index(row) + col] += zn_row;
884 }
885 }
886 }
887 }
888}
889
890void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
891 void *vpm, void *vst, uint32_t desc)
892{
893 intptr_t row, col, oprsz = simd_maxsz(desc);
894 uint32_t neg = simd_data(desc) << 31;
895 uint16_t *pn = vpn, *pm = vpm;
896 float_status fpst;
897
898
899
900
901
902
903 fpst = *(float_status *)vst;
904 set_default_nan_mode(true, &fpst);
905
906 for (row = 0; row < oprsz; ) {
907 uint16_t pa = pn[H2(row >> 4)];
908 do {
909 if (pa & 1) {
910 void *vza_row = vza + tile_vslice_offset(row);
911 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
912
913 for (col = 0; col < oprsz; ) {
914 uint16_t pb = pm[H2(col >> 4)];
915 do {
916 if (pb & 1) {
917 uint32_t *a = vza_row + H1_4(col);
918 uint32_t *m = vzm + H1_4(col);
919 *a = float32_muladd(n, *m, *a, 0, vst);
920 }
921 col += 4;
922 pb >>= 4;
923 } while (col & 15);
924 }
925 }
926 row += 4;
927 pa >>= 4;
928 } while (row & 15);
929 }
930}
931
932void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
933 void *vpm, void *vst, uint32_t desc)
934{
935 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
936 uint64_t neg = (uint64_t)simd_data(desc) << 63;
937 uint64_t *za = vza, *zn = vzn, *zm = vzm;
938 uint8_t *pn = vpn, *pm = vpm;
939 float_status fpst = *(float_status *)vst;
940
941 set_default_nan_mode(true, &fpst);
942
943 for (row = 0; row < oprsz; ++row) {
944 if (pn[H1(row)] & 1) {
945 uint64_t *za_row = &za[tile_vslice_index(row)];
946 uint64_t n = zn[row] ^ neg;
947
948 for (col = 0; col < oprsz; ++col) {
949 if (pm[H1(col)] & 1) {
950 uint64_t *a = &za_row[col];
951 *a = float64_muladd(n, zm[col], *a, 0, &fpst);
952 }
953 }
954 }
955 }
956}
957
958
959
960
961
/*
 * Prepare a pair of 16-bit values packed in @pair for a widening outer
 * product: XOR in the sign mask @neg, then zero whichever half is
 * inactive.  Bit 0 of @pg governs the low half, bit 2 the high half.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }
    /* Negating first and masking afterwards leaves inactive halves at 0. */
    return (pair ^ neg) & keep;
}
977
978static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
979 float_status *s_std, float_status *s_odd)
980{
981 float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
982 float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
983 float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
984 float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
985 float64 t64;
986 float32 t32;
987
988
989
990
991
992
993
994
995 t64 = float64_mul(e1r, e2r, s_odd);
996 t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
997
998
999 t32 = float64_to_float32(t64, s_std);
1000
1001
1002 return float32_add(sum, t32, s_std);
1003}
1004
1005void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1006 void *vpm, void *vst, uint32_t desc)
1007{
1008 intptr_t row, col, oprsz = simd_maxsz(desc);
1009 uint32_t neg = simd_data(desc) * 0x80008000u;
1010 uint16_t *pn = vpn, *pm = vpm;
1011 float_status fpst_odd, fpst_std;
1012
1013
1014
1015
1016
1017
1018 fpst_std = *(float_status *)vst;
1019 set_default_nan_mode(true, &fpst_std);
1020 fpst_odd = fpst_std;
1021 set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1022
1023 for (row = 0; row < oprsz; ) {
1024 uint16_t prow = pn[H2(row >> 4)];
1025 do {
1026 void *vza_row = vza + tile_vslice_offset(row);
1027 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1028
1029 n = f16mop_adj_pair(n, prow, neg);
1030
1031 for (col = 0; col < oprsz; ) {
1032 uint16_t pcol = pm[H2(col >> 4)];
1033 do {
1034 if (prow & pcol & 0b0101) {
1035 uint32_t *a = vza_row + H1_4(col);
1036 uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1037
1038 m = f16mop_adj_pair(m, pcol, 0);
1039 *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1040
1041 col += 4;
1042 pcol >>= 4;
1043 }
1044 } while (col & 15);
1045 }
1046 row += 4;
1047 prow >>= 4;
1048 } while (row & 15);
1049 }
1050}
1051
1052void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1053 void *vpm, uint32_t desc)
1054{
1055 intptr_t row, col, oprsz = simd_maxsz(desc);
1056 uint32_t neg = simd_data(desc) * 0x80008000u;
1057 uint16_t *pn = vpn, *pm = vpm;
1058
1059 for (row = 0; row < oprsz; ) {
1060 uint16_t prow = pn[H2(row >> 4)];
1061 do {
1062 void *vza_row = vza + tile_vslice_offset(row);
1063 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1064
1065 n = f16mop_adj_pair(n, prow, neg);
1066
1067 for (col = 0; col < oprsz; ) {
1068 uint16_t pcol = pm[H2(col >> 4)];
1069 do {
1070 if (prow & pcol & 0b0101) {
1071 uint32_t *a = vza_row + H1_4(col);
1072 uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1073
1074 m = f16mop_adj_pair(m, pcol, 0);
1075 *a = bfdotadd(*a, n, m);
1076
1077 col += 4;
1078 pcol >>= 4;
1079 }
1080 } while (col & 15);
1081 }
1082 row += 4;
1083 prow >>= 4;
1084 } while (row & 15);
1085 }
1086}
1087
/*
 * Per-element integer outer-product kernel: takes the n and m operands,
 * the current accumulator, the combined predicate byte and the negate
 * flag, and returns the new accumulator value.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Drive an integer outer product over a tile, 64 bits at a time.
 * For every (row, col) position the kernel @fn is called with zn[row],
 * zm[col], the current tile value and the AND of the row predicate
 * byte (@pn) and column predicate byte (@pm); its result replaces the
 * tile value.  The kernel is called for all positions: predication is
 * applied inside @fn.  simd_data(desc) supplies the negate flag.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool negate = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        const uint8_t row_pred = pn[H1(r)];
        uint64_t *acc = za + tile_vslice_index(r);
        const uint64_t nval = zn[r];

        for (c = 0; c < nelem; c++) {
            const uint8_t col_pred = pm[H1(c)];

            acc[c] = fn(nval, zm[c], acc[c], row_pred & col_pred, negate);
        }
    }
}
1110
/*
 * DEF_IMOP_32(NAME, NTYPE, MTYPE) defines the kernel for 8-bit inputs
 * accumulating into 32-bit lanes.  The 64-bit operands hold two 32-bit
 * lanes of four bytes each: sum0 collects the products of bytes 0..3,
 * sum1 those of bytes 4..7, with the bytes of @n converted to NTYPE and
 * those of @m to MTYPE.  The sums are then added to, or (if @neg)
 * subtracted from, the corresponding 32-bit halves of @a.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum0 = 0, sum1 = 0;                                            \
    /* Mask @n with the expanded predicate @p before multiplying. */        \
    n &= expand_pred_b(p);                                                  \
    sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                              \
    sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                              \
    sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                            \
    sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                            \
    sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                            \
    sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                            \
    sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                            \
    sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                            \
    if (neg) {                                                              \
        sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1;       \
    } else {                                                                \
        sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1;       \
    }                                                                       \
    return ((uint64_t)sum1 << 32) | sum0;                                   \
}
1132
/*
 * DEF_IMOP_64(NAME, NTYPE, MTYPE) defines the kernel for 16-bit inputs
 * accumulating into a 64-bit lane.  The four 16-bit fields of @n
 * (converted to NTYPE) are multiplied by the matching fields of @m
 * (converted to MTYPE), summed, and the total is added to, or (if @neg)
 * subtracted from, @a.
 *
 * The first factor is widened to int64_t so the multiply is carried
 * out in 64 bits.  Without that, both operands are promoted to int:
 * 0xffff * 0xffff then overflows a 32-bit int, and the product would
 * be sign-extended when added to the 64-bit sum.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Mask @n with the expanded predicate @p before multiplying. */        \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1145
/*
 * 8-bit -> 32-bit kernels.  The two type arguments give the types the
 * @n and @m bytes are converted to, respectively.
 */
DEF_IMOP_32(smopa_s, int8_t, int8_t)
DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
DEF_IMOP_32(usmopa_s, uint8_t, int8_t)

/* 16-bit -> 64-bit kernels, with the same operand-type convention. */
DEF_IMOP_64(smopa_d, int16_t, int16_t)
DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1155
/*
 * DEF_IMOPH(NAME) defines the TCG helper sme_<NAME>, which runs
 * do_imopa() over the tile with the kernel function NAME.
 */
#define DEF_IMOPH(NAME) \
    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
                            void *vpm, uint32_t desc)                        \
    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }

/* One helper per kernel defined above. */
DEF_IMOPH(smopa_s)
DEF_IMOPH(umopa_s)
DEF_IMOPH(sumopa_s)
DEF_IMOPH(usmopa_s)
DEF_IMOPH(smopa_d)
DEF_IMOPH(umopa_d)
DEF_IMOPH(sumopa_d)
DEF_IMOPH(usmopa_d)
1169