1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "tcg/tcg-gvec-desc.h"
24#include "exec/helper-proto.h"
25#include "exec/cpu_ldst.h"
26#include "exec/exec-all.h"
27#include "qemu/int128.h"
28#include "fpu/softfloat.h"
29#include "vec_internal.h"
30#include "sve_ldst_internal.h"
31
32
33void arm_reset_sve_state(CPUARMState *env)
34{
35 memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
36
37 memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
38 vfp_set_fpcr(env, 0x0800009f);
39}
40
41void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
42{
43 if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
44 return;
45 }
46 env->svcr ^= R_SVCR_SM_MASK;
47 arm_reset_sve_state(env);
48}
49
50void helper_set_pstate_za(CPUARMState *env, uint32_t i)
51{
52 if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
53 return;
54 }
55 env->svcr ^= R_SVCR_ZA_MASK;
56
57
58
59
60
61
62
63
64
65 if (i) {
66 memset(env->zarray, 0, sizeof(env->zarray));
67 }
68}
69
70void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
71{
72 uint32_t i;
73
74
75
76
77
78
79 if (imm == 0xff) {
80 memset(env->zarray, 0, sizeof(env->zarray));
81 return;
82 }
83
84
85
86
87
88 for (i = 0; i < svl; i++) {
89 if (imm & (1 << (i % 8))) {
90 memset(&env->zarray[i], 0, svl);
91 }
92 }
93}
94
95
96
97
98
99
100
101
102
103
104
105
106
/*
 * Array index, in units of the element type being used to view the ZA
 * storage, of element @i of a vertical tile slice.  The scale factor is
 * sizeof(ARMVectorReg) whatever the element type is.
 */
#define tile_vslice_index(i) (sizeof(ARMVectorReg) * (i))
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
 * Byte offset within the ZA storage of the vertical-slice element that
 * lies @byteoff bytes into the slice.  As with the index form, the scale
 * factor is sizeof(ARMVectorReg); callers pass a @byteoff that is a
 * multiple of their element size.
 */
#define tile_vslice_offset(byteoff) (sizeof(ARMVectorReg) * (byteoff))
126
127
128
129
130
131#define DO_MOVA_C(NAME, TYPE, H) \
132void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
133{ \
134 int i, oprsz = simd_oprsz(desc); \
135 for (i = 0; i < oprsz; ) { \
136 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
137 do { \
138 if (pg & 1) { \
139 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
140 } \
141 i += sizeof(TYPE); \
142 pg >>= sizeof(TYPE); \
143 } while (i & 15); \
144 } \
145}
146
147DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
148DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
149DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
150
151void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
152{
153 int i, oprsz = simd_oprsz(desc) / 8;
154 uint8_t *pg = vg;
155 uint64_t *n = vn;
156 uint64_t *a = za;
157
158 for (i = 0; i < oprsz; i++) {
159 if (pg[H1(i)] & 1) {
160 a[tile_vslice_index(i)] = n[i];
161 }
162 }
163}
164
165void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
166{
167 int i, oprsz = simd_oprsz(desc) / 16;
168 uint16_t *pg = vg;
169 Int128 *n = vn;
170 Int128 *a = za;
171
172
173
174
175
176 for (i = 0; i < oprsz; i++) {
177 if (pg[H2(i)] & 1) {
178 a[tile_vslice_index(i)] = n[i];
179 }
180 }
181}
182
183#undef DO_MOVA_C
184
185
186
187
188#define DO_MOVA_Z(NAME, TYPE, H) \
189void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
190{ \
191 int i, oprsz = simd_oprsz(desc); \
192 for (i = 0; i < oprsz; ) { \
193 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
194 do { \
195 if (pg & 1) { \
196 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
197 } \
198 i += sizeof(TYPE); \
199 pg >>= sizeof(TYPE); \
200 } while (i & 15); \
201 } \
202}
203
204DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
205DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
206DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
207
208void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
209{
210 int i, oprsz = simd_oprsz(desc) / 8;
211 uint8_t *pg = vg;
212 uint64_t *d = vd;
213 uint64_t *a = za;
214
215 for (i = 0; i < oprsz; i++) {
216 if (pg[H1(i)] & 1) {
217 d[i] = a[tile_vslice_index(i)];
218 }
219 }
220}
221
222void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
223{
224 int i, oprsz = simd_oprsz(desc) / 16;
225 uint16_t *pg = vg;
226 Int128 *d = vd;
227 Int128 *a = za;
228
229
230
231
232
233 for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
234 if (pg[H2(i)] & 1) {
235 d[i] = a[tile_vslice_index(i)];
236 }
237 }
238}
239
240#undef DO_MOVA_Z
241
242
243
244
245
/*
 * Signature of a routine that zeroes @len bytes' worth of elements of a
 * tile slice, starting @off bytes into the slice based at @ptr.
 */
typedef void ClearFn(void *ptr, size_t off, size_t len);

/* Horizontal slices are contiguous, so clearing is a plain memset. */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    unsigned char *start = (unsigned char *)ptr + off;

    memset(start, 0, len);
}
252
/* Zero @len byte elements of a vertical slice, starting at element @off. */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint8_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += sizeof(uint8_t);
    }
}
259
/*
 * Zero @len bytes' worth of 16-bit elements of a vertical slice,
 * starting @off bytes into the slice.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint16_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += sizeof(uint16_t);
    }
}
266
/*
 * Zero @len bytes' worth of 32-bit elements of a vertical slice,
 * starting @off bytes into the slice.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint32_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += sizeof(uint32_t);
    }
}
273
/*
 * Zero @len bytes' worth of 64-bit elements of a vertical slice,
 * starting @off bytes into the slice.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        uint64_t *elt = vptr + tile_vslice_offset(off + done);

        *elt = 0;
        done += sizeof(uint64_t);
    }
}
280
/*
 * Zero @len bytes' worth of 128-bit elements of a vertical slice,
 * starting @off bytes into the slice.  Each element is cleared with a
 * 16-byte memset.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    size_t done = 0;

    while (done < len) {
        void *elt = vptr + tile_vslice_offset(off + done);

        memset(elt, 0, 16);
        done += 16;
    }
}
287
288
289
290
291
/*
 * Signature of a routine that copies @len bytes' worth of elements from
 * a contiguous source buffer into a tile slice.
 */
typedef void CopyFn(void *dst, const void *src, size_t len);

/*
 * Horizontal slices are contiguous, so the copy is a plain memcpy of
 * @len bytes from @src to @dst.
 */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    (void)memcpy(dst, src, len);
}
298
/* Scatter @len contiguous bytes from @vsrc into a vertical slice at @vdst. */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *in = vsrc;
    uint8_t *out = vdst;
    size_t e = 0;

    while (e < len) {
        out[tile_vslice_index(e)] = in[e];
        e++;
    }
}
309
/*
 * Scatter @len bytes of contiguous 16-bit elements from @vsrc into a
 * vertical slice at @vdst.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 2;
    const uint16_t *in = vsrc;
    uint16_t *out = vdst;
    size_t e = 0;

    while (e < count) {
        out[tile_vslice_index(e)] = in[e];
        e++;
    }
}
320
/*
 * Scatter @len bytes of contiguous 32-bit elements from @vsrc into a
 * vertical slice at @vdst.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 4;
    const uint32_t *in = vsrc;
    uint32_t *out = vdst;
    size_t e = 0;

    while (e < count) {
        out[tile_vslice_index(e)] = in[e];
        e++;
    }
}
331
/*
 * Scatter @len bytes of contiguous 64-bit elements from @vsrc into a
 * vertical slice at @vdst.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const size_t count = len / 8;
    const uint64_t *in = vsrc;
    uint64_t *out = vdst;
    size_t e = 0;

    while (e < count) {
        out[tile_vslice_index(e)] = in[e];
        e++;
    }
}
342
/*
 * Scatter @len bytes of contiguous 128-bit elements from @vsrc into a
 * vertical slice at @vdst, 16 bytes at a time.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    size_t done = 0;

    while (done < len) {
        memcpy(vdst + tile_vslice_offset(done), vsrc + done, 16);
        done += 16;
    }
}
349
350
351
352
353
/*
 * Generate the host-memory and TLB load primitives for one element
 * type, storing the loaded value into a vertical tile slice.  @off is a
 * byte offset within the slice and is scaled by tile_vslice_offset().
 */
#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = HOST(host);                                               \
    *slot = loaded;                                                         \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE *slot = za + tile_vslice_offset(off);                              \
    TYPE loaded = TLB(env, useronly_clean_ptr(addr), ra);                   \
    *slot = loaded;                                                         \
}
366
/*
 * Generate the host-memory and TLB store primitives for one element
 * type, reading the value to store from a vertical tile slice.  @off is
 * a byte offset within the slice and is scaled by tile_vslice_offset().
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    HOST(host, stored);                                                     \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    const TYPE *slot = za + tile_vslice_offset(off);                        \
    TYPE stored = *slot;                                                    \
    TLB(env, useronly_clean_ptr(addr), stored, ra);                         \
}
379
380
381
382
383
384
/*
 * Generate 128-bit load primitives.  HNAME is the horizontal form (@off
 * is a plain byte offset) and VNAME##_v the vertical form (which scales
 * @off by tile_vslice_offset() and defers to the horizontal one).
 *
 * The quantity is fetched as two 64-bit halves, the first from the
 * lower address.  For BE the halves are swapped when written to the
 * register: the first-loaded half goes to ptr[BE], the second to
 * ptr[!BE].
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *dst = za + off;                                               \
    uint64_t first = HOST(host);                                            \
    uint64_t second = HOST(host + 8);                                       \
    dst[BE] = first;                                                        \
    dst[!BE] = second;                                                      \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *dst = za + off;                                               \
    uint64_t first = TLB(env, useronly_clean_ptr(addr), ra);                \
    uint64_t second = TLB(env, useronly_clean_ptr(addr + 8), ra);           \
    dst[BE] = first;                                                        \
    dst[!BE] = second;                                                      \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
409
/*
 * Generate 128-bit store primitives.  HNAME is the horizontal form (@off
 * is a plain byte offset) and VNAME##_v the vertical form (which scales
 * @off by tile_vslice_offset() and defers to the horizontal one).
 *
 * The quantity is written as two 64-bit halves: ptr[BE] to the lower
 * address and ptr[!BE] eight bytes above it.  The host and TLB variants
 * must use the same +8 displacement for the second half.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    /* Second half lives 8 bytes on, exactly as in the TLB variant. */      \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
433
434DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
435DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
436DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
437DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
438DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
439DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
440DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
441
442DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
443DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
444
445DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
446DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
447DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
448DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
449DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
450DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
451DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
452
453DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
454DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
455
456#undef DO_LD
457#undef DO_ST
458#undef DO_LDQ
459#undef DO_STQ
460
461
462
463
464
465static inline QEMU_ALWAYS_INLINE
466void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
467 const target_ulong addr, uint32_t desc, const uintptr_t ra,
468 const int esz, uint32_t mtedesc, bool vertical,
469 sve_ldst1_host_fn *host_fn,
470 sve_ldst1_tlb_fn *tlb_fn,
471 ClearFn *clr_fn,
472 CopyFn *cpy_fn)
473{
474 const intptr_t reg_max = simd_oprsz(desc);
475 const intptr_t esize = 1 << esz;
476 intptr_t reg_off, reg_last;
477 SVEContLdSt info;
478 void *host;
479 int flags;
480
481
482 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
483
484 clr_fn(za, 0, reg_max);
485 return;
486 }
487
488
489 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
490
491
492 sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
493 BP_MEM_READ, ra);
494
495
496
497
498
499 if (mtedesc) {
500 sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
501 mtedesc, ra);
502 }
503
504 flags = info.page[0].flags | info.page[1].flags;
505 if (unlikely(flags != 0)) {
506#ifdef CONFIG_USER_ONLY
507 g_assert_not_reached();
508#else
509
510
511
512
513
514
515 ARMVectorReg scratch = { };
516
517 reg_off = info.reg_off_first[0];
518 reg_last = info.reg_off_last[1];
519 if (reg_last < 0) {
520 reg_last = info.reg_off_split;
521 if (reg_last < 0) {
522 reg_last = info.reg_off_last[0];
523 }
524 }
525
526 do {
527 uint64_t pg = vg[reg_off >> 6];
528 do {
529 if ((pg >> (reg_off & 63)) & 1) {
530 tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
531 }
532 reg_off += esize;
533 } while (reg_off & 63);
534 } while (reg_off <= reg_last);
535
536 cpy_fn(za, &scratch, reg_max);
537 return;
538#endif
539 }
540
541
542
543 reg_off = info.reg_off_first[0];
544 reg_last = info.reg_off_last[0];
545 host = info.page[0].host;
546
547 if (!vertical) {
548 memset(za, 0, reg_max);
549 } else if (reg_off) {
550 clr_fn(za, 0, reg_off);
551 }
552
553 while (reg_off <= reg_last) {
554 uint64_t pg = vg[reg_off >> 6];
555 do {
556 if ((pg >> (reg_off & 63)) & 1) {
557 host_fn(za, reg_off, host + reg_off);
558 } else if (vertical) {
559 clr_fn(za, reg_off, esize);
560 }
561 reg_off += esize;
562 } while (reg_off <= reg_last && (reg_off & 63));
563 }
564
565
566
567
568
569 reg_off = info.reg_off_split;
570 if (unlikely(reg_off >= 0)) {
571 tlb_fn(env, za, reg_off, addr + reg_off, ra);
572 }
573
574 reg_off = info.reg_off_first[1];
575 if (unlikely(reg_off >= 0)) {
576 reg_last = info.reg_off_last[1];
577 host = info.page[1].host;
578
579 do {
580 uint64_t pg = vg[reg_off >> 6];
581 do {
582 if ((pg >> (reg_off & 63)) & 1) {
583 host_fn(za, reg_off, host + reg_off);
584 } else if (vertical) {
585 clr_fn(za, reg_off, esize);
586 }
587 reg_off += esize;
588 } while (reg_off & 63);
589 } while (reg_off <= reg_last);
590 }
591}
592
593static inline QEMU_ALWAYS_INLINE
594void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
595 target_ulong addr, uint32_t desc, uintptr_t ra,
596 const int esz, bool vertical,
597 sve_ldst1_host_fn *host_fn,
598 sve_ldst1_tlb_fn *tlb_fn,
599 ClearFn *clr_fn,
600 CopyFn *cpy_fn)
601{
602 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
603 int bit55 = extract64(addr, 55, 1);
604
605
606 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
607
608
609 if (!tbi_check(desc, bit55) ||
610 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
611 mtedesc = 0;
612 }
613
614 sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
615 host_fn, tlb_fn, clr_fn, cpy_fn);
616}
617
618#define DO_LD(L, END, ESZ) \
619void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
620 target_ulong addr, uint32_t desc) \
621{ \
622 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
623 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
624 clear_horizontal, copy_horizontal); \
625} \
626void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
627 target_ulong addr, uint32_t desc) \
628{ \
629 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
630 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
631 clear_vertical_##L, copy_vertical_##L); \
632} \
633void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
634 target_ulong addr, uint32_t desc) \
635{ \
636 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
637 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
638 clear_horizontal, copy_horizontal); \
639} \
640void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
641 target_ulong addr, uint32_t desc) \
642{ \
643 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
644 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
645 clear_vertical_##L, copy_vertical_##L); \
646}
647
648DO_LD(b, , MO_8)
649DO_LD(h, _be, MO_16)
650DO_LD(h, _le, MO_16)
651DO_LD(s, _be, MO_32)
652DO_LD(s, _le, MO_32)
653DO_LD(d, _be, MO_64)
654DO_LD(d, _le, MO_64)
655DO_LD(q, _be, MO_128)
656DO_LD(q, _le, MO_128)
657
658#undef DO_LD
659
660
661
662
663
664static inline QEMU_ALWAYS_INLINE
665void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
666 const target_ulong addr, uint32_t desc, const uintptr_t ra,
667 const int esz, uint32_t mtedesc, bool vertical,
668 sve_ldst1_host_fn *host_fn,
669 sve_ldst1_tlb_fn *tlb_fn)
670{
671 const intptr_t reg_max = simd_oprsz(desc);
672 const intptr_t esize = 1 << esz;
673 intptr_t reg_off, reg_last;
674 SVEContLdSt info;
675 void *host;
676 int flags;
677
678
679 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
680
681 return;
682 }
683
684
685 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
686
687
688 sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
689 BP_MEM_WRITE, ra);
690
691
692
693
694
695 if (mtedesc) {
696 sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
697 mtedesc, ra);
698 }
699
700 flags = info.page[0].flags | info.page[1].flags;
701 if (unlikely(flags != 0)) {
702#ifdef CONFIG_USER_ONLY
703 g_assert_not_reached();
704#else
705
706
707
708
709
710
711 reg_off = info.reg_off_first[0];
712 reg_last = info.reg_off_last[1];
713 if (reg_last < 0) {
714 reg_last = info.reg_off_split;
715 if (reg_last < 0) {
716 reg_last = info.reg_off_last[0];
717 }
718 }
719
720 do {
721 uint64_t pg = vg[reg_off >> 6];
722 do {
723 if ((pg >> (reg_off & 63)) & 1) {
724 tlb_fn(env, za, reg_off, addr + reg_off, ra);
725 }
726 reg_off += esize;
727 } while (reg_off & 63);
728 } while (reg_off <= reg_last);
729 return;
730#endif
731 }
732
733 reg_off = info.reg_off_first[0];
734 reg_last = info.reg_off_last[0];
735 host = info.page[0].host;
736
737 while (reg_off <= reg_last) {
738 uint64_t pg = vg[reg_off >> 6];
739 do {
740 if ((pg >> (reg_off & 63)) & 1) {
741 host_fn(za, reg_off, host + reg_off);
742 }
743 reg_off += 1 << esz;
744 } while (reg_off <= reg_last && (reg_off & 63));
745 }
746
747
748
749
750
751 reg_off = info.reg_off_split;
752 if (unlikely(reg_off >= 0)) {
753 tlb_fn(env, za, reg_off, addr + reg_off, ra);
754 }
755
756 reg_off = info.reg_off_first[1];
757 if (unlikely(reg_off >= 0)) {
758 reg_last = info.reg_off_last[1];
759 host = info.page[1].host;
760
761 do {
762 uint64_t pg = vg[reg_off >> 6];
763 do {
764 if ((pg >> (reg_off & 63)) & 1) {
765 host_fn(za, reg_off, host + reg_off);
766 }
767 reg_off += 1 << esz;
768 } while (reg_off & 63);
769 } while (reg_off <= reg_last);
770 }
771}
772
773static inline QEMU_ALWAYS_INLINE
774void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
775 uint32_t desc, uintptr_t ra, int esz, bool vertical,
776 sve_ldst1_host_fn *host_fn,
777 sve_ldst1_tlb_fn *tlb_fn)
778{
779 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
780 int bit55 = extract64(addr, 55, 1);
781
782
783 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
784
785
786 if (!tbi_check(desc, bit55) ||
787 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
788 mtedesc = 0;
789 }
790
791 sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
792 vertical, host_fn, tlb_fn);
793}
794
795#define DO_ST(L, END, ESZ) \
796void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
797 target_ulong addr, uint32_t desc) \
798{ \
799 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
800 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
801} \
802void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
803 target_ulong addr, uint32_t desc) \
804{ \
805 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
806 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
807} \
808void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
809 target_ulong addr, uint32_t desc) \
810{ \
811 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
812 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
813} \
814void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
815 target_ulong addr, uint32_t desc) \
816{ \
817 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
818 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
819}
820
821DO_ST(b, , MO_8)
822DO_ST(h, _be, MO_16)
823DO_ST(h, _le, MO_16)
824DO_ST(s, _be, MO_32)
825DO_ST(s, _le, MO_32)
826DO_ST(d, _be, MO_64)
827DO_ST(d, _le, MO_64)
828DO_ST(q, _be, MO_128)
829DO_ST(q, _le, MO_128)
830
831#undef DO_ST
832
833void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
834 void *vpm, uint32_t desc)
835{
836 intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
837 uint64_t *pn = vpn, *pm = vpm;
838 uint32_t *zda = vzda, *zn = vzn;
839
840 for (row = 0; row < oprsz; ) {
841 uint64_t pa = pn[row >> 4];
842 do {
843 if (pa & 1) {
844 for (col = 0; col < oprsz; ) {
845 uint64_t pb = pm[col >> 4];
846 do {
847 if (pb & 1) {
848 zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
849 }
850 pb >>= 4;
851 } while (++col & 15);
852 }
853 }
854 pa >>= 4;
855 } while (++row & 15);
856 }
857}
858
859void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
860 void *vpm, uint32_t desc)
861{
862 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
863 uint8_t *pn = vpn, *pm = vpm;
864 uint64_t *zda = vzda, *zn = vzn;
865
866 for (row = 0; row < oprsz; ++row) {
867 if (pn[H1(row)] & 1) {
868 for (col = 0; col < oprsz; ++col) {
869 if (pm[H1(col)] & 1) {
870 zda[tile_vslice_index(row) + col] += zn[col];
871 }
872 }
873 }
874 }
875}
876
877void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
878 void *vpm, uint32_t desc)
879{
880 intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
881 uint64_t *pn = vpn, *pm = vpm;
882 uint32_t *zda = vzda, *zn = vzn;
883
884 for (row = 0; row < oprsz; ) {
885 uint64_t pa = pn[row >> 4];
886 do {
887 if (pa & 1) {
888 uint32_t zn_row = zn[H4(row)];
889 for (col = 0; col < oprsz; ) {
890 uint64_t pb = pm[col >> 4];
891 do {
892 if (pb & 1) {
893 zda[tile_vslice_index(row) + H4(col)] += zn_row;
894 }
895 pb >>= 4;
896 } while (++col & 15);
897 }
898 }
899 pa >>= 4;
900 } while (++row & 15);
901 }
902}
903
904void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
905 void *vpm, uint32_t desc)
906{
907 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
908 uint8_t *pn = vpn, *pm = vpm;
909 uint64_t *zda = vzda, *zn = vzn;
910
911 for (row = 0; row < oprsz; ++row) {
912 if (pn[H1(row)] & 1) {
913 uint64_t zn_row = zn[row];
914 for (col = 0; col < oprsz; ++col) {
915 if (pm[H1(col)] & 1) {
916 zda[tile_vslice_index(row) + col] += zn_row;
917 }
918 }
919 }
920 }
921}
922
923void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
924 void *vpm, void *vst, uint32_t desc)
925{
926 intptr_t row, col, oprsz = simd_maxsz(desc);
927 uint32_t neg = simd_data(desc) << 31;
928 uint16_t *pn = vpn, *pm = vpm;
929 float_status fpst;
930
931
932
933
934
935
936 fpst = *(float_status *)vst;
937 set_default_nan_mode(true, &fpst);
938
939 for (row = 0; row < oprsz; ) {
940 uint16_t pa = pn[H2(row >> 4)];
941 do {
942 if (pa & 1) {
943 void *vza_row = vza + tile_vslice_offset(row);
944 uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
945
946 for (col = 0; col < oprsz; ) {
947 uint16_t pb = pm[H2(col >> 4)];
948 do {
949 if (pb & 1) {
950 uint32_t *a = vza_row + H1_4(col);
951 uint32_t *m = vzm + H1_4(col);
952 *a = float32_muladd(n, *m, *a, 0, vst);
953 }
954 col += 4;
955 pb >>= 4;
956 } while (col & 15);
957 }
958 }
959 row += 4;
960 pa >>= 4;
961 } while (row & 15);
962 }
963}
964
965void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
966 void *vpm, void *vst, uint32_t desc)
967{
968 intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
969 uint64_t neg = (uint64_t)simd_data(desc) << 63;
970 uint64_t *za = vza, *zn = vzn, *zm = vzm;
971 uint8_t *pn = vpn, *pm = vpm;
972 float_status fpst = *(float_status *)vst;
973
974 set_default_nan_mode(true, &fpst);
975
976 for (row = 0; row < oprsz; ++row) {
977 if (pn[H1(row)] & 1) {
978 uint64_t *za_row = &za[tile_vslice_index(row)];
979 uint64_t n = zn[row] ^ neg;
980
981 for (col = 0; col < oprsz; ++col) {
982 if (pm[H1(col)] & 1) {
983 uint64_t *a = &za_row[col];
984 *a = float64_muladd(n, zm[col], *a, 0, &fpst);
985 }
986 }
987 }
988 }
989}
990
991
992
993
994
/*
 * Prepare a pair of 16-bit elements packed in a 32-bit word.
 *
 * @pair: the two elements (low half = first, high half = second).
 * @pg:   predicate bits; bit 0 governs the low element, bit 2 the high.
 * @neg:  mask XORed into the pair before predication.
 *
 * Returns the pair with @neg applied and each inactive element zeroed.
 * Negating first and zeroing afterwards means an inactive element is
 * exactly zero regardless of @neg.
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    uint32_t keep = 0;

    if (pg & 1) {
        keep |= 0x0000ffffu;
    }
    if (pg & 4) {
        keep |= 0xffff0000u;
    }
    return (pair ^ neg) & keep;
}
1010
1011static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
1012 float_status *s_std, float_status *s_odd)
1013{
1014 float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
1015 float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
1016 float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
1017 float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
1018 float64 t64;
1019 float32 t32;
1020
1021
1022
1023
1024
1025
1026
1027
1028 t64 = float64_mul(e1r, e2r, s_odd);
1029 t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1030
1031
1032 t32 = float64_to_float32(t64, s_std);
1033
1034
1035 return float32_add(sum, t32, s_std);
1036}
1037
1038void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1039 void *vpm, void *vst, uint32_t desc)
1040{
1041 intptr_t row, col, oprsz = simd_maxsz(desc);
1042 uint32_t neg = simd_data(desc) * 0x80008000u;
1043 uint16_t *pn = vpn, *pm = vpm;
1044 float_status fpst_odd, fpst_std;
1045
1046
1047
1048
1049
1050
1051 fpst_std = *(float_status *)vst;
1052 set_default_nan_mode(true, &fpst_std);
1053 fpst_odd = fpst_std;
1054 set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1055
1056 for (row = 0; row < oprsz; ) {
1057 uint16_t prow = pn[H2(row >> 4)];
1058 do {
1059 void *vza_row = vza + tile_vslice_offset(row);
1060 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1061
1062 n = f16mop_adj_pair(n, prow, neg);
1063
1064 for (col = 0; col < oprsz; ) {
1065 uint16_t pcol = pm[H2(col >> 4)];
1066 do {
1067 if (prow & pcol & 0b0101) {
1068 uint32_t *a = vza_row + H1_4(col);
1069 uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1070
1071 m = f16mop_adj_pair(m, pcol, 0);
1072 *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1073
1074 col += 4;
1075 pcol >>= 4;
1076 }
1077 } while (col & 15);
1078 }
1079 row += 4;
1080 prow >>= 4;
1081 } while (row & 15);
1082 }
1083}
1084
1085void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1086 void *vpm, uint32_t desc)
1087{
1088 intptr_t row, col, oprsz = simd_maxsz(desc);
1089 uint32_t neg = simd_data(desc) * 0x80008000u;
1090 uint16_t *pn = vpn, *pm = vpm;
1091
1092 for (row = 0; row < oprsz; ) {
1093 uint16_t prow = pn[H2(row >> 4)];
1094 do {
1095 void *vza_row = vza + tile_vslice_offset(row);
1096 uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1097
1098 n = f16mop_adj_pair(n, prow, neg);
1099
1100 for (col = 0; col < oprsz; ) {
1101 uint16_t pcol = pm[H2(col >> 4)];
1102 do {
1103 if (prow & pcol & 0b0101) {
1104 uint32_t *a = vza_row + H1_4(col);
1105 uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1106
1107 m = f16mop_adj_pair(m, pcol, 0);
1108 *a = bfdotadd(*a, n, m);
1109
1110 col += 4;
1111 pcol >>= 4;
1112 }
1113 } while (col & 15);
1114 }
1115 row += 4;
1116 prow >>= 4;
1117 } while (row & 15);
1118 }
1119}
1120
/*
 * Per-element kernel for the integer outer-product helpers:
 * (n, m, accumulator, predicate byte, negate) -> new accumulator.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Driver for the integer MOPA/MOPS helpers.  The operands are walked in
 * 64-bit units: for each (row, col) pair the kernel @fn is handed the
 * 64-bit unit of ZN for the row, the unit of ZM for the column, the
 * current tile unit and the AND of the row and column predicate bytes.
 * simd_data(desc) selects the negating (subtracting) form.
 *
 * NOTE(review): the 32-bit kernels treat each 64-bit unit as two
 * independent 32-bit accumulators, so as written lane k of a ZN unit is
 * only ever combined with lane k of a ZM unit -- confirm this matches
 * the intended outer-product semantics for 32-bit tiles.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t units = simd_oprsz(desc) / 8;
    const bool subtract = simd_data(desc);

    for (intptr_t r = 0; r < units; ++r) {
        const uint8_t row_bits = pn[H1(r)];
        const uint64_t n = zn[r];
        uint64_t *out = &za[tile_vslice_index(r)];

        for (intptr_t c = 0; c < units; ++c) {
            const uint8_t col_bits = pm[H1(c)];

            out[c] = fn(n, zm[c], out[c], row_bits & col_bits, subtract);
        }
    }
}
1143
/*
 * Define a kernel that treats each 64-bit operand as eight 8-bit
 * values and the accumulator as two 32-bit lanes.  Inactive bytes of N
 * are masked to zero via expand_pred_b(p).  The four low byte products
 * are summed into the low lane and the four high byte products into the
 * high lane; each sum is then added to, or (when neg) subtracted from,
 * the matching 32-bit half of the accumulator, wrapping modulo 2^32.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t lo = 0, hi = 0;                                                \
    unsigned sh;                                                            \
                                                                            \
    n &= expand_pred_b(p);                                                  \
    for (sh = 0; sh < 32; sh += 8) {                                        \
        lo += (NTYPE)(n >> sh) * (MTYPE)(m >> sh);                          \
    }                                                                       \
    for (sh = 32; sh < 64; sh += 8) {                                       \
        hi += (NTYPE)(n >> sh) * (MTYPE)(m >> sh);                          \
    }                                                                       \
    if (neg) {                                                              \
        lo = (uint32_t)a - lo;                                              \
        hi = (uint32_t)(a >> 32) - hi;                                      \
    } else {                                                                \
        lo = (uint32_t)a + lo;                                              \
        hi = (uint32_t)(a >> 32) + hi;                                      \
    }                                                                       \
    return ((uint64_t)hi << 32) | lo;                                       \
}
1165
/*
 * Define a kernel that treats each 64-bit operand as four 16-bit
 * values and the accumulator as one 64-bit lane.  Inactive elements of
 * N are masked to zero via expand_pred_h(p).  The four products are
 * summed and then added to, or (when neg) subtracted from, the
 * accumulator, wrapping modulo 2^64.
 *
 * The N operand is widened to int64_t before multiplying.  Without
 * that, both 16-bit operands would be promoted to int, and an unsigned
 * 16 x 16 product (up to 65535 * 65535) would overflow a 32-bit int.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
                                                                            \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1178
1179DEF_IMOP_32(smopa_s, int8_t, int8_t)
1180DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
1181DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
1182DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
1183
1184DEF_IMOP_64(smopa_d, int16_t, int16_t)
1185DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
1186DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
1187DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1188
1189#define DEF_IMOPH(NAME) \
1190 void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \
1191 void *vpm, uint32_t desc) \
1192 { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1193
1194DEF_IMOPH(smopa_s)
1195DEF_IMOPH(umopa_s)
1196DEF_IMOPH(sumopa_s)
1197DEF_IMOPH(usmopa_s)
1198DEF_IMOPH(smopa_d)
1199DEF_IMOPH(umopa_d)
1200DEF_IMOPH(sumopa_d)
1201DEF_IMOPH(usmopa_d)
1202