/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/helper-proto.h"
23#include "tcg/tcg-gvec-desc.h"
24#include "fpu/softfloat.h"
25#include "qemu/int128.h"
26#include "vec_internal.h"
27
/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      expand_pred_b_data[i] = m;
 *  }
 */
41const uint64_t expand_pred_b_data[256] = {
42 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
43 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
44 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
45 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
46 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
47 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
48 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
49 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
50 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
51 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
52 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
53 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
54 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
55 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
56 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
57 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
58 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
59 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
60 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
61 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
62 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
63 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
64 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
65 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
66 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
67 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
68 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
69 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
70 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
71 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
72 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
73 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
74 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
75 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
76 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
77 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
78 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
79 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
80 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
81 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
82 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
83 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
84 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
85 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
86 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
87 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
88 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
89 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
90 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
91 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
92 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
93 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
94 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
95 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
96 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
97 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
98 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
99 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
100 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
101 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
102 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
103 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
104 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
105 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
106 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
107 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
108 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
109 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
110 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
111 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
112 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
113 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
114 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
115 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
116 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
117 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
118 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
119 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
120 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
121 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
122 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
123 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
124 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
125 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
126 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
127 0xffffffffffffffff,
128};
129
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
131int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
132 bool neg, bool round)
133{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
139 int32_t ret = (int32_t)src1 * src2;
140 if (neg) {
141 ret = -ret;
142 }
143 ret += ((int32_t)src3 << 7) + (round << 6);
144 ret >>= 7;
145
146 if (ret != (int8_t)ret) {
147 ret = (ret < 0 ? INT8_MIN : INT8_MAX);
148 }
149 return ret;
150}
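/*
 * Worked example of the arithmetic above (illustrative values only):
 * with src1 = src2 = 0x40 (0.5 in Q7), src3 = 0 and round = true,
 * ret = 0x1000 + 0x40 = 0x1040, then ret >>= 7 gives 0x20 (0.25 in Q7).
 * With src1 = src2 = INT8_MIN, src1 * src2 = 16384; after rounding and
 * the shift this yields 128, which saturates to INT8_MAX.
 */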
151
152void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
153 void *va, uint32_t desc)
154{
155 intptr_t i, opr_sz = simd_oprsz(desc);
156 int8_t *d = vd, *n = vn, *m = vm, *a = va;
157
158 for (i = 0; i < opr_sz; ++i) {
159 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
160 }
161}
162
163void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
164 void *va, uint32_t desc)
165{
166 intptr_t i, opr_sz = simd_oprsz(desc);
167 int8_t *d = vd, *n = vn, *m = vm, *a = va;
168
169 for (i = 0; i < opr_sz; ++i) {
170 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
171 }
172}
173
174void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
175{
176 intptr_t i, opr_sz = simd_oprsz(desc);
177 int8_t *d = vd, *n = vn, *m = vm;
178
179 for (i = 0; i < opr_sz; ++i) {
180 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
181 }
182}
183
184void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
185{
186 intptr_t i, opr_sz = simd_oprsz(desc);
187 int8_t *d = vd, *n = vn, *m = vm;
188
189 for (i = 0; i < opr_sz; ++i) {
190 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
191 }
192}
193
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
195int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
196 bool neg, bool round, uint32_t *sat)
197{
    /* Simplify similarly to do_sqrdmlah_b above.  */
199 int32_t ret = (int32_t)src1 * src2;
200 if (neg) {
201 ret = -ret;
202 }
203 ret += ((int32_t)src3 << 15) + (round << 14);
204 ret >>= 15;
205
206 if (ret != (int16_t)ret) {
207 *sat = 1;
208 ret = (ret < 0 ? INT16_MIN : INT16_MAX);
209 }
210 return ret;
211}
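/*
 * Note on the *sat argument used below: the Neon helpers point it at
 * env->vfp.qc[0] (or at the vq operand for the gvec forms), so any
 * saturation becomes sticky in FPSR.QC; the SVE2 helpers pass a local
 * "discard" variable and never examine it.
 */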
212
213uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
214 uint32_t src2, uint32_t src3)
215{
216 uint32_t *sat = &env->vfp.qc[0];
217 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
218 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
219 false, true, sat);
220 return deposit32(e1, 16, 16, e2);
221}
222
223void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
224 void *vq, uint32_t desc)
225{
226 uintptr_t opr_sz = simd_oprsz(desc);
227 int16_t *d = vd;
228 int16_t *n = vn;
229 int16_t *m = vm;
230 uintptr_t i;
231
232 for (i = 0; i < opr_sz / 2; ++i) {
233 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
234 }
235 clear_tail(d, opr_sz, simd_maxsz(desc));
236}
237
238uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
239 uint32_t src2, uint32_t src3)
240{
241 uint32_t *sat = &env->vfp.qc[0];
242 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
243 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
244 true, true, sat);
245 return deposit32(e1, 16, 16, e2);
246}
247
248void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
249 void *vq, uint32_t desc)
250{
251 uintptr_t opr_sz = simd_oprsz(desc);
252 int16_t *d = vd;
253 int16_t *n = vn;
254 int16_t *m = vm;
255 uintptr_t i;
256
257 for (i = 0; i < opr_sz / 2; ++i) {
258 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
259 }
260 clear_tail(d, opr_sz, simd_maxsz(desc));
261}
262
263void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
264 void *vq, uint32_t desc)
265{
266 intptr_t i, opr_sz = simd_oprsz(desc);
267 int16_t *d = vd, *n = vn, *m = vm;
268
269 for (i = 0; i < opr_sz / 2; ++i) {
270 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
271 }
272 clear_tail(d, opr_sz, simd_maxsz(desc));
273}
274
275void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
277{
278 intptr_t i, opr_sz = simd_oprsz(desc);
279 int16_t *d = vd, *n = vn, *m = vm;
280
281 for (i = 0; i < opr_sz / 2; ++i) {
282 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
283 }
284 clear_tail(d, opr_sz, simd_maxsz(desc));
285}
286
287void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
288 void *va, uint32_t desc)
289{
290 intptr_t i, opr_sz = simd_oprsz(desc);
291 int16_t *d = vd, *n = vn, *m = vm, *a = va;
292 uint32_t discard;
293
294 for (i = 0; i < opr_sz / 2; ++i) {
295 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
296 }
297}
298
299void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
300 void *va, uint32_t desc)
301{
302 intptr_t i, opr_sz = simd_oprsz(desc);
303 int16_t *d = vd, *n = vn, *m = vm, *a = va;
304 uint32_t discard;
305
306 for (i = 0; i < opr_sz / 2; ++i) {
307 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
308 }
309}
310
311void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
312{
313 intptr_t i, opr_sz = simd_oprsz(desc);
314 int16_t *d = vd, *n = vn, *m = vm;
315 uint32_t discard;
316
317 for (i = 0; i < opr_sz / 2; ++i) {
318 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
319 }
320}
321
322void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
323{
324 intptr_t i, opr_sz = simd_oprsz(desc);
325 int16_t *d = vd, *n = vn, *m = vm;
326 uint32_t discard;
327
328 for (i = 0; i < opr_sz / 2; ++i) {
329 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
330 }
331}
332
333void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
334{
335 intptr_t i, j, opr_sz = simd_oprsz(desc);
336 int idx = simd_data(desc);
337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338 uint32_t discard;
339
340 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
341 int16_t mm = m[i];
342 for (j = 0; j < 16 / 2; ++j) {
343 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
344 }
345 }
346}
347
348void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
349{
350 intptr_t i, j, opr_sz = simd_oprsz(desc);
351 int idx = simd_data(desc);
352 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
353 uint32_t discard;
354
355 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
356 int16_t mm = m[i];
357 for (j = 0; j < 16 / 2; ++j) {
358 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
359 }
360 }
361}
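/*
 * In the indexed forms above, simd_data(desc) selects one halfword per
 * 128-bit segment (16 / 2 lanes); that single element is then multiplied
 * against every lane of its segment.
 */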
362
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
364int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
365 bool neg, bool round, uint32_t *sat)
366{
    /* Simplify similarly to do_sqrdmlah_b above.  */
368 int64_t ret = (int64_t)src1 * src2;
369 if (neg) {
370 ret = -ret;
371 }
372 ret += ((int64_t)src3 << 31) + (round << 30);
373 ret >>= 31;
374
375 if (ret != (int32_t)ret) {
376 *sat = 1;
377 ret = (ret < 0 ? INT32_MIN : INT32_MAX);
378 }
379 return ret;
380}
381
382uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
383 int32_t src2, int32_t src3)
384{
385 uint32_t *sat = &env->vfp.qc[0];
386 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
387}
388
389void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
390 void *vq, uint32_t desc)
391{
392 uintptr_t opr_sz = simd_oprsz(desc);
393 int32_t *d = vd;
394 int32_t *n = vn;
395 int32_t *m = vm;
396 uintptr_t i;
397
398 for (i = 0; i < opr_sz / 4; ++i) {
399 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
400 }
401 clear_tail(d, opr_sz, simd_maxsz(desc));
402}
403
404uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
405 int32_t src2, int32_t src3)
406{
407 uint32_t *sat = &env->vfp.qc[0];
408 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
409}
410
411void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
412 void *vq, uint32_t desc)
413{
414 uintptr_t opr_sz = simd_oprsz(desc);
415 int32_t *d = vd;
416 int32_t *n = vn;
417 int32_t *m = vm;
418 uintptr_t i;
419
420 for (i = 0; i < opr_sz / 4; ++i) {
421 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
422 }
423 clear_tail(d, opr_sz, simd_maxsz(desc));
424}
425
426void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
427 void *vq, uint32_t desc)
428{
429 intptr_t i, opr_sz = simd_oprsz(desc);
430 int32_t *d = vd, *n = vn, *m = vm;
431
432 for (i = 0; i < opr_sz / 4; ++i) {
433 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
434 }
435 clear_tail(d, opr_sz, simd_maxsz(desc));
436}
437
438void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
439 void *vq, uint32_t desc)
440{
441 intptr_t i, opr_sz = simd_oprsz(desc);
442 int32_t *d = vd, *n = vn, *m = vm;
443
444 for (i = 0; i < opr_sz / 4; ++i) {
445 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
446 }
447 clear_tail(d, opr_sz, simd_maxsz(desc));
448}
449
450void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
451 void *va, uint32_t desc)
452{
453 intptr_t i, opr_sz = simd_oprsz(desc);
454 int32_t *d = vd, *n = vn, *m = vm, *a = va;
455 uint32_t discard;
456
457 for (i = 0; i < opr_sz / 4; ++i) {
458 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
459 }
460}
461
462void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
463 void *va, uint32_t desc)
464{
465 intptr_t i, opr_sz = simd_oprsz(desc);
466 int32_t *d = vd, *n = vn, *m = vm, *a = va;
467 uint32_t discard;
468
469 for (i = 0; i < opr_sz / 4; ++i) {
470 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
471 }
472}
473
474void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
475{
476 intptr_t i, opr_sz = simd_oprsz(desc);
477 int32_t *d = vd, *n = vn, *m = vm;
478 uint32_t discard;
479
480 for (i = 0; i < opr_sz / 4; ++i) {
481 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
482 }
483}
484
485void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
486{
487 intptr_t i, opr_sz = simd_oprsz(desc);
488 int32_t *d = vd, *n = vn, *m = vm;
489 uint32_t discard;
490
491 for (i = 0; i < opr_sz / 4; ++i) {
492 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
493 }
494}
495
496void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
497{
498 intptr_t i, j, opr_sz = simd_oprsz(desc);
499 int idx = simd_data(desc);
500 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
501 uint32_t discard;
502
503 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
504 int32_t mm = m[i];
505 for (j = 0; j < 16 / 4; ++j) {
506 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
507 }
508 }
509}
510
511void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
512{
513 intptr_t i, j, opr_sz = simd_oprsz(desc);
514 int idx = simd_data(desc);
515 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
516 uint32_t discard;
517
518 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
519 int32_t mm = m[i];
520 for (j = 0; j < 16 / 4; ++j) {
521 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
522 }
523 }
524}
525
/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
527static int64_t do_sat128_d(Int128 r)
528{
529 int64_t ls = int128_getlo(r);
530 int64_t hs = int128_gethi(r);
531
532 if (unlikely(hs != (ls >> 63))) {
533 return hs < 0 ? INT64_MIN : INT64_MAX;
534 }
535 return ls;
536}
537
538int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
539{
540 uint64_t l, h;
541 Int128 r, t;
542
    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
544 muls64(&l, &h, m, n);
545 r = int128_make128(l, h);
546 if (neg) {
547 r = int128_neg(r);
548 }
549 if (a) {
550 t = int128_exts64(a);
551 t = int128_lshift(t, 63);
552 r = int128_add(r, t);
553 }
554 if (round) {
555 t = int128_exts64(1ll << 62);
556 r = int128_add(r, t);
557 }
558 r = int128_rshift(r, 63);
559
560 return do_sat128_d(r);
561}
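/*
 * I.e. the 64-bit form computes ((+/-)(n * m) + (a << 63) + (round << 62)) >> 63
 * in 128-bit arithmetic and then saturates to 64 bits, mirroring the
 * narrower do_sqrdmlah_* helpers above.
 */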
562
563void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
564 void *va, uint32_t desc)
565{
566 intptr_t i, opr_sz = simd_oprsz(desc);
567 int64_t *d = vd, *n = vn, *m = vm, *a = va;
568
569 for (i = 0; i < opr_sz / 8; ++i) {
570 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
571 }
572}
573
574void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
575 void *va, uint32_t desc)
576{
577 intptr_t i, opr_sz = simd_oprsz(desc);
578 int64_t *d = vd, *n = vn, *m = vm, *a = va;
579
580 for (i = 0; i < opr_sz / 8; ++i) {
581 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
582 }
583}
584
585void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
586{
587 intptr_t i, opr_sz = simd_oprsz(desc);
588 int64_t *d = vd, *n = vn, *m = vm;
589
590 for (i = 0; i < opr_sz / 8; ++i) {
591 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
592 }
593}
594
595void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
596{
597 intptr_t i, opr_sz = simd_oprsz(desc);
598 int64_t *d = vd, *n = vn, *m = vm;
599
600 for (i = 0; i < opr_sz / 8; ++i) {
601 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
602 }
603}
604
605void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
606{
607 intptr_t i, j, opr_sz = simd_oprsz(desc);
608 int idx = simd_data(desc);
609 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
610
611 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
612 int64_t mm = m[i];
613 for (j = 0; j < 16 / 8; ++j) {
614 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
615 }
616 }
617}
618
619void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
620{
621 intptr_t i, j, opr_sz = simd_oprsz(desc);
622 int idx = simd_data(desc);
623 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
624
625 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
626 int64_t mm = m[i];
627 for (j = 0; j < 16 / 8; ++j) {
628 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
629 }
630 }
631}
632
/*
 * Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
640#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
641void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
642{ \
643 intptr_t i, opr_sz = simd_oprsz(desc); \
644 TYPED *d = vd, *a = va; \
645 TYPEN *n = vn; \
646 TYPEM *m = vm; \
647 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
648 d[i] = (a[i] + \
649 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
650 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
651 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
652 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
653 } \
654 clear_tail(d, opr_sz, simd_maxsz(desc)); \
655}
656
657DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
658DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
659DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
660DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
661DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
662
663#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
664void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
665{ \
666 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
667 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
668 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
669 intptr_t index = simd_data(desc); \
670 TYPED *d = vd, *a = va; \
671 TYPEN *n = vn; \
672 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
673 do { \
674 TYPED m0 = m_indexed[i * 4 + 0]; \
675 TYPED m1 = m_indexed[i * 4 + 1]; \
676 TYPED m2 = m_indexed[i * 4 + 2]; \
677 TYPED m3 = m_indexed[i * 4 + 3]; \
678 do { \
679 d[i] = (a[i] + \
680 n[i * 4 + 0] * m0 + \
681 n[i * 4 + 1] * m1 + \
682 n[i * 4 + 2] * m2 + \
683 n[i * 4 + 3] * m3); \
684 } while (++i < segend); \
685 segend = i + 4; \
686 } while (i < opr_sz_n); \
687 clear_tail(d, opr_sz, simd_maxsz(desc)); \
688}
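/*
 * Illustration for the byte forms: with TYPED = int32_t, each 16-byte
 * segment of m contributes only the group of four bytes selected by
 * 'index'; those four values (m0..m3) are reused for every four-byte
 * group of n within that segment.
 */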
689
690DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
691DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
692DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
693DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
694DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
695DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
696
697void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
698 void *vfpst, uint32_t desc)
699{
700 uintptr_t opr_sz = simd_oprsz(desc);
701 float16 *d = vd;
702 float16 *n = vn;
703 float16 *m = vm;
704 float_status *fpst = vfpst;
705 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
706 uint32_t neg_imag = neg_real ^ 1;
707 uintptr_t i;
708
    /* Shift boolean to the sign bit so we can xor to negate.  */
710 neg_real <<= 15;
711 neg_imag <<= 15;
712
713 for (i = 0; i < opr_sz / 2; i += 2) {
714 float16 e0 = n[H2(i)];
715 float16 e1 = m[H2(i + 1)] ^ neg_imag;
716 float16 e2 = n[H2(i + 1)];
717 float16 e3 = m[H2(i)] ^ neg_real;
718
719 d[H2(i)] = float16_add(e0, e1, fpst);
720 d[H2(i + 1)] = float16_add(e2, e3, fpst);
721 }
722 clear_tail(d, opr_sz, simd_maxsz(desc));
723}
724
725void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
726 void *vfpst, uint32_t desc)
727{
728 uintptr_t opr_sz = simd_oprsz(desc);
729 float32 *d = vd;
730 float32 *n = vn;
731 float32 *m = vm;
732 float_status *fpst = vfpst;
733 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
734 uint32_t neg_imag = neg_real ^ 1;
735 uintptr_t i;
736
    /* Shift boolean to the sign bit so we can xor to negate.  */
738 neg_real <<= 31;
739 neg_imag <<= 31;
740
741 for (i = 0; i < opr_sz / 4; i += 2) {
742 float32 e0 = n[H4(i)];
743 float32 e1 = m[H4(i + 1)] ^ neg_imag;
744 float32 e2 = n[H4(i + 1)];
745 float32 e3 = m[H4(i)] ^ neg_real;
746
747 d[H4(i)] = float32_add(e0, e1, fpst);
748 d[H4(i + 1)] = float32_add(e2, e3, fpst);
749 }
750 clear_tail(d, opr_sz, simd_maxsz(desc));
751}
752
753void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
754 void *vfpst, uint32_t desc)
755{
756 uintptr_t opr_sz = simd_oprsz(desc);
757 float64 *d = vd;
758 float64 *n = vn;
759 float64 *m = vm;
760 float_status *fpst = vfpst;
761 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
762 uint64_t neg_imag = neg_real ^ 1;
763 uintptr_t i;
764
    /* Shift boolean to the sign bit so we can xor to negate.  */
766 neg_real <<= 63;
767 neg_imag <<= 63;
768
769 for (i = 0; i < opr_sz / 8; i += 2) {
770 float64 e0 = n[i];
771 float64 e1 = m[i + 1] ^ neg_imag;
772 float64 e2 = n[i + 1];
773 float64 e3 = m[i] ^ neg_real;
774
775 d[i] = float64_add(e0, e1, fpst);
776 d[i + 1] = float64_add(e2, e3, fpst);
777 }
778 clear_tail(d, opr_sz, simd_maxsz(desc));
779}
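/*
 * For the FCADD helpers above, the single desc bit tested at entry
 * selects whether the real or the imaginary element of m is negated
 * before the pairwise add; that choice is what distinguishes the
 * 90 and 270 degree rotations.
 */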
780
781void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
782 void *vfpst, uint32_t desc)
783{
784 uintptr_t opr_sz = simd_oprsz(desc);
785 float16 *d = vd, *n = vn, *m = vm, *a = va;
786 float_status *fpst = vfpst;
787 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
788 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
789 uint32_t neg_real = flip ^ neg_imag;
790 uintptr_t i;
791
    /* Shift boolean to the sign bit so we can xor to negate.  */
793 neg_real <<= 15;
794 neg_imag <<= 15;
795
796 for (i = 0; i < opr_sz / 2; i += 2) {
797 float16 e2 = n[H2(i + flip)];
798 float16 e1 = m[H2(i + flip)] ^ neg_real;
799 float16 e4 = e2;
800 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
801
802 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
803 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
804 }
805 clear_tail(d, opr_sz, simd_maxsz(desc));
806}
807
808void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
809 void *vfpst, uint32_t desc)
810{
811 uintptr_t opr_sz = simd_oprsz(desc);
812 float16 *d = vd, *n = vn, *m = vm, *a = va;
813 float_status *fpst = vfpst;
814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
817 uint32_t neg_real = flip ^ neg_imag;
818 intptr_t elements = opr_sz / sizeof(float16);
819 intptr_t eltspersegment = 16 / sizeof(float16);
820 intptr_t i, j;
821
    /* Shift boolean to the sign bit so we can xor to negate.  */
823 neg_real <<= 15;
824 neg_imag <<= 15;
825
826 for (i = 0; i < elements; i += eltspersegment) {
827 float16 mr = m[H2(i + 2 * index + 0)];
828 float16 mi = m[H2(i + 2 * index + 1)];
829 float16 e1 = neg_real ^ (flip ? mi : mr);
830 float16 e3 = neg_imag ^ (flip ? mr : mi);
831
832 for (j = i; j < i + eltspersegment; j += 2) {
833 float16 e2 = n[H2(j + flip)];
834 float16 e4 = e2;
835
836 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
837 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
838 }
839 }
840 clear_tail(d, opr_sz, simd_maxsz(desc));
841}
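/*
 * In the indexed FCMLA form above, the complex pair (mr, mi) at position
 * 'index' within each 128-bit segment of m is applied to every complex
 * element of that segment.
 */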
842
843void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
844 void *vfpst, uint32_t desc)
845{
846 uintptr_t opr_sz = simd_oprsz(desc);
847 float32 *d = vd, *n = vn, *m = vm, *a = va;
848 float_status *fpst = vfpst;
849 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
850 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
851 uint32_t neg_real = flip ^ neg_imag;
852 uintptr_t i;
853
    /* Shift boolean to the sign bit so we can xor to negate.  */
855 neg_real <<= 31;
856 neg_imag <<= 31;
857
858 for (i = 0; i < opr_sz / 4; i += 2) {
859 float32 e2 = n[H4(i + flip)];
860 float32 e1 = m[H4(i + flip)] ^ neg_real;
861 float32 e4 = e2;
862 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
863
864 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
865 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
866 }
867 clear_tail(d, opr_sz, simd_maxsz(desc));
868}
869
870void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
871 void *vfpst, uint32_t desc)
872{
873 uintptr_t opr_sz = simd_oprsz(desc);
874 float32 *d = vd, *n = vn, *m = vm, *a = va;
875 float_status *fpst = vfpst;
876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
879 uint32_t neg_real = flip ^ neg_imag;
880 intptr_t elements = opr_sz / sizeof(float32);
881 intptr_t eltspersegment = 16 / sizeof(float32);
882 intptr_t i, j;
883
    /* Shift boolean to the sign bit so we can xor to negate.  */
885 neg_real <<= 31;
886 neg_imag <<= 31;
887
888 for (i = 0; i < elements; i += eltspersegment) {
889 float32 mr = m[H4(i + 2 * index + 0)];
890 float32 mi = m[H4(i + 2 * index + 1)];
891 float32 e1 = neg_real ^ (flip ? mi : mr);
892 float32 e3 = neg_imag ^ (flip ? mr : mi);
893
894 for (j = i; j < i + eltspersegment; j += 2) {
895 float32 e2 = n[H4(j + flip)];
896 float32 e4 = e2;
897
898 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
899 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
900 }
901 }
902 clear_tail(d, opr_sz, simd_maxsz(desc));
903}
904
905void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
906 void *vfpst, uint32_t desc)
907{
908 uintptr_t opr_sz = simd_oprsz(desc);
909 float64 *d = vd, *n = vn, *m = vm, *a = va;
910 float_status *fpst = vfpst;
911 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
912 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
913 uint64_t neg_real = flip ^ neg_imag;
914 uintptr_t i;
915
    /* Shift boolean to the sign bit so we can xor to negate.  */
917 neg_real <<= 63;
918 neg_imag <<= 63;
919
920 for (i = 0; i < opr_sz / 8; i += 2) {
921 float64 e2 = n[i + flip];
922 float64 e1 = m[i + flip] ^ neg_real;
923 float64 e4 = e2;
924 float64 e3 = m[i + 1 - flip] ^ neg_imag;
925
926 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
927 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
928 }
929 clear_tail(d, opr_sz, simd_maxsz(desc));
930}
931
/*
 * Floating point comparisons producing an integer mask.
 *
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
937static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
938{
939 return -float16_eq_quiet(op1, op2, stat);
940}
941
942static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
943{
944 return -float32_eq_quiet(op1, op2, stat);
945}
946
947static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
948{
949 return -float16_le(op2, op1, stat);
950}
951
952static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
953{
954 return -float32_le(op2, op1, stat);
955}
956
957static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
958{
959 return -float16_lt(op2, op1, stat);
960}
961
962static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
963{
964 return -float32_lt(op2, op1, stat);
965}
966
967static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
968{
969 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
970}
971
972static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
973{
974 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
975}
976
977static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
978{
979 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
980}
981
982static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
983{
984 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
985}
986
987static int16_t vfp_tosszh(float16 x, void *fpstp)
988{
989 float_status *fpst = fpstp;
990 if (float16_is_any_nan(x)) {
991 float_raise(float_flag_invalid, fpst);
992 return 0;
993 }
994 return float16_to_int16_round_to_zero(x, fpst);
995}
996
997static uint16_t vfp_touszh(float16 x, void *fpstp)
998{
999 float_status *fpst = fpstp;
1000 if (float16_is_any_nan(x)) {
1001 float_raise(float_flag_invalid, fpst);
1002 return 0;
1003 }
1004 return float16_to_uint16_round_to_zero(x, fpst);
1005}
1006
1007#define DO_2OP(NAME, FUNC, TYPE) \
1008void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
1009{ \
1010 intptr_t i, oprsz = simd_oprsz(desc); \
1011 TYPE *d = vd, *n = vn; \
1012 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1013 d[i] = FUNC(n[i], stat); \
1014 } \
1015 clear_tail(d, oprsz, simd_maxsz(desc)); \
1016}
1017
1018DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1019DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1020DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1021
1022DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1023DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1024DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1025
1026DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1027DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1028
1029DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1030DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1031DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1032DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1033DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1034DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1035DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1036DO_2OP(gvec_touszh, vfp_touszh, float16)
1037
1038#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1039 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1040 { \
1041 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1042 }
1043
1044#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1045 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1046 { \
1047 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1048 }
1049
1050#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1051 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1052 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1053 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1054 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1055
1056DO_2OP_CMP0(cgt, cgt, FWD)
1057DO_2OP_CMP0(cge, cge, FWD)
1058DO_2OP_CMP0(ceq, ceq, FWD)
1059DO_2OP_CMP0(clt, cgt, REV)
1060DO_2OP_CMP0(cle, cge, REV)
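/*
 * The REV wrappers swap the operands, so e.g. float16_clt0(op) is
 * implemented as float16_cgt(zero, op): compare-against-zero needs only
 * the forward comparison helpers defined above.
 */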
1061
1062#undef DO_2OP
1063#undef DO_2OP_CMP0
1064
/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
1068static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1069{
1070 float16 result = float16_mul(op1, op1, stat);
1071 if (!float16_is_any_nan(result)) {
1072 result = float16_set_sign(result, op2 & 1);
1073 }
1074 return result;
1075}
1076
1077static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1078{
1079 float32 result = float32_mul(op1, op1, stat);
1080 if (!float32_is_any_nan(result)) {
1081 result = float32_set_sign(result, op2 & 1);
1082 }
1083 return result;
1084}
1085
1086static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1087{
1088 float64 result = float64_mul(op1, op1, stat);
1089 if (!float64_is_any_nan(result)) {
1090 result = float64_set_sign(result, op2 & 1);
1091 }
1092 return result;
1093}
1094
1095static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1096{
1097 return float16_abs(float16_sub(op1, op2, stat));
1098}
1099
1100static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1101{
1102 return float32_abs(float32_sub(op1, op2, stat));
1103}
1104
/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
1109static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1110{
1111 op1 = float16_squash_input_denormal(op1, stat);
1112 op2 = float16_squash_input_denormal(op2, stat);
1113
1114 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1115 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1116 return float16_two;
1117 }
1118 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1119}
1120
1121static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1122{
1123 op1 = float32_squash_input_denormal(op1, stat);
1124 op2 = float32_squash_input_denormal(op2, stat);
1125
1126 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1127 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1128 return float32_two;
1129 }
1130 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1131}
1132
/* Reciprocal square-root step.  Also AArch32 non-fused semantics.  */
1134static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1135{
1136 op1 = float16_squash_input_denormal(op1, stat);
1137 op2 = float16_squash_input_denormal(op2, stat);
1138
1139 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1140 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1141 return float16_one_point_five;
1142 }
1143 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1144 return float16_div(op1, float16_two, stat);
1145}
1146
1147static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1148{
1149 op1 = float32_squash_input_denormal(op1, stat);
1150 op2 = float32_squash_input_denormal(op2, stat);
1151
1152 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154 return float32_one_point_five;
1155 }
1156 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1157 return float32_div(op1, float32_two, stat);
1158}
1159
1160#define DO_3OP(NAME, FUNC, TYPE) \
1161void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1162{ \
1163 intptr_t i, oprsz = simd_oprsz(desc); \
1164 TYPE *d = vd, *n = vn, *m = vm; \
1165 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1166 d[i] = FUNC(n[i], m[i], stat); \
1167 } \
1168 clear_tail(d, oprsz, simd_maxsz(desc)); \
1169}
1170
1171DO_3OP(gvec_fadd_h, float16_add, float16)
1172DO_3OP(gvec_fadd_s, float32_add, float32)
1173DO_3OP(gvec_fadd_d, float64_add, float64)
1174
1175DO_3OP(gvec_fsub_h, float16_sub, float16)
1176DO_3OP(gvec_fsub_s, float32_sub, float32)
1177DO_3OP(gvec_fsub_d, float64_sub, float64)
1178
1179DO_3OP(gvec_fmul_h, float16_mul, float16)
1180DO_3OP(gvec_fmul_s, float32_mul, float32)
1181DO_3OP(gvec_fmul_d, float64_mul, float64)
1182
1183DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1184DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1185DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1186
1187DO_3OP(gvec_fabd_h, float16_abd, float16)
1188DO_3OP(gvec_fabd_s, float32_abd, float32)
1189
1190DO_3OP(gvec_fceq_h, float16_ceq, float16)
1191DO_3OP(gvec_fceq_s, float32_ceq, float32)
1192
1193DO_3OP(gvec_fcge_h, float16_cge, float16)
1194DO_3OP(gvec_fcge_s, float32_cge, float32)
1195
1196DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1197DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1198
1199DO_3OP(gvec_facge_h, float16_acge, float16)
1200DO_3OP(gvec_facge_s, float32_acge, float32)
1201
1202DO_3OP(gvec_facgt_h, float16_acgt, float16)
1203DO_3OP(gvec_facgt_s, float32_acgt, float32)
1204
1205DO_3OP(gvec_fmax_h, float16_max, float16)
1206DO_3OP(gvec_fmax_s, float32_max, float32)
1207
1208DO_3OP(gvec_fmin_h, float16_min, float16)
1209DO_3OP(gvec_fmin_s, float32_min, float32)
1210
1211DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1212DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1213
1214DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1215DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1216
1217DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1218DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1219
1220DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1221DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1222
1223#ifdef TARGET_AARCH64
1224
1225DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1226DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1227DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1228
1229DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1230DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1231DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1232
1233#endif
1234#undef DO_3OP
1235
/* Non-fused multiply-add: the product is rounded before the separate add.  */
1237static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1238 float_status *stat)
1239{
1240 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1241}
1242
1243static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1244 float_status *stat)
1245{
1246 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1247}
1248
1249static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1250 float_status *stat)
1251{
1252 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1253}
1254
1255static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1256 float_status *stat)
1257{
1258 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1259}
1260
/* Fused versions; these have the semantics Neon VFMA/VFMS want.  */
1262static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1263 float_status *stat)
1264{
1265 return float16_muladd(op1, op2, dest, 0, stat);
1266}
1267
1268static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1269 float_status *stat)
1270{
1271 return float32_muladd(op1, op2, dest, 0, stat);
1272}
1273
1274static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1275 float_status *stat)
1276{
1277 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1278}
1279
1280static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1281 float_status *stat)
1282{
1283 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1284}
1285
1286#define DO_MULADD(NAME, FUNC, TYPE) \
1287void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1288{ \
1289 intptr_t i, oprsz = simd_oprsz(desc); \
1290 TYPE *d = vd, *n = vn, *m = vm; \
1291 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1292 d[i] = FUNC(d[i], n[i], m[i], stat); \
1293 } \
1294 clear_tail(d, oprsz, simd_maxsz(desc)); \
1295}
1296
1297DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1298DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1299
1300DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1301DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1302
1303DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1304DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1305
1306DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1307DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1308
/*
 * For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
1313#define DO_MUL_IDX(NAME, TYPE, H) \
1314void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1315{ \
1316 intptr_t i, j, oprsz = simd_oprsz(desc); \
1317 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1318 intptr_t idx = simd_data(desc); \
1319 TYPE *d = vd, *n = vn, *m = vm; \
1320 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1321 TYPE mm = m[H(i + idx)]; \
1322 for (j = 0; j < segment; j++) { \
1323 d[i + j] = n[i + j] * mm; \
1324 } \
1325 } \
1326 clear_tail(d, oprsz, simd_maxsz(desc)); \
1327}
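/*
 * E.g. gvec_mul_idx_h with idx == 3: each 16-byte segment multiplies all
 * eight of its halfword lanes by that segment's m[3].
 */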
1328
1329DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1330DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1331DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1332
1333#undef DO_MUL_IDX
1334
1335#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1336void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1337{ \
1338 intptr_t i, j, oprsz = simd_oprsz(desc); \
1339 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1340 intptr_t idx = simd_data(desc); \
1341 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1342 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1343 TYPE mm = m[H(i + idx)]; \
1344 for (j = 0; j < segment; j++) { \
1345 d[i + j] = a[i + j] OP n[i + j] * mm; \
1346 } \
1347 } \
1348 clear_tail(d, oprsz, simd_maxsz(desc)); \
1349}
1350
1351DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1352DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1353DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1354
1355DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1356DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1357DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1358
1359#undef DO_MLA_IDX
1360
1361#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1363{ \
1364 intptr_t i, j, oprsz = simd_oprsz(desc); \
1365 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1366 intptr_t idx = simd_data(desc); \
1367 TYPE *d = vd, *n = vn, *m = vm; \
1368 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1369 TYPE mm = m[H(i + idx)]; \
1370 for (j = 0; j < segment; j++) { \
1371 d[i + j] = TYPE##_##ADD(d[i + j], \
1372 TYPE##_mul(n[i + j], mm, stat), stat); \
1373 } \
1374 } \
1375 clear_tail(d, oprsz, simd_maxsz(desc)); \
1376}
1377
1378#define float16_nop(N, M, S) (M)
1379#define float32_nop(N, M, S) (M)
1380#define float64_nop(N, M, S) (M)
1381
1382DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1383DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1384DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1385
/*
 * Non-fused multiply-accumulate by indexed element.  Unlike the fused
 * versions below, these accumulate into the destination register.
 */
1390DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1391DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1392DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1393DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1394
1395#undef float16_nop
1396#undef float32_nop
1397#undef float64_nop
1398#undef DO_FMUL_IDX
1399
1400#define DO_FMLA_IDX(NAME, TYPE, H) \
1401void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1402 void *stat, uint32_t desc) \
1403{ \
1404 intptr_t i, j, oprsz = simd_oprsz(desc); \
1405 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1406 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1407 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1408 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1409 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1410 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1411 TYPE mm = m[H(i + idx)]; \
1412 for (j = 0; j < segment; j++) { \
1413 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1414 mm, a[i + j], 0, stat); \
1415 } \
1416 } \
1417 clear_tail(d, oprsz, simd_maxsz(desc)); \
1418}
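/*
 * In DO_FMLA_IDX, op1_neg is a sign-bit mask applied to n for the fmls
 * forms, and the element index comes from the desc bits above that flag
 * (desc >> (SIMD_DATA_SHIFT + 1)).
 */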
1419
1420DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1421DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1422DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1423
1424#undef DO_FMLA_IDX
1425
1426#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1427void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1428{ \
1429 intptr_t i, oprsz = simd_oprsz(desc); \
1430 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1431 bool q = false; \
1432 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1433 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1434 if (dd < MIN) { \
1435 dd = MIN; \
1436 q = true; \
1437 } else if (dd > MAX) { \
1438 dd = MAX; \
1439 q = true; \
1440 } \
1441 d[i] = dd; \
1442 } \
1443 if (q) { \
1444 uint32_t *qc = vq; \
1445 qc[0] = 1; \
1446 } \
1447 clear_tail(d, oprsz, simd_maxsz(desc)); \
1448}
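/*
 * The widened WTYPE holds the unsaturated result exactly, so saturation
 * reduces to a range check against MIN/MAX; any clamp makes the sticky
 * QC bit visible through the vq pointer.
 */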
1449
1450DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1451DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1452DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1453
1454DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1455DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1456DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1457
1458DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1459DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1460DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1461
1462DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1463DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1464DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1465
1466#undef DO_SAT
1467
1468void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1469 void *vm, uint32_t desc)
1470{
1471 intptr_t i, oprsz = simd_oprsz(desc);
1472 uint64_t *d = vd, *n = vn, *m = vm;
1473 bool q = false;
1474
1475 for (i = 0; i < oprsz / 8; i++) {
1476 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1477 if (dd < nn) {
1478 dd = UINT64_MAX;
1479 q = true;
1480 }
1481 d[i] = dd;
1482 }
1483 if (q) {
1484 uint32_t *qc = vq;
1485 qc[0] = 1;
1486 }
1487 clear_tail(d, oprsz, simd_maxsz(desc));
1488}
1489
1490void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1491 void *vm, uint32_t desc)
1492{
1493 intptr_t i, oprsz = simd_oprsz(desc);
1494 uint64_t *d = vd, *n = vn, *m = vm;
1495 bool q = false;
1496
1497 for (i = 0; i < oprsz / 8; i++) {
1498 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1499 if (nn < mm) {
1500 dd = 0;
1501 q = true;
1502 }
1503 d[i] = dd;
1504 }
1505 if (q) {
1506 uint32_t *qc = vq;
1507 qc[0] = 1;
1508 }
1509 clear_tail(d, oprsz, simd_maxsz(desc));
1510}
1511
1512void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1513 void *vm, uint32_t desc)
1514{
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516 int64_t *d = vd, *n = vn, *m = vm;
1517 bool q = false;
1518
1519 for (i = 0; i < oprsz / 8; i++) {
1520 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1521 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1522 dd = (nn >> 63) ^ ~INT64_MIN;
1523 q = true;
1524 }
1525 d[i] = dd;
1526 }
1527 if (q) {
1528 uint32_t *qc = vq;
1529 qc[0] = 1;
1530 }
1531 clear_tail(d, oprsz, simd_maxsz(desc));
1532}
1533
1534void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1535 void *vm, uint32_t desc)
1536{
1537 intptr_t i, oprsz = simd_oprsz(desc);
1538 int64_t *d = vd, *n = vn, *m = vm;
1539 bool q = false;
1540
1541 for (i = 0; i < oprsz / 8; i++) {
1542 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1543 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1544 dd = (nn >> 63) ^ ~INT64_MIN;
1545 q = true;
1546 }
1547 d[i] = dd;
1548 }
1549 if (q) {
1550 uint32_t *qc = vq;
1551 qc[0] = 1;
1552 }
1553 clear_tail(d, oprsz, simd_maxsz(desc));
1554}
1555
/* Immediate shift helpers: simd_data(desc) holds the shift count.  */
1557#define DO_SRA(NAME, TYPE) \
1558void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1559{ \
1560 intptr_t i, oprsz = simd_oprsz(desc); \
1561 int shift = simd_data(desc); \
1562 TYPE *d = vd, *n = vn; \
1563 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1564 d[i] += n[i] >> shift; \
1565 } \
1566 clear_tail(d, oprsz, simd_maxsz(desc)); \
1567}
1568
1569DO_SRA(gvec_ssra_b, int8_t)
1570DO_SRA(gvec_ssra_h, int16_t)
1571DO_SRA(gvec_ssra_s, int32_t)
1572DO_SRA(gvec_ssra_d, int64_t)
1573
1574DO_SRA(gvec_usra_b, uint8_t)
1575DO_SRA(gvec_usra_h, uint16_t)
1576DO_SRA(gvec_usra_s, uint32_t)
1577DO_SRA(gvec_usra_d, uint64_t)
1578
1579#undef DO_SRA
1580
1581#define DO_RSHR(NAME, TYPE) \
1582void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1583{ \
1584 intptr_t i, oprsz = simd_oprsz(desc); \
1585 int shift = simd_data(desc); \
1586 TYPE *d = vd, *n = vn; \
1587 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1588 TYPE tmp = n[i] >> (shift - 1); \
1589 d[i] = (tmp >> 1) + (tmp & 1); \
1590 } \
1591 clear_tail(d, oprsz, simd_maxsz(desc)); \
1592}
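/*
 * Rounding in DO_RSHR: tmp keeps the last bit that will be shifted out,
 * so (tmp >> 1) + (tmp & 1) rounds to nearest.  E.g. shift = 2, n[i] = 7:
 * tmp = 3 and d[i] = 1 + 1 = 2, i.e. 7/4 rounded up.  DO_RSRA below
 * reuses the same trick while accumulating.
 */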
1593
1594DO_RSHR(gvec_srshr_b, int8_t)
1595DO_RSHR(gvec_srshr_h, int16_t)
1596DO_RSHR(gvec_srshr_s, int32_t)
1597DO_RSHR(gvec_srshr_d, int64_t)
1598
1599DO_RSHR(gvec_urshr_b, uint8_t)
1600DO_RSHR(gvec_urshr_h, uint16_t)
1601DO_RSHR(gvec_urshr_s, uint32_t)
1602DO_RSHR(gvec_urshr_d, uint64_t)
1603
1604#undef DO_RSHR
1605
1606#define DO_RSRA(NAME, TYPE) \
1607void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1608{ \
1609 intptr_t i, oprsz = simd_oprsz(desc); \
1610 int shift = simd_data(desc); \
1611 TYPE *d = vd, *n = vn; \
1612 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1613 TYPE tmp = n[i] >> (shift - 1); \
1614 d[i] += (tmp >> 1) + (tmp & 1); \
1615 } \
1616 clear_tail(d, oprsz, simd_maxsz(desc)); \
1617}
1618
1619DO_RSRA(gvec_srsra_b, int8_t)
1620DO_RSRA(gvec_srsra_h, int16_t)
1621DO_RSRA(gvec_srsra_s, int32_t)
1622DO_RSRA(gvec_srsra_d, int64_t)
1623
1624DO_RSRA(gvec_ursra_b, uint8_t)
1625DO_RSRA(gvec_ursra_h, uint16_t)
1626DO_RSRA(gvec_ursra_s, uint32_t)
1627DO_RSRA(gvec_ursra_d, uint64_t)
1628
1629#undef DO_RSRA
1630
1631#define DO_SRI(NAME, TYPE) \
1632void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1633{ \
1634 intptr_t i, oprsz = simd_oprsz(desc); \
1635 int shift = simd_data(desc); \
1636 TYPE *d = vd, *n = vn; \
1637 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1638 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1639 } \
1640 clear_tail(d, oprsz, simd_maxsz(desc)); \
1641}
1642
1643DO_SRI(gvec_sri_b, uint8_t)
1644DO_SRI(gvec_sri_h, uint16_t)
1645DO_SRI(gvec_sri_s, uint32_t)
1646DO_SRI(gvec_sri_d, uint64_t)
1647
1648#undef DO_SRI
1649
1650#define DO_SLI(NAME, TYPE) \
1651void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1652{ \
1653 intptr_t i, oprsz = simd_oprsz(desc); \
1654 int shift = simd_data(desc); \
1655 TYPE *d = vd, *n = vn; \
1656 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1657 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1658 } \
1659 clear_tail(d, oprsz, simd_maxsz(desc)); \
1660}
1661
1662DO_SLI(gvec_sli_b, uint8_t)
1663DO_SLI(gvec_sli_h, uint16_t)
1664DO_SLI(gvec_sli_s, uint32_t)
1665DO_SLI(gvec_sli_d, uint64_t)
1666
1667#undef DO_SLI
1668
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack + repack operation.
 */
1674static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1675{
1676 const int f16_bias = 15;
1677 const int f32_bias = 127;
1678 uint32_t sign = extract32(f16, 15, 1);
1679 uint32_t exp = extract32(f16, 10, 5);
1680 uint32_t frac = extract32(f16, 0, 10);
1681
1682 if (exp == 0x1f) {
        /* Inf or NaN.  */
1684 exp = 0xff;
1685 } else if (exp == 0) {
        /* Zero or denormal.  */
1687 if (frac != 0) {
1688 if (fz16) {
1689 frac = 0;
1690 } else {
                /*
                 * Denormal: these become normal float32 inputs.
                 * Shift the fraction up until its msb would become
                 * the implicit bit of the float32, mask that bit
                 * off again, and adjust the exponent to compensate
                 * for the shift.
                 */
1699 int shift = clz32(frac) - 21;
1700 frac = (frac << shift) & 0x3ff;
1701 exp = f32_bias - f16_bias - shift + 1;
1702 }
1703 }
1704 } else {
        /* Normal number: adjust the exponent bias.  */
1706 exp += f32_bias - f16_bias;
1707 }
1708 sign <<= 31;
1709 exp <<= 23;
1710 frac <<= 23 - 10;
1711
1712 return sign | exp | frac;
1713}
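/*
 * Spot checks of the conversion (illustrative values): f16 0x3c00 (1.0)
 * maps to f32 0x3f800000, and the smallest f16 denormal 0x0001 (2^-24)
 * maps to the normal f32 0x33800000 (also 2^-24).
 */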
1714
1715static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
1723 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1724}
1725
/*
 * Note that do_fmlal below requires oprsz == 8 or oprsz == 16;
 * the SVE2 forms further down have their own implementations.
 */
1731static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1732 uint32_t desc, bool fz16)
1733{
1734 intptr_t i, oprsz = simd_oprsz(desc);
1735 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1736 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1737 int is_q = oprsz == 16;
1738 uint64_t n_4, m_4;
1739
    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1741 n_4 = load4_f16(vn, is_q, is_2);
1742 m_4 = load4_f16(vm, is_q, is_2);
1743
    /* Negate all inputs for fmlsl at once.  */
1745 if (is_s) {
1746 n_4 ^= 0x8000800080008000ull;
1747 }
1748
1749 for (i = 0; i < oprsz / 4; i++) {
1750 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1751 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1752 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1753 }
1754 clear_tail(d, oprsz, simd_maxsz(desc));
1755}
1756
1757void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1758 void *venv, uint32_t desc)
1759{
1760 CPUARMState *env = venv;
1761 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1762 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1763}
1764
1765void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1766 void *venv, uint32_t desc)
1767{
1768 CPUARMState *env = venv;
1769 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1770 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1771}
1772
1773void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1774 void *venv, uint32_t desc)
1775{
1776 intptr_t i, oprsz = simd_oprsz(desc);
1777 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1778 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1779 CPUARMState *env = venv;
1780 float_status *status = &env->vfp.fp_status;
1781 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1782
1783 for (i = 0; i < oprsz; i += sizeof(float32)) {
1784 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1785 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1786 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1787 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1788 float32 aa = *(float32 *)(va + H1_4(i));
1789
1790 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1791 }
1792}
1793
1794static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1795 uint32_t desc, bool fz16)
1796{
1797 intptr_t i, oprsz = simd_oprsz(desc);
1798 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1799 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1800 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1801 int is_q = oprsz == 16;
1802 uint64_t n_4;
1803 float32 m_1;
1804
    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1806 n_4 = load4_f16(vn, is_q, is_2);
1807
    /* Negate all inputs for fmlsl at once.  */
1809 if (is_s) {
1810 n_4 ^= 0x8000800080008000ull;
1811 }
1812
1813 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1814
1815 for (i = 0; i < oprsz / 4; i++) {
1816 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1817 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1818 }
1819 clear_tail(d, oprsz, simd_maxsz(desc));
1820}
1821
1822void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1823 void *venv, uint32_t desc)
1824{
1825 CPUARMState *env = venv;
1826 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1827 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1828}
1829
1830void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1831 void *venv, uint32_t desc)
1832{
1833 CPUARMState *env = venv;
1834 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1835 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1836}
1837
1838void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1839 void *venv, uint32_t desc)
1840{
1841 intptr_t i, j, oprsz = simd_oprsz(desc);
1842 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1843 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1844 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1845 CPUARMState *env = venv;
1846 float_status *status = &env->vfp.fp_status;
1847 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1848
1849 for (i = 0; i < oprsz; i += 16) {
1850 float16 mm_16 = *(float16 *)(vm + i + idx);
1851 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1852
1853 for (j = 0; j < 16; j += sizeof(float32)) {
1854 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1855 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1856 float32 aa = *(float32 *)(va + H1_4(i + j));
1857
1858 *(float32 *)(vd + H1_4(i + j)) =
1859 float32_muladd(nn, mm, aa, 0, status);
1860 }
1861 }
1862}
1863
1864void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1865{
1866 intptr_t i, opr_sz = simd_oprsz(desc);
1867 int8_t *d = vd, *n = vn, *m = vm;
1868
1869 for (i = 0; i < opr_sz; ++i) {
1870 int8_t mm = m[i];
1871 int8_t nn = n[i];
1872 int8_t res = 0;
1873 if (mm >= 0) {
1874 if (mm < 8) {
1875 res = nn << mm;
1876 }
1877 } else {
1878 res = nn >> (mm > -8 ? -mm : 7);
1879 }
1880 d[i] = res;
1881 }
1882 clear_tail(d, opr_sz, simd_maxsz(desc));
1883}
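/*
 * In gvec_sshl_b above (and gvec_sshl_h below), a negative shift count
 * shifts right, and counts of -8 (resp. -16) or less are clamped to
 * element_bits - 1 so that only the sign bit survives.  For example
 * nn = -4, mm = -3 gives -4 >> 3 = -1.
 */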
1884
1885void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1886{
1887 intptr_t i, opr_sz = simd_oprsz(desc);
1888 int16_t *d = vd, *n = vn, *m = vm;
1889
1890 for (i = 0; i < opr_sz / 2; ++i) {
1891 int8_t mm = m[i];
1892 int16_t nn = n[i];
1893 int16_t res = 0;
1894 if (mm >= 0) {
1895 if (mm < 16) {
1896 res = nn << mm;
1897 }
1898 } else {
1899 res = nn >> (mm > -16 ? -mm : 15);
1900 }
1901 d[i] = res;
1902 }
1903 clear_tail(d, opr_sz, simd_maxsz(desc));
1904}
1905
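/*
 * Unsigned shift by a signed, per-element shift count (USHL).
 * As above, but out-of-range counts in either direction produce zero.
 */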
void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
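/*
 * For SVE2, simd_data(desc) selects the even (0) or odd (1) byte of
 * each halfword; the bytes are already in the positions required by
 * pmull_h, so no expansion step is needed.
 */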
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}

static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 32; ++i) {
        uint64_t mask = -((op1 >> i) & 1);
        result ^= (op2 << i) & mask;
    }
    return result;
}

void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
    }
}
#endif

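/*
 * Compare each element against zero, writing all-ones for true and
 * all-zeros for false.  For example, with int8_t and OP '==',
 * -(0 == 0) is -1, i.e. 0xff, while -(5 == 0) is 0.
 */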
#define DO_CMP0(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPE *)(vd + i) = -(nn OP 0); \
    } \
    clear_tail(vd, opr_sz, simd_maxsz(desc)); \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0

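/* Absolute difference: d = |n - m|, elementwise, without accumulation. */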
#define DO_ABD(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
 \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

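/* Absolute difference and accumulate: d += |n - m|, elementwise. */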
#define DO_ABA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
 \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA

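/*
 * Neon floating-point pairwise ops operate on one D-register's worth
 * of data from each of Vn and Vm; the trailing oprsz argument is not
 * consulted by these helpers.
 */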
#define DO_NEON_PAIRWISE(NAME, OP) \
    void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
                         void *stat, uint32_t oprsz) \
    { \
        float_status *fpst = stat; \
        float32 *d = vd; \
        float32 *n = vn; \
        float32 *m = vm; \
        float32 r0, r1; \
 \
        /* Read all inputs before writing outputs in case vm == vd */ \
        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
 \
        d[H4(0)] = r0; \
        d[H4(1)] = r1; \
    } \
 \
    void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
                         void *stat, uint32_t oprsz) \
    { \
        float_status *fpst = stat; \
        float16 *d = vd; \
        float16 *n = vn; \
        float16 *m = vm; \
        float16 r0, r1, r2, r3; \
 \
        /* Read all inputs before writing outputs in case vm == vd */ \
        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
 \
        d[H2(0)] = r0; \
        d[H2(1)] = r1; \
        d[H2(2)] = r2; \
        d[H2(3)] = r3; \
    }

DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE

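/*
 * Fixed-point <-> floating-point conversions: simd_data(desc) holds
 * the fixed-point fraction bits (the shift) passed to the VFP helper.
 */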
#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
    { \
        intptr_t i, oprsz = simd_oprsz(desc); \
        int shift = simd_data(desc); \
        TYPE *d = vd, *n = vn; \
        float_status *fpst = stat; \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
            d[i] = FUNC(n[i], shift, fpst); \
        } \
        clear_tail(d, oprsz, simd_maxsz(desc)); \
    }

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED

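/*
 * Conversions and roundings that take an explicit rounding mode:
 * simd_data(desc) carries the softfloat rounding mode, which is
 * installed in the float_status for the loop and then restored.
 */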
#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
    { \
        float_status *fpst = stat; \
        intptr_t i, oprsz = simd_oprsz(desc); \
        uint32_t rmode = simd_data(desc); \
        uint32_t prev_rmode = get_float_rounding_mode(fpst); \
        TYPE *d = vd, *n = vn; \
        set_float_rounding_mode(rmode, fpst); \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
            d[i] = FUNC(n[i], 0, fpst); \
        } \
        set_float_rounding_mode(prev_rmode, fpst); \
        clear_tail(d, oprsz, simd_maxsz(desc)); \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
    { \
        float_status *fpst = stat; \
        intptr_t i, oprsz = simd_oprsz(desc); \
        uint32_t rmode = simd_data(desc); \
        uint32_t prev_rmode = get_float_rounding_mode(fpst); \
        TYPE *d = vd, *n = vn; \
        set_float_rounding_mode(rmode, fpst); \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
            d[i] = FUNC(n[i], fpst); \
        } \
        set_float_rounding_mode(prev_rmode, fpst); \
        clear_tail(d, oprsz, simd_maxsz(desc)); \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert the byte index into a register and an offset
             * within that register, then pick out the byte.  The
             * table registers are consecutive, wrapping from V31
             * back around to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */
void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

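/*
 * XAR: exclusive-or the two operands, then rotate each 64-bit result
 * right by the immediate carried in simd_data(desc).
 */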
void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}

/*
 * Integer matrix-multiply accumulate
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Each 32-bit result is the dot product of an 8-byte row of
         * N with an 8-byte row of M, accumulated into A.
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)

/*
 * BFloat16 Dot Product
 */
static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
{
    /* FPCR is ignored for BFDOT and BFMMLA. */
    float_status bf_status = {
        .tininess_before_rounding = float_tininess_before_rounding,
        .float_rounding_mode = float_round_to_odd_inf,
        .flush_to_zero = true,
        .flush_inputs_to_zero = true,
        .default_nan_mode = true,
    };
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
    t1 = float32_add(t1, t2, &bf_status);
    t1 = float32_add(sum, t1, &bf_status);

    return t1;
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = bfdotadd(a[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

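/*
 * Indexed BFDOT: the multiplier element pair is read once per
 * 128-bit segment of the second operand, at the immediate index,
 * and reused for every element pair within that segment.
 */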
void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        uint32_t m_idx = m[i + H4(index)];

        for (j = i; j < i + eltspersegment; j++) {
            d[j] = bfdotadd(a[j], n[j], m_idx);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Each 32-bit result is the dot product of a 4-element
         * bfloat16 row of N with a 4-element bfloat16 row of M,
         * accumulated into A.
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

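/*
 * BFMLAL: widen each selected bfloat16 input to float32 by shifting it
 * into the high half of a 32-bit word, then fused multiply-add into the
 * float32 accumulator.  simd_data(desc) selects the even or odd
 * bfloat16 elements of the sources.
 */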
void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
