1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/helper-proto.h"
23#include "tcg/tcg-gvec-desc.h"
24#include "fpu/softfloat.h"
25#include "qemu/int128.h"
26#include "vec_internal.h"
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * Lookup table indexed by an 8-bit value.  For each set bit i of the
 * index, byte i of the 64-bit entry is 0xff; all other bytes are 0x00
 * (e.g. index 0x01 -> 0x...00ff, index 0x02 -> 0x...ff00,
 * index 0xff -> all ones).
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
 * Sparse lookup table: only indices whose set bits are a subset of
 * {0, 2, 4, 6} are populated.  Bit 2*k of the index selects 16-bit
 * lane k of the entry (all ones when set).  Every index not listed
 * below is implicitly zero.
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff,
    [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff,
    [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff,
    [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff,
    [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff,
    [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff,
    [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff,
    [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
155
156
/*
 * Signed saturating multiply-accumulate on 8-bit elements.
 *
 * Computes (src1 * src2 [negated if @neg] + (src3 << 7)
 *           + (@round ? 1 << 6 : 0)) >> 7
 * in 32-bit arithmetic, then clamps the result to the int8_t range.
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    int32_t acc = (int32_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += ((int32_t)src3 << 7) + (round << 6);
    acc >>= 7;

    /* Clamp anything that does not fit in 8 signed bits. */
    if (acc > INT8_MAX) {
        return INT8_MAX;
    }
    if (acc < INT8_MIN) {
        return INT8_MIN;
    }
    return acc;
}
177
178void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
179 void *va, uint32_t desc)
180{
181 intptr_t i, opr_sz = simd_oprsz(desc);
182 int8_t *d = vd, *n = vn, *m = vm, *a = va;
183
184 for (i = 0; i < opr_sz; ++i) {
185 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
186 }
187}
188
189void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
190 void *va, uint32_t desc)
191{
192 intptr_t i, opr_sz = simd_oprsz(desc);
193 int8_t *d = vd, *n = vn, *m = vm, *a = va;
194
195 for (i = 0; i < opr_sz; ++i) {
196 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
197 }
198}
199
200void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
201{
202 intptr_t i, opr_sz = simd_oprsz(desc);
203 int8_t *d = vd, *n = vn, *m = vm;
204
205 for (i = 0; i < opr_sz; ++i) {
206 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
207 }
208}
209
210void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
211{
212 intptr_t i, opr_sz = simd_oprsz(desc);
213 int8_t *d = vd, *n = vn, *m = vm;
214
215 for (i = 0; i < opr_sz; ++i) {
216 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
217 }
218}
219
220
/*
 * Signed saturating multiply-accumulate on 16-bit elements.
 *
 * Computes (src1 * src2 [negated if @neg] + (src3 << 15)
 *           + (@round ? 1 << 14 : 0)) >> 15
 * in 32-bit arithmetic and clamps to the int16_t range.  *@sat is set
 * to 1 when clamping occurs and is left untouched otherwise.
 */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int32_t acc = (int32_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += ((int32_t)src3 << 15) + (round << 14);
    acc >>= 15;

    if (acc > INT16_MAX) {
        *sat = 1;
        return INT16_MAX;
    }
    if (acc < INT16_MIN) {
        *sat = 1;
        return INT16_MIN;
    }
    return acc;
}
238
239uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
240 uint32_t src2, uint32_t src3)
241{
242 uint32_t *sat = &env->vfp.qc[0];
243 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
244 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
245 false, true, sat);
246 return deposit32(e1, 16, 16, e2);
247}
248
249void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
250 void *vq, uint32_t desc)
251{
252 uintptr_t opr_sz = simd_oprsz(desc);
253 int16_t *d = vd;
254 int16_t *n = vn;
255 int16_t *m = vm;
256 uintptr_t i;
257
258 for (i = 0; i < opr_sz / 2; ++i) {
259 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
260 }
261 clear_tail(d, opr_sz, simd_maxsz(desc));
262}
263
264uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
265 uint32_t src2, uint32_t src3)
266{
267 uint32_t *sat = &env->vfp.qc[0];
268 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
269 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
270 true, true, sat);
271 return deposit32(e1, 16, 16, e2);
272}
273
274void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
275 void *vq, uint32_t desc)
276{
277 uintptr_t opr_sz = simd_oprsz(desc);
278 int16_t *d = vd;
279 int16_t *n = vn;
280 int16_t *m = vm;
281 uintptr_t i;
282
283 for (i = 0; i < opr_sz / 2; ++i) {
284 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
285 }
286 clear_tail(d, opr_sz, simd_maxsz(desc));
287}
288
289void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
290 void *vq, uint32_t desc)
291{
292 intptr_t i, opr_sz = simd_oprsz(desc);
293 int16_t *d = vd, *n = vn, *m = vm;
294
295 for (i = 0; i < opr_sz / 2; ++i) {
296 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
297 }
298 clear_tail(d, opr_sz, simd_maxsz(desc));
299}
300
301void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
302 void *vq, uint32_t desc)
303{
304 intptr_t i, opr_sz = simd_oprsz(desc);
305 int16_t *d = vd, *n = vn, *m = vm;
306
307 for (i = 0; i < opr_sz / 2; ++i) {
308 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
309 }
310 clear_tail(d, opr_sz, simd_maxsz(desc));
311}
312
313void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
314 void *va, uint32_t desc)
315{
316 intptr_t i, opr_sz = simd_oprsz(desc);
317 int16_t *d = vd, *n = vn, *m = vm, *a = va;
318 uint32_t discard;
319
320 for (i = 0; i < opr_sz / 2; ++i) {
321 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
322 }
323}
324
325void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
326 void *va, uint32_t desc)
327{
328 intptr_t i, opr_sz = simd_oprsz(desc);
329 int16_t *d = vd, *n = vn, *m = vm, *a = va;
330 uint32_t discard;
331
332 for (i = 0; i < opr_sz / 2; ++i) {
333 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
334 }
335}
336
337void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
338{
339 intptr_t i, opr_sz = simd_oprsz(desc);
340 int16_t *d = vd, *n = vn, *m = vm;
341 uint32_t discard;
342
343 for (i = 0; i < opr_sz / 2; ++i) {
344 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
345 }
346}
347
348void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
349{
350 intptr_t i, opr_sz = simd_oprsz(desc);
351 int16_t *d = vd, *n = vn, *m = vm;
352 uint32_t discard;
353
354 for (i = 0; i < opr_sz / 2; ++i) {
355 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
356 }
357}
358
359void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
360{
361 intptr_t i, j, opr_sz = simd_oprsz(desc);
362 int idx = simd_data(desc);
363 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
364 uint32_t discard;
365
366 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
367 int16_t mm = m[i];
368 for (j = 0; j < 16 / 2; ++j) {
369 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
370 }
371 }
372}
373
374void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
375{
376 intptr_t i, j, opr_sz = simd_oprsz(desc);
377 int idx = simd_data(desc);
378 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
379 uint32_t discard;
380
381 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
382 int16_t mm = m[i];
383 for (j = 0; j < 16 / 2; ++j) {
384 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
385 }
386 }
387}
388
389
/*
 * Signed saturating multiply-accumulate on 32-bit elements.
 *
 * Computes (src1 * src2 [negated if @neg] + (src3 << 31)
 *           + (@round ? 1 << 30 : 0)) >> 31
 * in 64-bit arithmetic and clamps to the int32_t range.  *@sat is set
 * to 1 when clamping occurs and is left untouched otherwise.
 */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    int64_t acc = (int64_t)src1 * src2;

    acc = neg ? -acc : acc;
    acc += ((int64_t)src3 << 31) + (round << 30);
    acc >>= 31;

    if (acc > INT32_MAX) {
        *sat = 1;
        return INT32_MAX;
    }
    if (acc < INT32_MIN) {
        *sat = 1;
        return INT32_MIN;
    }
    return acc;
}
407
408uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
409 int32_t src2, int32_t src3)
410{
411 uint32_t *sat = &env->vfp.qc[0];
412 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
413}
414
415void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
416 void *vq, uint32_t desc)
417{
418 uintptr_t opr_sz = simd_oprsz(desc);
419 int32_t *d = vd;
420 int32_t *n = vn;
421 int32_t *m = vm;
422 uintptr_t i;
423
424 for (i = 0; i < opr_sz / 4; ++i) {
425 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
426 }
427 clear_tail(d, opr_sz, simd_maxsz(desc));
428}
429
430uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
431 int32_t src2, int32_t src3)
432{
433 uint32_t *sat = &env->vfp.qc[0];
434 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
435}
436
437void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
438 void *vq, uint32_t desc)
439{
440 uintptr_t opr_sz = simd_oprsz(desc);
441 int32_t *d = vd;
442 int32_t *n = vn;
443 int32_t *m = vm;
444 uintptr_t i;
445
446 for (i = 0; i < opr_sz / 4; ++i) {
447 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
448 }
449 clear_tail(d, opr_sz, simd_maxsz(desc));
450}
451
452void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
453 void *vq, uint32_t desc)
454{
455 intptr_t i, opr_sz = simd_oprsz(desc);
456 int32_t *d = vd, *n = vn, *m = vm;
457
458 for (i = 0; i < opr_sz / 4; ++i) {
459 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
460 }
461 clear_tail(d, opr_sz, simd_maxsz(desc));
462}
463
464void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
465 void *vq, uint32_t desc)
466{
467 intptr_t i, opr_sz = simd_oprsz(desc);
468 int32_t *d = vd, *n = vn, *m = vm;
469
470 for (i = 0; i < opr_sz / 4; ++i) {
471 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
472 }
473 clear_tail(d, opr_sz, simd_maxsz(desc));
474}
475
476void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
477 void *va, uint32_t desc)
478{
479 intptr_t i, opr_sz = simd_oprsz(desc);
480 int32_t *d = vd, *n = vn, *m = vm, *a = va;
481 uint32_t discard;
482
483 for (i = 0; i < opr_sz / 4; ++i) {
484 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
485 }
486}
487
488void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
489 void *va, uint32_t desc)
490{
491 intptr_t i, opr_sz = simd_oprsz(desc);
492 int32_t *d = vd, *n = vn, *m = vm, *a = va;
493 uint32_t discard;
494
495 for (i = 0; i < opr_sz / 4; ++i) {
496 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
497 }
498}
499
500void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
501{
502 intptr_t i, opr_sz = simd_oprsz(desc);
503 int32_t *d = vd, *n = vn, *m = vm;
504 uint32_t discard;
505
506 for (i = 0; i < opr_sz / 4; ++i) {
507 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
508 }
509}
510
511void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
512{
513 intptr_t i, opr_sz = simd_oprsz(desc);
514 int32_t *d = vd, *n = vn, *m = vm;
515 uint32_t discard;
516
517 for (i = 0; i < opr_sz / 4; ++i) {
518 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
519 }
520}
521
522void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
523{
524 intptr_t i, j, opr_sz = simd_oprsz(desc);
525 int idx = simd_data(desc);
526 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
527 uint32_t discard;
528
529 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
530 int32_t mm = m[i];
531 for (j = 0; j < 16 / 4; ++j) {
532 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
533 }
534 }
535}
536
537void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
538{
539 intptr_t i, j, opr_sz = simd_oprsz(desc);
540 int idx = simd_data(desc);
541 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
542 uint32_t discard;
543
544 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
545 int32_t mm = m[i];
546 for (j = 0; j < 16 / 4; ++j) {
547 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
548 }
549 }
550}
551
552
553static int64_t do_sat128_d(Int128 r)
554{
555 int64_t ls = int128_getlo(r);
556 int64_t hs = int128_gethi(r);
557
558 if (unlikely(hs != (ls >> 63))) {
559 return hs < 0 ? INT64_MIN : INT64_MAX;
560 }
561 return ls;
562}
563
564int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
565{
566 uint64_t l, h;
567 Int128 r, t;
568
569
570 muls64(&l, &h, m, n);
571 r = int128_make128(l, h);
572 if (neg) {
573 r = int128_neg(r);
574 }
575 if (a) {
576 t = int128_exts64(a);
577 t = int128_lshift(t, 63);
578 r = int128_add(r, t);
579 }
580 if (round) {
581 t = int128_exts64(1ll << 62);
582 r = int128_add(r, t);
583 }
584 r = int128_rshift(r, 63);
585
586 return do_sat128_d(r);
587}
588
589void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
590 void *va, uint32_t desc)
591{
592 intptr_t i, opr_sz = simd_oprsz(desc);
593 int64_t *d = vd, *n = vn, *m = vm, *a = va;
594
595 for (i = 0; i < opr_sz / 8; ++i) {
596 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
597 }
598}
599
600void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
601 void *va, uint32_t desc)
602{
603 intptr_t i, opr_sz = simd_oprsz(desc);
604 int64_t *d = vd, *n = vn, *m = vm, *a = va;
605
606 for (i = 0; i < opr_sz / 8; ++i) {
607 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
608 }
609}
610
611void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
612{
613 intptr_t i, opr_sz = simd_oprsz(desc);
614 int64_t *d = vd, *n = vn, *m = vm;
615
616 for (i = 0; i < opr_sz / 8; ++i) {
617 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
618 }
619}
620
621void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
622{
623 intptr_t i, opr_sz = simd_oprsz(desc);
624 int64_t *d = vd, *n = vn, *m = vm;
625
626 for (i = 0; i < opr_sz / 8; ++i) {
627 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
628 }
629}
630
631void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
632{
633 intptr_t i, j, opr_sz = simd_oprsz(desc);
634 int idx = simd_data(desc);
635 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
636
637 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
638 int64_t mm = m[i];
639 for (j = 0; j < 16 / 8; ++j) {
640 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
641 }
642 }
643}
644
645void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
646{
647 intptr_t i, j, opr_sz = simd_oprsz(desc);
648 int idx = simd_data(desc);
649 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
650
651 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
652 int64_t mm = m[i];
653 for (j = 0; j < 16 / 8; ++j) {
654 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
655 }
656 }
657}
658
659
660
661
662
663
664
665
666#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
667void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
668{ \
669 intptr_t i, opr_sz = simd_oprsz(desc); \
670 TYPED *d = vd, *a = va; \
671 TYPEN *n = vn; \
672 TYPEM *m = vm; \
673 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
674 d[i] = (a[i] + \
675 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
676 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
677 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
678 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
679 } \
680 clear_tail(d, opr_sz, simd_maxsz(desc)); \
681}
682
683DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
684DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
685DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
686DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
687DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
688
689#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
690void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
691{ \
692 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
693 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
694 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
695 intptr_t index = simd_data(desc); \
696 TYPED *d = vd, *a = va; \
697 TYPEN *n = vn; \
698 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
699 do { \
700 TYPED m0 = m_indexed[i * 4 + 0]; \
701 TYPED m1 = m_indexed[i * 4 + 1]; \
702 TYPED m2 = m_indexed[i * 4 + 2]; \
703 TYPED m3 = m_indexed[i * 4 + 3]; \
704 do { \
705 d[i] = (a[i] + \
706 n[i * 4 + 0] * m0 + \
707 n[i * 4 + 1] * m1 + \
708 n[i * 4 + 2] * m2 + \
709 n[i * 4 + 3] * m3); \
710 } while (++i < segend); \
711 segend = i + 4; \
712 } while (i < opr_sz_n); \
713 clear_tail(d, opr_sz, simd_maxsz(desc)); \
714}
715
716DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
717DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
718DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
719DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
720DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
721DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
722
723void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
724 void *vfpst, uint32_t desc)
725{
726 uintptr_t opr_sz = simd_oprsz(desc);
727 float16 *d = vd;
728 float16 *n = vn;
729 float16 *m = vm;
730 float_status *fpst = vfpst;
731 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
732 uint32_t neg_imag = neg_real ^ 1;
733 uintptr_t i;
734
735
736 neg_real <<= 15;
737 neg_imag <<= 15;
738
739 for (i = 0; i < opr_sz / 2; i += 2) {
740 float16 e0 = n[H2(i)];
741 float16 e1 = m[H2(i + 1)] ^ neg_imag;
742 float16 e2 = n[H2(i + 1)];
743 float16 e3 = m[H2(i)] ^ neg_real;
744
745 d[H2(i)] = float16_add(e0, e1, fpst);
746 d[H2(i + 1)] = float16_add(e2, e3, fpst);
747 }
748 clear_tail(d, opr_sz, simd_maxsz(desc));
749}
750
751void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
752 void *vfpst, uint32_t desc)
753{
754 uintptr_t opr_sz = simd_oprsz(desc);
755 float32 *d = vd;
756 float32 *n = vn;
757 float32 *m = vm;
758 float_status *fpst = vfpst;
759 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
760 uint32_t neg_imag = neg_real ^ 1;
761 uintptr_t i;
762
763
764 neg_real <<= 31;
765 neg_imag <<= 31;
766
767 for (i = 0; i < opr_sz / 4; i += 2) {
768 float32 e0 = n[H4(i)];
769 float32 e1 = m[H4(i + 1)] ^ neg_imag;
770 float32 e2 = n[H4(i + 1)];
771 float32 e3 = m[H4(i)] ^ neg_real;
772
773 d[H4(i)] = float32_add(e0, e1, fpst);
774 d[H4(i + 1)] = float32_add(e2, e3, fpst);
775 }
776 clear_tail(d, opr_sz, simd_maxsz(desc));
777}
778
779void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
780 void *vfpst, uint32_t desc)
781{
782 uintptr_t opr_sz = simd_oprsz(desc);
783 float64 *d = vd;
784 float64 *n = vn;
785 float64 *m = vm;
786 float_status *fpst = vfpst;
787 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
788 uint64_t neg_imag = neg_real ^ 1;
789 uintptr_t i;
790
791
792 neg_real <<= 63;
793 neg_imag <<= 63;
794
795 for (i = 0; i < opr_sz / 8; i += 2) {
796 float64 e0 = n[i];
797 float64 e1 = m[i + 1] ^ neg_imag;
798 float64 e2 = n[i + 1];
799 float64 e3 = m[i] ^ neg_real;
800
801 d[i] = float64_add(e0, e1, fpst);
802 d[i + 1] = float64_add(e2, e3, fpst);
803 }
804 clear_tail(d, opr_sz, simd_maxsz(desc));
805}
806
807void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
808 void *vfpst, uint32_t desc)
809{
810 uintptr_t opr_sz = simd_oprsz(desc);
811 float16 *d = vd, *n = vn, *m = vm, *a = va;
812 float_status *fpst = vfpst;
813 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
814 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
815 uint32_t neg_real = flip ^ neg_imag;
816 uintptr_t i;
817
818
819 neg_real <<= 15;
820 neg_imag <<= 15;
821
822 for (i = 0; i < opr_sz / 2; i += 2) {
823 float16 e2 = n[H2(i + flip)];
824 float16 e1 = m[H2(i + flip)] ^ neg_real;
825 float16 e4 = e2;
826 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
827
828 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
829 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
830 }
831 clear_tail(d, opr_sz, simd_maxsz(desc));
832}
833
834void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
835 void *vfpst, uint32_t desc)
836{
837 uintptr_t opr_sz = simd_oprsz(desc);
838 float16 *d = vd, *n = vn, *m = vm, *a = va;
839 float_status *fpst = vfpst;
840 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
841 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
842 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
843 uint32_t neg_real = flip ^ neg_imag;
844 intptr_t elements = opr_sz / sizeof(float16);
845 intptr_t eltspersegment = 16 / sizeof(float16);
846 intptr_t i, j;
847
848
849 neg_real <<= 15;
850 neg_imag <<= 15;
851
852 for (i = 0; i < elements; i += eltspersegment) {
853 float16 mr = m[H2(i + 2 * index + 0)];
854 float16 mi = m[H2(i + 2 * index + 1)];
855 float16 e1 = neg_real ^ (flip ? mi : mr);
856 float16 e3 = neg_imag ^ (flip ? mr : mi);
857
858 for (j = i; j < i + eltspersegment; j += 2) {
859 float16 e2 = n[H2(j + flip)];
860 float16 e4 = e2;
861
862 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
863 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
864 }
865 }
866 clear_tail(d, opr_sz, simd_maxsz(desc));
867}
868
869void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
870 void *vfpst, uint32_t desc)
871{
872 uintptr_t opr_sz = simd_oprsz(desc);
873 float32 *d = vd, *n = vn, *m = vm, *a = va;
874 float_status *fpst = vfpst;
875 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
876 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
877 uint32_t neg_real = flip ^ neg_imag;
878 uintptr_t i;
879
880
881 neg_real <<= 31;
882 neg_imag <<= 31;
883
884 for (i = 0; i < opr_sz / 4; i += 2) {
885 float32 e2 = n[H4(i + flip)];
886 float32 e1 = m[H4(i + flip)] ^ neg_real;
887 float32 e4 = e2;
888 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
889
890 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
891 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
892 }
893 clear_tail(d, opr_sz, simd_maxsz(desc));
894}
895
896void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
897 void *vfpst, uint32_t desc)
898{
899 uintptr_t opr_sz = simd_oprsz(desc);
900 float32 *d = vd, *n = vn, *m = vm, *a = va;
901 float_status *fpst = vfpst;
902 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
903 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
904 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
905 uint32_t neg_real = flip ^ neg_imag;
906 intptr_t elements = opr_sz / sizeof(float32);
907 intptr_t eltspersegment = 16 / sizeof(float32);
908 intptr_t i, j;
909
910
911 neg_real <<= 31;
912 neg_imag <<= 31;
913
914 for (i = 0; i < elements; i += eltspersegment) {
915 float32 mr = m[H4(i + 2 * index + 0)];
916 float32 mi = m[H4(i + 2 * index + 1)];
917 float32 e1 = neg_real ^ (flip ? mi : mr);
918 float32 e3 = neg_imag ^ (flip ? mr : mi);
919
920 for (j = i; j < i + eltspersegment; j += 2) {
921 float32 e2 = n[H4(j + flip)];
922 float32 e4 = e2;
923
924 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
925 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
926 }
927 }
928 clear_tail(d, opr_sz, simd_maxsz(desc));
929}
930
931void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
932 void *vfpst, uint32_t desc)
933{
934 uintptr_t opr_sz = simd_oprsz(desc);
935 float64 *d = vd, *n = vn, *m = vm, *a = va;
936 float_status *fpst = vfpst;
937 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
938 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
939 uint64_t neg_real = flip ^ neg_imag;
940 uintptr_t i;
941
942
943 neg_real <<= 63;
944 neg_imag <<= 63;
945
946 for (i = 0; i < opr_sz / 8; i += 2) {
947 float64 e2 = n[i + flip];
948 float64 e1 = m[i + flip] ^ neg_real;
949 float64 e4 = e2;
950 float64 e3 = m[i + 1 - flip] ^ neg_imag;
951
952 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
953 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
954 }
955 clear_tail(d, opr_sz, simd_maxsz(desc));
956}
957
958
959
960
961
962
963static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
964{
965 return -float16_eq_quiet(op1, op2, stat);
966}
967
968static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
969{
970 return -float32_eq_quiet(op1, op2, stat);
971}
972
973static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
974{
975 return -float16_le(op2, op1, stat);
976}
977
978static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
979{
980 return -float32_le(op2, op1, stat);
981}
982
983static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
984{
985 return -float16_lt(op2, op1, stat);
986}
987
988static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
989{
990 return -float32_lt(op2, op1, stat);
991}
992
993static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
994{
995 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
996}
997
998static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
999{
1000 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1001}
1002
1003static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1004{
1005 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1006}
1007
1008static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1009{
1010 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1011}
1012
1013static int16_t vfp_tosszh(float16 x, void *fpstp)
1014{
1015 float_status *fpst = fpstp;
1016 if (float16_is_any_nan(x)) {
1017 float_raise(float_flag_invalid, fpst);
1018 return 0;
1019 }
1020 return float16_to_int16_round_to_zero(x, fpst);
1021}
1022
1023static uint16_t vfp_touszh(float16 x, void *fpstp)
1024{
1025 float_status *fpst = fpstp;
1026 if (float16_is_any_nan(x)) {
1027 float_raise(float_flag_invalid, fpst);
1028 return 0;
1029 }
1030 return float16_to_uint16_round_to_zero(x, fpst);
1031}
1032
1033#define DO_2OP(NAME, FUNC, TYPE) \
1034void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
1035{ \
1036 intptr_t i, oprsz = simd_oprsz(desc); \
1037 TYPE *d = vd, *n = vn; \
1038 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1039 d[i] = FUNC(n[i], stat); \
1040 } \
1041 clear_tail(d, oprsz, simd_maxsz(desc)); \
1042}
1043
1044DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1045DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1046DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1047
1048DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1049DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1050DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1051
1052DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1053DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1054
1055DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1056DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1057DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1058DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1059DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1060DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1061DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1062DO_2OP(gvec_touszh, vfp_touszh, float16)
1063
1064#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1065 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1066 { \
1067 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1068 }
1069
1070#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1071 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1072 { \
1073 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1074 }
1075
1076#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1077 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1078 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1079 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1080 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1081
1082DO_2OP_CMP0(cgt, cgt, FWD)
1083DO_2OP_CMP0(cge, cge, FWD)
1084DO_2OP_CMP0(ceq, ceq, FWD)
1085DO_2OP_CMP0(clt, cgt, REV)
1086DO_2OP_CMP0(cle, cge, REV)
1087
1088#undef DO_2OP
1089#undef DO_2OP_CMP0
1090
1091
1092
1093
1094static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1095{
1096 float16 result = float16_mul(op1, op1, stat);
1097 if (!float16_is_any_nan(result)) {
1098 result = float16_set_sign(result, op2 & 1);
1099 }
1100 return result;
1101}
1102
1103static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1104{
1105 float32 result = float32_mul(op1, op1, stat);
1106 if (!float32_is_any_nan(result)) {
1107 result = float32_set_sign(result, op2 & 1);
1108 }
1109 return result;
1110}
1111
1112static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1113{
1114 float64 result = float64_mul(op1, op1, stat);
1115 if (!float64_is_any_nan(result)) {
1116 result = float64_set_sign(result, op2 & 1);
1117 }
1118 return result;
1119}
1120
1121static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1122{
1123 return float16_abs(float16_sub(op1, op2, stat));
1124}
1125
1126static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1127{
1128 return float32_abs(float32_sub(op1, op2, stat));
1129}
1130
1131
1132
1133
1134
1135static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1136{
1137 op1 = float16_squash_input_denormal(op1, stat);
1138 op2 = float16_squash_input_denormal(op2, stat);
1139
1140 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1141 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1142 return float16_two;
1143 }
1144 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1145}
1146
1147static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1148{
1149 op1 = float32_squash_input_denormal(op1, stat);
1150 op2 = float32_squash_input_denormal(op2, stat);
1151
1152 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154 return float32_two;
1155 }
1156 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1157}
1158
1159
1160static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1161{
1162 op1 = float16_squash_input_denormal(op1, stat);
1163 op2 = float16_squash_input_denormal(op2, stat);
1164
1165 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1166 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1167 return float16_one_point_five;
1168 }
1169 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1170 return float16_div(op1, float16_two, stat);
1171}
1172
1173static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1174{
1175 op1 = float32_squash_input_denormal(op1, stat);
1176 op2 = float32_squash_input_denormal(op2, stat);
1177
1178 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1179 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1180 return float32_one_point_five;
1181 }
1182 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1183 return float32_div(op1, float32_two, stat);
1184}
1185
1186#define DO_3OP(NAME, FUNC, TYPE) \
1187void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1188{ \
1189 intptr_t i, oprsz = simd_oprsz(desc); \
1190 TYPE *d = vd, *n = vn, *m = vm; \
1191 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1192 d[i] = FUNC(n[i], m[i], stat); \
1193 } \
1194 clear_tail(d, oprsz, simd_maxsz(desc)); \
1195}
1196
1197DO_3OP(gvec_fadd_h, float16_add, float16)
1198DO_3OP(gvec_fadd_s, float32_add, float32)
1199DO_3OP(gvec_fadd_d, float64_add, float64)
1200
1201DO_3OP(gvec_fsub_h, float16_sub, float16)
1202DO_3OP(gvec_fsub_s, float32_sub, float32)
1203DO_3OP(gvec_fsub_d, float64_sub, float64)
1204
1205DO_3OP(gvec_fmul_h, float16_mul, float16)
1206DO_3OP(gvec_fmul_s, float32_mul, float32)
1207DO_3OP(gvec_fmul_d, float64_mul, float64)
1208
1209DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1210DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1211DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1212
1213DO_3OP(gvec_fabd_h, float16_abd, float16)
1214DO_3OP(gvec_fabd_s, float32_abd, float32)
1215
1216DO_3OP(gvec_fceq_h, float16_ceq, float16)
1217DO_3OP(gvec_fceq_s, float32_ceq, float32)
1218
1219DO_3OP(gvec_fcge_h, float16_cge, float16)
1220DO_3OP(gvec_fcge_s, float32_cge, float32)
1221
1222DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1223DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1224
1225DO_3OP(gvec_facge_h, float16_acge, float16)
1226DO_3OP(gvec_facge_s, float32_acge, float32)
1227
1228DO_3OP(gvec_facgt_h, float16_acgt, float16)
1229DO_3OP(gvec_facgt_s, float32_acgt, float32)
1230
1231DO_3OP(gvec_fmax_h, float16_max, float16)
1232DO_3OP(gvec_fmax_s, float32_max, float32)
1233
1234DO_3OP(gvec_fmin_h, float16_min, float16)
1235DO_3OP(gvec_fmin_s, float32_min, float32)
1236
1237DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1238DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1239
1240DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1241DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1242
1243DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1244DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1245
1246DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1247DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1248
1249#ifdef TARGET_AARCH64
1250
1251DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1252DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1253DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1254
1255DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1256DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1257DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1258
1259#endif
1260#undef DO_3OP
1261
1262
1263static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1264 float_status *stat)
1265{
1266 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1267}
1268
1269static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1270 float_status *stat)
1271{
1272 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1273}
1274
1275static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1276 float_status *stat)
1277{
1278 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1279}
1280
1281static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1282 float_status *stat)
1283{
1284 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1285}
1286
1287
1288static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1289 float_status *stat)
1290{
1291 return float16_muladd(op1, op2, dest, 0, stat);
1292}
1293
1294static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1295 float_status *stat)
1296{
1297 return float32_muladd(op1, op2, dest, 0, stat);
1298}
1299
1300static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1301 float_status *stat)
1302{
1303 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1304}
1305
1306static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1307 float_status *stat)
1308{
1309 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1310}
1311
1312#define DO_MULADD(NAME, FUNC, TYPE) \
1313void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1314{ \
1315 intptr_t i, oprsz = simd_oprsz(desc); \
1316 TYPE *d = vd, *n = vn, *m = vm; \
1317 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1318 d[i] = FUNC(d[i], n[i], m[i], stat); \
1319 } \
1320 clear_tail(d, oprsz, simd_maxsz(desc)); \
1321}
1322
1323DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1324DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1325
1326DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1327DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1328
1329DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1330DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1331
1332DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1333DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1334
1335
1336
1337
1338
1339#define DO_MUL_IDX(NAME, TYPE, H) \
1340void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1341{ \
1342 intptr_t i, j, oprsz = simd_oprsz(desc); \
1343 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1344 intptr_t idx = simd_data(desc); \
1345 TYPE *d = vd, *n = vn, *m = vm; \
1346 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1347 TYPE mm = m[H(i + idx)]; \
1348 for (j = 0; j < segment; j++) { \
1349 d[i + j] = n[i + j] * mm; \
1350 } \
1351 } \
1352 clear_tail(d, oprsz, simd_maxsz(desc)); \
1353}
1354
1355DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1356DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1357DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1358
1359#undef DO_MUL_IDX
1360
1361#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1363{ \
1364 intptr_t i, j, oprsz = simd_oprsz(desc); \
1365 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1366 intptr_t idx = simd_data(desc); \
1367 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1368 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1369 TYPE mm = m[H(i + idx)]; \
1370 for (j = 0; j < segment; j++) { \
1371 d[i + j] = a[i + j] OP n[i + j] * mm; \
1372 } \
1373 } \
1374 clear_tail(d, oprsz, simd_maxsz(desc)); \
1375}
1376
1377DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1378DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1379DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1380
1381DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1382DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1383DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1384
1385#undef DO_MLA_IDX
1386
1387#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1388void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1389{ \
1390 intptr_t i, j, oprsz = simd_oprsz(desc); \
1391 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1392 intptr_t idx = simd_data(desc); \
1393 TYPE *d = vd, *n = vn, *m = vm; \
1394 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1395 TYPE mm = m[H(i + idx)]; \
1396 for (j = 0; j < segment; j++) { \
1397 d[i + j] = TYPE##_##ADD(d[i + j], \
1398 TYPE##_mul(n[i + j], mm, stat), stat); \
1399 } \
1400 } \
1401 clear_tail(d, oprsz, simd_maxsz(desc)); \
1402}
1403
1404#define float16_nop(N, M, S) (M)
1405#define float32_nop(N, M, S) (M)
1406#define float64_nop(N, M, S) (M)
1407
1408DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1409DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1410DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1411
1412
1413
1414
1415
1416DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1417DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1418DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1419DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1420
1421#undef float16_nop
1422#undef float32_nop
1423#undef float64_nop
1424#undef DO_FMUL_IDX
1425
1426#define DO_FMLA_IDX(NAME, TYPE, H) \
1427void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1428 void *stat, uint32_t desc) \
1429{ \
1430 intptr_t i, j, oprsz = simd_oprsz(desc); \
1431 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1432 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1433 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1434 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1435 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1436 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1437 TYPE mm = m[H(i + idx)]; \
1438 for (j = 0; j < segment; j++) { \
1439 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1440 mm, a[i + j], 0, stat); \
1441 } \
1442 } \
1443 clear_tail(d, oprsz, simd_maxsz(desc)); \
1444}
1445
1446DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1447DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1448DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1449
1450#undef DO_FMLA_IDX
1451
1452#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1453void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1454{ \
1455 intptr_t i, oprsz = simd_oprsz(desc); \
1456 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1457 bool q = false; \
1458 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1459 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1460 if (dd < MIN) { \
1461 dd = MIN; \
1462 q = true; \
1463 } else if (dd > MAX) { \
1464 dd = MAX; \
1465 q = true; \
1466 } \
1467 d[i] = dd; \
1468 } \
1469 if (q) { \
1470 uint32_t *qc = vq; \
1471 qc[0] = 1; \
1472 } \
1473 clear_tail(d, oprsz, simd_maxsz(desc)); \
1474}
1475
1476DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1477DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1478DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1479
1480DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1481DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1482DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1483
1484DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1485DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1486DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1487
1488DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1489DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1490DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1491
1492#undef DO_SAT
1493
1494void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1495 void *vm, uint32_t desc)
1496{
1497 intptr_t i, oprsz = simd_oprsz(desc);
1498 uint64_t *d = vd, *n = vn, *m = vm;
1499 bool q = false;
1500
1501 for (i = 0; i < oprsz / 8; i++) {
1502 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1503 if (dd < nn) {
1504 dd = UINT64_MAX;
1505 q = true;
1506 }
1507 d[i] = dd;
1508 }
1509 if (q) {
1510 uint32_t *qc = vq;
1511 qc[0] = 1;
1512 }
1513 clear_tail(d, oprsz, simd_maxsz(desc));
1514}
1515
1516void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1517 void *vm, uint32_t desc)
1518{
1519 intptr_t i, oprsz = simd_oprsz(desc);
1520 uint64_t *d = vd, *n = vn, *m = vm;
1521 bool q = false;
1522
1523 for (i = 0; i < oprsz / 8; i++) {
1524 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1525 if (nn < mm) {
1526 dd = 0;
1527 q = true;
1528 }
1529 d[i] = dd;
1530 }
1531 if (q) {
1532 uint32_t *qc = vq;
1533 qc[0] = 1;
1534 }
1535 clear_tail(d, oprsz, simd_maxsz(desc));
1536}
1537
1538void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1539 void *vm, uint32_t desc)
1540{
1541 intptr_t i, oprsz = simd_oprsz(desc);
1542 int64_t *d = vd, *n = vn, *m = vm;
1543 bool q = false;
1544
1545 for (i = 0; i < oprsz / 8; i++) {
1546 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1547 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1548 dd = (nn >> 63) ^ ~INT64_MIN;
1549 q = true;
1550 }
1551 d[i] = dd;
1552 }
1553 if (q) {
1554 uint32_t *qc = vq;
1555 qc[0] = 1;
1556 }
1557 clear_tail(d, oprsz, simd_maxsz(desc));
1558}
1559
1560void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1561 void *vm, uint32_t desc)
1562{
1563 intptr_t i, oprsz = simd_oprsz(desc);
1564 int64_t *d = vd, *n = vn, *m = vm;
1565 bool q = false;
1566
1567 for (i = 0; i < oprsz / 8; i++) {
1568 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1569 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1570 dd = (nn >> 63) ^ ~INT64_MIN;
1571 q = true;
1572 }
1573 d[i] = dd;
1574 }
1575 if (q) {
1576 uint32_t *qc = vq;
1577 qc[0] = 1;
1578 }
1579 clear_tail(d, oprsz, simd_maxsz(desc));
1580}
1581
1582
1583#define DO_SRA(NAME, TYPE) \
1584void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1585{ \
1586 intptr_t i, oprsz = simd_oprsz(desc); \
1587 int shift = simd_data(desc); \
1588 TYPE *d = vd, *n = vn; \
1589 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1590 d[i] += n[i] >> shift; \
1591 } \
1592 clear_tail(d, oprsz, simd_maxsz(desc)); \
1593}
1594
1595DO_SRA(gvec_ssra_b, int8_t)
1596DO_SRA(gvec_ssra_h, int16_t)
1597DO_SRA(gvec_ssra_s, int32_t)
1598DO_SRA(gvec_ssra_d, int64_t)
1599
1600DO_SRA(gvec_usra_b, uint8_t)
1601DO_SRA(gvec_usra_h, uint16_t)
1602DO_SRA(gvec_usra_s, uint32_t)
1603DO_SRA(gvec_usra_d, uint64_t)
1604
1605#undef DO_SRA
1606
1607#define DO_RSHR(NAME, TYPE) \
1608void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1609{ \
1610 intptr_t i, oprsz = simd_oprsz(desc); \
1611 int shift = simd_data(desc); \
1612 TYPE *d = vd, *n = vn; \
1613 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1614 TYPE tmp = n[i] >> (shift - 1); \
1615 d[i] = (tmp >> 1) + (tmp & 1); \
1616 } \
1617 clear_tail(d, oprsz, simd_maxsz(desc)); \
1618}
1619
1620DO_RSHR(gvec_srshr_b, int8_t)
1621DO_RSHR(gvec_srshr_h, int16_t)
1622DO_RSHR(gvec_srshr_s, int32_t)
1623DO_RSHR(gvec_srshr_d, int64_t)
1624
1625DO_RSHR(gvec_urshr_b, uint8_t)
1626DO_RSHR(gvec_urshr_h, uint16_t)
1627DO_RSHR(gvec_urshr_s, uint32_t)
1628DO_RSHR(gvec_urshr_d, uint64_t)
1629
1630#undef DO_RSHR
1631
1632#define DO_RSRA(NAME, TYPE) \
1633void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1634{ \
1635 intptr_t i, oprsz = simd_oprsz(desc); \
1636 int shift = simd_data(desc); \
1637 TYPE *d = vd, *n = vn; \
1638 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1639 TYPE tmp = n[i] >> (shift - 1); \
1640 d[i] += (tmp >> 1) + (tmp & 1); \
1641 } \
1642 clear_tail(d, oprsz, simd_maxsz(desc)); \
1643}
1644
1645DO_RSRA(gvec_srsra_b, int8_t)
1646DO_RSRA(gvec_srsra_h, int16_t)
1647DO_RSRA(gvec_srsra_s, int32_t)
1648DO_RSRA(gvec_srsra_d, int64_t)
1649
1650DO_RSRA(gvec_ursra_b, uint8_t)
1651DO_RSRA(gvec_ursra_h, uint16_t)
1652DO_RSRA(gvec_ursra_s, uint32_t)
1653DO_RSRA(gvec_ursra_d, uint64_t)
1654
1655#undef DO_RSRA
1656
1657#define DO_SRI(NAME, TYPE) \
1658void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1659{ \
1660 intptr_t i, oprsz = simd_oprsz(desc); \
1661 int shift = simd_data(desc); \
1662 TYPE *d = vd, *n = vn; \
1663 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1664 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1665 } \
1666 clear_tail(d, oprsz, simd_maxsz(desc)); \
1667}
1668
1669DO_SRI(gvec_sri_b, uint8_t)
1670DO_SRI(gvec_sri_h, uint16_t)
1671DO_SRI(gvec_sri_s, uint32_t)
1672DO_SRI(gvec_sri_d, uint64_t)
1673
1674#undef DO_SRI
1675
1676#define DO_SLI(NAME, TYPE) \
1677void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1678{ \
1679 intptr_t i, oprsz = simd_oprsz(desc); \
1680 int shift = simd_data(desc); \
1681 TYPE *d = vd, *n = vn; \
1682 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1683 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1684 } \
1685 clear_tail(d, oprsz, simd_maxsz(desc)); \
1686}
1687
1688DO_SLI(gvec_sli_b, uint8_t)
1689DO_SLI(gvec_sli_h, uint16_t)
1690DO_SLI(gvec_sli_s, uint32_t)
1691DO_SLI(gvec_sli_d, uint64_t)
1692
1693#undef DO_SLI
1694
1695
1696
1697
1698
1699
1700static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1701{
1702 const int f16_bias = 15;
1703 const int f32_bias = 127;
1704 uint32_t sign = extract32(f16, 15, 1);
1705 uint32_t exp = extract32(f16, 10, 5);
1706 uint32_t frac = extract32(f16, 0, 10);
1707
1708 if (exp == 0x1f) {
1709
1710 exp = 0xff;
1711 } else if (exp == 0) {
1712
1713 if (frac != 0) {
1714 if (fz16) {
1715 frac = 0;
1716 } else {
1717
1718
1719
1720
1721
1722
1723
1724
1725 int shift = clz32(frac) - 21;
1726 frac = (frac << shift) & 0x3ff;
1727 exp = f32_bias - f16_bias - shift + 1;
1728 }
1729 }
1730 } else {
1731
1732 exp += f32_bias - f16_bias;
1733 }
1734 sign <<= 31;
1735 exp <<= 23;
1736 frac <<= 23 - 10;
1737
1738 return sign | exp | frac;
1739}
1740
/*
 * Fetch the 64-bit word holding the half-precision inputs for a
 * widening operation.  With is_q and is_2 both set, the second word
 * ptr[1] is returned.  With is_2 set and is_q clear, ptr[0] is shifted
 * right by 32 so its upper half lands in the low bits.  Otherwise
 * ptr[0] is returned unshifted.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int word = is_q & is_2;
    int shift = (is_2 & ~is_q) << 5;

    return ptr[word] >> shift;
}
1751
1752
1753
1754
1755
1756
1757static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1758 uint32_t desc, bool fz16)
1759{
1760 intptr_t i, oprsz = simd_oprsz(desc);
1761 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1762 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1763 int is_q = oprsz == 16;
1764 uint64_t n_4, m_4;
1765
1766
1767 n_4 = load4_f16(vn, is_q, is_2);
1768 m_4 = load4_f16(vm, is_q, is_2);
1769
1770
1771 if (is_s) {
1772 n_4 ^= 0x8000800080008000ull;
1773 }
1774
1775 for (i = 0; i < oprsz / 4; i++) {
1776 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1777 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1778 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1779 }
1780 clear_tail(d, oprsz, simd_maxsz(desc));
1781}
1782
1783void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1784 void *venv, uint32_t desc)
1785{
1786 CPUARMState *env = venv;
1787 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1788 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1789}
1790
1791void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1792 void *venv, uint32_t desc)
1793{
1794 CPUARMState *env = venv;
1795 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1796 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1797}
1798
1799void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1800 void *venv, uint32_t desc)
1801{
1802 intptr_t i, oprsz = simd_oprsz(desc);
1803 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1804 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1805 CPUARMState *env = venv;
1806 float_status *status = &env->vfp.fp_status;
1807 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1808
1809 for (i = 0; i < oprsz; i += sizeof(float32)) {
1810 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1811 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1812 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1813 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1814 float32 aa = *(float32 *)(va + H1_4(i));
1815
1816 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1817 }
1818}
1819
1820static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1821 uint32_t desc, bool fz16)
1822{
1823 intptr_t i, oprsz = simd_oprsz(desc);
1824 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1825 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1826 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1827 int is_q = oprsz == 16;
1828 uint64_t n_4;
1829 float32 m_1;
1830
1831
1832 n_4 = load4_f16(vn, is_q, is_2);
1833
1834
1835 if (is_s) {
1836 n_4 ^= 0x8000800080008000ull;
1837 }
1838
1839 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1840
1841 for (i = 0; i < oprsz / 4; i++) {
1842 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1843 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1844 }
1845 clear_tail(d, oprsz, simd_maxsz(desc));
1846}
1847
1848void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1849 void *venv, uint32_t desc)
1850{
1851 CPUARMState *env = venv;
1852 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1853 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1854}
1855
1856void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1857 void *venv, uint32_t desc)
1858{
1859 CPUARMState *env = venv;
1860 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1861 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1862}
1863
1864void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1865 void *venv, uint32_t desc)
1866{
1867 intptr_t i, j, oprsz = simd_oprsz(desc);
1868 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1869 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1870 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1871 CPUARMState *env = venv;
1872 float_status *status = &env->vfp.fp_status;
1873 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1874
1875 for (i = 0; i < oprsz; i += 16) {
1876 float16 mm_16 = *(float16 *)(vm + i + idx);
1877 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1878
1879 for (j = 0; j < 16; j += sizeof(float32)) {
1880 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1881 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1882 float32 aa = *(float32 *)(va + H1_4(i + j));
1883
1884 *(float32 *)(vd + H1_4(i + j)) =
1885 float32_muladd(nn, mm, aa, 0, status);
1886 }
1887 }
1888}
1889
1890void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1891{
1892 intptr_t i, opr_sz = simd_oprsz(desc);
1893 int8_t *d = vd, *n = vn, *m = vm;
1894
1895 for (i = 0; i < opr_sz; ++i) {
1896 int8_t mm = m[i];
1897 int8_t nn = n[i];
1898 int8_t res = 0;
1899 if (mm >= 0) {
1900 if (mm < 8) {
1901 res = nn << mm;
1902 }
1903 } else {
1904 res = nn >> (mm > -8 ? -mm : 7);
1905 }
1906 d[i] = res;
1907 }
1908 clear_tail(d, opr_sz, simd_maxsz(desc));
1909}
1910
1911void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1912{
1913 intptr_t i, opr_sz = simd_oprsz(desc);
1914 int16_t *d = vd, *n = vn, *m = vm;
1915
1916 for (i = 0; i < opr_sz / 2; ++i) {
1917 int8_t mm = m[i];
1918 int16_t nn = n[i];
1919 int16_t res = 0;
1920 if (mm >= 0) {
1921 if (mm < 16) {
1922 res = nn << mm;
1923 }
1924 } else {
1925 res = nn >> (mm > -16 ? -mm : 15);
1926 }
1927 d[i] = res;
1928 }
1929 clear_tail(d, opr_sz, simd_maxsz(desc));
1930}
1931
1932void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1933{
1934 intptr_t i, opr_sz = simd_oprsz(desc);
1935 uint8_t *d = vd, *n = vn, *m = vm;
1936
1937 for (i = 0; i < opr_sz; ++i) {
1938 int8_t mm = m[i];
1939 uint8_t nn = n[i];
1940 uint8_t res = 0;
1941 if (mm >= 0) {
1942 if (mm < 8) {
1943 res = nn << mm;
1944 }
1945 } else {
1946 if (mm > -8) {
1947 res = nn >> -mm;
1948 }
1949 }
1950 d[i] = res;
1951 }
1952 clear_tail(d, opr_sz, simd_maxsz(desc));
1953}
1954
1955void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1956{
1957 intptr_t i, opr_sz = simd_oprsz(desc);
1958 uint16_t *d = vd, *n = vn, *m = vm;
1959
1960 for (i = 0; i < opr_sz / 2; ++i) {
1961 int8_t mm = m[i];
1962 uint16_t nn = n[i];
1963 uint16_t res = 0;
1964 if (mm >= 0) {
1965 if (mm < 16) {
1966 res = nn << mm;
1967 }
1968 } else {
1969 if (mm > -16) {
1970 res = nn >> -mm;
1971 }
1972 }
1973 d[i] = res;
1974 }
1975 clear_tail(d, opr_sz, simd_maxsz(desc));
1976}
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1988{
1989 intptr_t i, j, opr_sz = simd_oprsz(desc);
1990 uint64_t *d = vd, *n = vn, *m = vm;
1991
1992 for (i = 0; i < opr_sz / 8; ++i) {
1993 uint64_t nn = n[i];
1994 uint64_t mm = m[i];
1995 uint64_t rr = 0;
1996
1997 for (j = 0; j < 8; ++j) {
1998 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1999 rr ^= mm & mask;
2000 mm = (mm << 1) & 0xfefefefefefefefeull;
2001 nn >>= 1;
2002 }
2003 d[i] = rr;
2004 }
2005 clear_tail(d, opr_sz, simd_maxsz(desc));
2006}
2007
2008
2009
2010
2011
2012
2013void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2014{
2015 intptr_t i, j, opr_sz = simd_oprsz(desc);
2016 intptr_t hi = simd_data(desc);
2017 uint64_t *d = vd, *n = vn, *m = vm;
2018
2019 for (i = 0; i < opr_sz / 8; i += 2) {
2020 uint64_t nn = n[i + hi];
2021 uint64_t mm = m[i + hi];
2022 uint64_t rhi = 0;
2023 uint64_t rlo = 0;
2024
2025
2026 if (nn & 1) {
2027 rlo = mm;
2028 }
2029
2030 for (j = 1; j < 64; ++j) {
2031 uint64_t mask = -((nn >> j) & 1);
2032 rlo ^= (mm << j) & mask;
2033 rhi ^= (mm >> (64 - j)) & mask;
2034 }
2035 d[i] = rlo;
2036 d[i + 1] = rhi;
2037 }
2038 clear_tail(d, opr_sz, simd_maxsz(desc));
2039}
2040
2041
2042
2043
2044
2045
2046
2047
2048
/*
 * Spread the four low bytes of x into the low byte of each 16-bit lane
 * of the result, zero-extending each byte.  Bits 32 and above of x are
 * ignored.
 */
static uint64_t expand_byte_to_half(uint64_t x)
{
    uint64_t spread = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint64_t byte = (x >> (8 * lane)) & 0xff;

        spread |= byte << (16 * lane);
    }
    return spread;
}
2056
/*
 * Polynomial (carry-less) multiply on two 32-bit lanes in parallel.
 * Bits 0..15 of each 32-bit lane of op1 are examined in turn; for each
 * set bit the correspondingly shifted op2 lane is XORed into the
 * result lane.
 */
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int remaining = 16;

    while (remaining-- > 0) {
        /* Spread bit 0 of each 32-bit lane of op1 across the whole lane. */
        uint64_t lane_mask = (op1 & 0x0000000100000001ull) * 0xffffffff;

        acc ^= op2 & lane_mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return acc;
}
2069
/*
 * Polynomial (carry-less) multiply on four 16-bit lanes in parallel.
 * Bits 0..7 of each 16-bit lane of op1 are examined in turn; for each
 * set bit the correspondingly shifted op2 lane is XORed into the
 * result lane.
 */
uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int remaining = 8;

    while (remaining-- > 0) {
        /* Spread bit 0 of each 16-bit lane of op1 across the whole lane. */
        uint64_t lane_mask = (op1 & 0x0001000100010001ull) * 0xffff;

        acc ^= op2 & lane_mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return acc;
}
2082
2083void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2084{
2085 int hi = simd_data(desc);
2086 uint64_t *d = vd, *n = vn, *m = vm;
2087 uint64_t nn = n[hi], mm = m[hi];
2088
2089 d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2090 nn >>= 32;
2091 mm >>= 32;
2092 d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2093
2094 clear_tail(d, 16, simd_maxsz(desc));
2095}
2096
2097#ifdef TARGET_AARCH64
2098void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2099{
2100 int shift = simd_data(desc) * 8;
2101 intptr_t i, opr_sz = simd_oprsz(desc);
2102 uint64_t *d = vd, *n = vn, *m = vm;
2103
2104 for (i = 0; i < opr_sz / 8; ++i) {
2105 uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2106 uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2107
2108 d[i] = pmull_h(nn, mm);
2109 }
2110}
2111
/*
 * Polynomial (carry-less) multiply of the low 32 bits of op1 by op2:
 * for each set bit among bits 0..31 of op1, op2 shifted left by that
 * bit position is XORed into the 64-bit result.
 */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;
    int bit;

    for (bit = 0; bit < 32; ++bit) {
        if ((op1 >> bit) & 1) {
            acc ^= op2 << bit;
        }
    }
    return acc;
}
2123
2124void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2125{
2126 intptr_t sel = H4(simd_data(desc));
2127 intptr_t i, opr_sz = simd_oprsz(desc);
2128 uint32_t *n = vn, *m = vm;
2129 uint64_t *d = vd;
2130
2131 for (i = 0; i < opr_sz / 8; ++i) {
2132 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2133 }
2134}
2135#endif
2136
2137#define DO_CMP0(NAME, TYPE, OP) \
2138void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2139{ \
2140 intptr_t i, opr_sz = simd_oprsz(desc); \
2141 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2142 TYPE nn = *(TYPE *)(vn + i); \
2143 *(TYPE *)(vd + i) = -(nn OP 0); \
2144 } \
2145 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2146}
2147
2148DO_CMP0(gvec_ceq0_b, int8_t, ==)
2149DO_CMP0(gvec_clt0_b, int8_t, <)
2150DO_CMP0(gvec_cle0_b, int8_t, <=)
2151DO_CMP0(gvec_cgt0_b, int8_t, >)
2152DO_CMP0(gvec_cge0_b, int8_t, >=)
2153
2154DO_CMP0(gvec_ceq0_h, int16_t, ==)
2155DO_CMP0(gvec_clt0_h, int16_t, <)
2156DO_CMP0(gvec_cle0_h, int16_t, <=)
2157DO_CMP0(gvec_cgt0_h, int16_t, >)
2158DO_CMP0(gvec_cge0_h, int16_t, >=)
2159
2160#undef DO_CMP0
2161
2162#define DO_ABD(NAME, TYPE) \
2163void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2164{ \
2165 intptr_t i, opr_sz = simd_oprsz(desc); \
2166 TYPE *d = vd, *n = vn, *m = vm; \
2167 \
2168 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2169 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2170 } \
2171 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2172}
2173
2174DO_ABD(gvec_sabd_b, int8_t)
2175DO_ABD(gvec_sabd_h, int16_t)
2176DO_ABD(gvec_sabd_s, int32_t)
2177DO_ABD(gvec_sabd_d, int64_t)
2178
2179DO_ABD(gvec_uabd_b, uint8_t)
2180DO_ABD(gvec_uabd_h, uint16_t)
2181DO_ABD(gvec_uabd_s, uint32_t)
2182DO_ABD(gvec_uabd_d, uint64_t)
2183
2184#undef DO_ABD
2185
2186#define DO_ABA(NAME, TYPE) \
2187void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2188{ \
2189 intptr_t i, opr_sz = simd_oprsz(desc); \
2190 TYPE *d = vd, *n = vn, *m = vm; \
2191 \
2192 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2193 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2194 } \
2195 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2196}
2197
2198DO_ABA(gvec_saba_b, int8_t)
2199DO_ABA(gvec_saba_h, int16_t)
2200DO_ABA(gvec_saba_s, int32_t)
2201DO_ABA(gvec_saba_d, int64_t)
2202
2203DO_ABA(gvec_uaba_b, uint8_t)
2204DO_ABA(gvec_uaba_h, uint16_t)
2205DO_ABA(gvec_uaba_s, uint32_t)
2206DO_ABA(gvec_uaba_d, uint64_t)
2207
2208#undef DO_ABA
2209
2210#define DO_NEON_PAIRWISE(NAME, OP) \
2211 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
2212 void *stat, uint32_t oprsz) \
2213 { \
2214 float_status *fpst = stat; \
2215 float32 *d = vd; \
2216 float32 *n = vn; \
2217 float32 *m = vm; \
2218 float32 r0, r1; \
2219 \
2220 \
2221 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
2222 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
2223 \
2224 d[H4(0)] = r0; \
2225 d[H4(1)] = r1; \
2226 } \
2227 \
2228 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
2229 void *stat, uint32_t oprsz) \
2230 { \
2231 float_status *fpst = stat; \
2232 float16 *d = vd; \
2233 float16 *n = vn; \
2234 float16 *m = vm; \
2235 float16 r0, r1, r2, r3; \
2236 \
2237 \
2238 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
2239 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
2240 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
2241 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
2242 \
2243 d[H2(0)] = r0; \
2244 d[H2(1)] = r1; \
2245 d[H2(2)] = r2; \
2246 d[H2(3)] = r3; \
2247 }
2248
2249DO_NEON_PAIRWISE(neon_padd, add)
2250DO_NEON_PAIRWISE(neon_pmax, max)
2251DO_NEON_PAIRWISE(neon_pmin, min)
2252
2253#undef DO_NEON_PAIRWISE
2254
2255#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2256 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2257 { \
2258 intptr_t i, oprsz = simd_oprsz(desc); \
2259 int shift = simd_data(desc); \
2260 TYPE *d = vd, *n = vn; \
2261 float_status *fpst = stat; \
2262 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2263 d[i] = FUNC(n[i], shift, fpst); \
2264 } \
2265 clear_tail(d, oprsz, simd_maxsz(desc)); \
2266 }
2267
2268DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2269DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2270DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2271DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2272DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2273DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2274DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2275DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2276
2277#undef DO_VCVT_FIXED
2278
2279#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2280 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2281 { \
2282 float_status *fpst = stat; \
2283 intptr_t i, oprsz = simd_oprsz(desc); \
2284 uint32_t rmode = simd_data(desc); \
2285 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2286 TYPE *d = vd, *n = vn; \
2287 set_float_rounding_mode(rmode, fpst); \
2288 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2289 d[i] = FUNC(n[i], 0, fpst); \
2290 } \
2291 set_float_rounding_mode(prev_rmode, fpst); \
2292 clear_tail(d, oprsz, simd_maxsz(desc)); \
2293 }
2294
2295DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2296DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2297DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2298DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2299
2300#undef DO_VCVT_RMODE
2301
2302#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2303 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2304 { \
2305 float_status *fpst = stat; \
2306 intptr_t i, oprsz = simd_oprsz(desc); \
2307 uint32_t rmode = simd_data(desc); \
2308 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2309 TYPE *d = vd, *n = vn; \
2310 set_float_rounding_mode(rmode, fpst); \
2311 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2312 d[i] = FUNC(n[i], fpst); \
2313 } \
2314 set_float_rounding_mode(prev_rmode, fpst); \
2315 clear_tail(d, oprsz, simd_maxsz(desc)); \
2316 }
2317
2318DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2319DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2320
2321#undef DO_VRINT_RMODE
2322
2323#ifdef TARGET_AARCH64
2324void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2325{
2326 const uint8_t *indices = vm;
2327 CPUARMState *env = venv;
2328 size_t oprsz = simd_oprsz(desc);
2329 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2330 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2331 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2332 union {
2333 uint8_t b[16];
2334 uint64_t d[2];
2335 } result;
2336
2337
2338
2339
2340
2341
2342
2343
2344 if (is_tbx) {
2345 memcpy(&result, vd, 16);
2346 } else {
2347 memset(&result, 0, 16);
2348 }
2349
2350 for (size_t i = 0; i < oprsz; ++i) {
2351 uint32_t index = indices[H1(i)];
2352
2353 if (index < table_len) {
2354
2355
2356
2357
2358
2359
2360 const uint8_t *table = (const uint8_t *)
2361 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2362 result.b[H1(i)] = table[H1(index % 16)];
2363 }
2364 }
2365
2366 memcpy(vd, &result, 16);
2367 clear_tail(vd, oprsz, simd_maxsz(desc));
2368}
2369#endif
2370
2371
2372
2373
2374
2375
2376
2377void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2378{
2379 intptr_t i, opr_sz = simd_oprsz(desc);
2380 int8_t *d = vd, *n = vn, *m = vm;
2381
2382 for (i = 0; i < opr_sz; ++i) {
2383 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2384 }
2385 clear_tail(d, opr_sz, simd_maxsz(desc));
2386}
2387
2388void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2389{
2390 intptr_t i, opr_sz = simd_oprsz(desc);
2391 int16_t *d = vd, *n = vn, *m = vm;
2392
2393 for (i = 0; i < opr_sz / 2; ++i) {
2394 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2395 }
2396 clear_tail(d, opr_sz, simd_maxsz(desc));
2397}
2398
2399void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2400{
2401 intptr_t i, opr_sz = simd_oprsz(desc);
2402 int32_t *d = vd, *n = vn, *m = vm;
2403
2404 for (i = 0; i < opr_sz / 4; ++i) {
2405 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2406 }
2407 clear_tail(d, opr_sz, simd_maxsz(desc));
2408}
2409
2410void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2411{
2412 intptr_t i, opr_sz = simd_oprsz(desc);
2413 uint64_t *d = vd, *n = vn, *m = vm;
2414 uint64_t discard;
2415
2416 for (i = 0; i < opr_sz / 8; ++i) {
2417 muls64(&discard, &d[i], n[i], m[i]);
2418 }
2419 clear_tail(d, opr_sz, simd_maxsz(desc));
2420}
2421
2422void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2423{
2424 intptr_t i, opr_sz = simd_oprsz(desc);
2425 uint8_t *d = vd, *n = vn, *m = vm;
2426
2427 for (i = 0; i < opr_sz; ++i) {
2428 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2429 }
2430 clear_tail(d, opr_sz, simd_maxsz(desc));
2431}
2432
2433void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2434{
2435 intptr_t i, opr_sz = simd_oprsz(desc);
2436 uint16_t *d = vd, *n = vn, *m = vm;
2437
2438 for (i = 0; i < opr_sz / 2; ++i) {
2439 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2440 }
2441 clear_tail(d, opr_sz, simd_maxsz(desc));
2442}
2443
2444void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2445{
2446 intptr_t i, opr_sz = simd_oprsz(desc);
2447 uint32_t *d = vd, *n = vn, *m = vm;
2448
2449 for (i = 0; i < opr_sz / 4; ++i) {
2450 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2451 }
2452 clear_tail(d, opr_sz, simd_maxsz(desc));
2453}
2454
2455void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2456{
2457 intptr_t i, opr_sz = simd_oprsz(desc);
2458 uint64_t *d = vd, *n = vn, *m = vm;
2459 uint64_t discard;
2460
2461 for (i = 0; i < opr_sz / 8; ++i) {
2462 mulu64(&discard, &d[i], n[i], m[i]);
2463 }
2464 clear_tail(d, opr_sz, simd_maxsz(desc));
2465}
2466
2467void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2468{
2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470 int shr = simd_data(desc);
2471 uint64_t *d = vd, *n = vn, *m = vm;
2472
2473 for (i = 0; i < opr_sz; ++i) {
2474 d[i] = ror64(n[i] ^ m[i], shr);
2475 }
2476 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2477}
2478
2479
2480
2481
2482
/*
 * Add to sum the dot product of eight signed bytes at vn with eight
 * signed bytes at vm, and return the new total.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    const int8_t *lhs = vn;
    const int8_t *rhs = vm;
    uint32_t total = sum;

    for (intptr_t k = 0; k < 8; k++) {
        total += lhs[H1(k)] * rhs[H1(k)];
    }
    return total;
}
2492
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * unsigned bytes at vm, and return the new total.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *lhs = vn;
    const uint8_t *rhs = vm;
    uint32_t total = sum;

    for (intptr_t k = 0; k < 8; k++) {
        total += lhs[H1(k)] * rhs[H1(k)];
    }
    return total;
}
2502
/*
 * Add to sum the dot product of eight unsigned bytes at vn with eight
 * signed bytes at vm, and return the new total.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    const uint8_t *lhs = vn;
    const int8_t *rhs = vm;
    uint32_t total = sum;

    for (intptr_t k = 0; k < 8; k++) {
        total += lhs[H1(k)] * rhs[H1(k)];
    }
    return total;
}
2513
/*
 * Shared driver for the byte matrix multiply-accumulate helpers.
 *
 * The vectors are processed in 16-byte segments.  Within a segment the
 * accumulator (va) and destination (vd) are viewed as four 32-bit
 * elements laid out as a 2x2 tile, element index = 2 * row + col.
 * Tile entry (row, col) is the accumulator entry plus the 8-byte dot
 * product, computed by inner_loop, of the 8 bytes of vn at offset
 * 8 * row with the 8 bytes of vm at offset 8 * col.
 *
 * All four entries of a segment are computed before any is stored.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    const intptr_t bytes = simd_oprsz(desc);

    for (intptr_t off = 0; off < bytes; off += 16) {
        uint32_t *dst = vd + off;
        uint32_t *acc = va + off;
        uint32_t tile[4];

        for (int row = 0; row < 2; row++) {
            for (int col = 0; col < 2; col++) {
                const int e = 2 * row + col;

                tile[e] = inner_loop(acc[H4(e)],
                                     vn + off + 8 * row,
                                     vm + off + 8 * col);
            }
        }

        /* Store only after every input of the segment has been read. */
        for (int e = 0; e < 4; e++) {
            dst[H4(e)] = tile[e];
        }
    }
    clear_tail(vd, bytes, simd_maxsz(desc));
}
2547
2548#define DO_MMLA_B(NAME, INNER) \
2549 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2550 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2551
2552DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2553DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2554DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2555
2556
2557
2558
2559
2560float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2561{
2562
2563 float_status bf_status = {
2564 .tininess_before_rounding = float_tininess_before_rounding,
2565 .float_rounding_mode = float_round_to_odd_inf,
2566 .flush_to_zero = true,
2567 .flush_inputs_to_zero = true,
2568 .default_nan_mode = true,
2569 };
2570 float32 t1, t2;
2571
2572
2573
2574
2575
2576 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2577 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2578 t1 = float32_add(t1, t2, &bf_status);
2579 t1 = float32_add(sum, t1, &bf_status);
2580
2581 return t1;
2582}
2583
2584void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2585{
2586 intptr_t i, opr_sz = simd_oprsz(desc);
2587 float32 *d = vd, *a = va;
2588 uint32_t *n = vn, *m = vm;
2589
2590 for (i = 0; i < opr_sz / 4; ++i) {
2591 d[i] = bfdotadd(a[i], n[i], m[i]);
2592 }
2593 clear_tail(d, opr_sz, simd_maxsz(desc));
2594}
2595
2596void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2597 void *va, uint32_t desc)
2598{
2599 intptr_t i, j, opr_sz = simd_oprsz(desc);
2600 intptr_t index = simd_data(desc);
2601 intptr_t elements = opr_sz / 4;
2602 intptr_t eltspersegment = MIN(16 / 4, elements);
2603 float32 *d = vd, *a = va;
2604 uint32_t *n = vn, *m = vm;
2605
2606 for (i = 0; i < elements; i += eltspersegment) {
2607 uint32_t m_idx = m[i + H4(index)];
2608
2609 for (j = i; j < i + eltspersegment; j++) {
2610 d[j] = bfdotadd(a[j], n[j], m_idx);
2611 }
2612 }
2613 clear_tail(d, opr_sz, simd_maxsz(desc));
2614}
2615
2616void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2617{
2618 intptr_t s, opr_sz = simd_oprsz(desc);
2619 float32 *d = vd, *a = va;
2620 uint32_t *n = vn, *m = vm;
2621
2622 for (s = 0; s < opr_sz / 4; s += 4) {
2623 float32 sum00, sum01, sum10, sum11;
2624
2625
2626
2627
2628
2629
2630
2631
2632 sum00 = a[s + H4(0 + 0)];
2633 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2634 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2635
2636 sum01 = a[s + H4(0 + 1)];
2637 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2638 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2639
2640 sum10 = a[s + H4(2 + 0)];
2641 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2642 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2643
2644 sum11 = a[s + H4(2 + 1)];
2645 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2646 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2647
2648 d[s + H4(0 + 0)] = sum00;
2649 d[s + H4(0 + 1)] = sum01;
2650 d[s + H4(2 + 0)] = sum10;
2651 d[s + H4(2 + 1)] = sum11;
2652 }
2653 clear_tail(d, opr_sz, simd_maxsz(desc));
2654}
2655
2656void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2657 void *stat, uint32_t desc)
2658{
2659 intptr_t i, opr_sz = simd_oprsz(desc);
2660 intptr_t sel = simd_data(desc);
2661 float32 *d = vd, *a = va;
2662 bfloat16 *n = vn, *m = vm;
2663
2664 for (i = 0; i < opr_sz / 4; ++i) {
2665 float32 nn = n[H2(i * 2 + sel)] << 16;
2666 float32 mm = m[H2(i * 2 + sel)] << 16;
2667 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2668 }
2669 clear_tail(d, opr_sz, simd_maxsz(desc));
2670}
2671
2672void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2673 void *va, void *stat, uint32_t desc)
2674{
2675 intptr_t i, j, opr_sz = simd_oprsz(desc);
2676 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2677 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2678 intptr_t elements = opr_sz / 4;
2679 intptr_t eltspersegment = MIN(16 / 4, elements);
2680 float32 *d = vd, *a = va;
2681 bfloat16 *n = vn, *m = vm;
2682
2683 for (i = 0; i < elements; i += eltspersegment) {
2684 float32 m_idx = m[H2(2 * i + index)] << 16;
2685
2686 for (j = i; j < i + eltspersegment; j++) {
2687 float32 n_j = n[H2(2 * j + sel)] << 16;
2688 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2689 }
2690 }
2691 clear_tail(d, opr_sz, simd_maxsz(desc));
2692}
2693
2694#define DO_CLAMP(NAME, TYPE) \
2695void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
2696{ \
2697 intptr_t i, opr_sz = simd_oprsz(desc); \
2698 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2699 TYPE aa = *(TYPE *)(a + i); \
2700 TYPE nn = *(TYPE *)(n + i); \
2701 TYPE mm = *(TYPE *)(m + i); \
2702 TYPE dd = MIN(MAX(aa, nn), mm); \
2703 *(TYPE *)(d + i) = dd; \
2704 } \
2705 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2706}
2707
2708DO_CLAMP(gvec_sclamp_b, int8_t)
2709DO_CLAMP(gvec_sclamp_h, int16_t)
2710DO_CLAMP(gvec_sclamp_s, int32_t)
2711DO_CLAMP(gvec_sclamp_d, int64_t)
2712
2713DO_CLAMP(gvec_uclamp_b, uint8_t)
2714DO_CLAMP(gvec_uclamp_h, uint16_t)
2715DO_CLAMP(gvec_uclamp_s, uint32_t)
2716DO_CLAMP(gvec_uclamp_d, uint64_t)
2717