/* qemu/target/arm/neon_helper.c */
   1/*
   2 * ARM NEON vector operations.
   3 *
   4 * Copyright (c) 2007, 2008 CodeSourcery.
   5 * Written by Paul Brook
   6 *
   7 * This code is licensed under the GNU GPL v2.
   8 */
   9#include "qemu/osdep.h"
  10
  11#include "cpu.h"
  12#include "exec/helper-proto.h"
  13#include "fpu/softfloat.h"
  14#include "vec_internal.h"
  15
/* Sign-bit masks for 32-bit and 64-bit values. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record saturation by setting the sticky QC flag in the FP status. */
#define SET_QC() env->vfp.qc[0] = 1
  20
/* Lane-group types: 1, 2 or 4 lanes packed into one 32-bit quantity.
 * On big-endian hosts the fields are declared in reverse order so that
 * v1 is always the least-significant lane of the uint32_t the struct
 * is type-punned with (see NEON_UNPACK/NEON_PACK below).
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif
  57
/* Instantiate the lane-group types used by the helpers below. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
  67
/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* NEON_DOn: apply NEON_FN to each of the n lanes of a group. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  98
/* Common body for binary lane-wise helpers: unpack both 32-bit inputs
 * into lane structs, apply NEON_FN to each lane, repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define a helper taking just the two operands. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state so that
 * NEON_FN can use SET_QC().
 */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
 119
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* NEON_PDOn: combine adjacent lanes; arg1 supplies the low half of the
 * result, arg2 the high half.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define a pairwise helper. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
 145
/* Unary operators.  */
/* NEON_FN's third argument is unused for unary ops, so the binary
 * NEON_DOn expanders can be reused; the result is repacked into arg.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
 157
 158
/* Unsigned saturating add: do the add in 32 bits; if the sum no longer
 * fits in `type' saturate to all-ones and set QC.
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
 174
 175uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 176{
 177    uint32_t res = a + b;
 178    if (res < a) {
 179        SET_QC();
 180        res = ~0;
 181    }
 182    return res;
 183}
 184
 185uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 186{
 187    uint64_t res;
 188
 189    res = src1 + src2;
 190    if (res < src1) {
 191        SET_QC();
 192        res = ~(uint64_t)0;
 193    }
 194    return res;
 195}
 196
/* Signed saturating add: 32-bit add, then saturate to the extremes of
 * `type'; the sign of src2 picks MAX or MIN.  Note that assigning
 * 1 << (bits - 1) to the narrow dest yields the type's minimum.
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
 216
/* Signed saturating add, 32-bit. */
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    /* Overflow iff the operands have the same sign but the result's
     * sign differs; saturate towards the sign of a.
     */
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        /* 0x7fffffff when a >= 0, 0x80000000 when a < 0. */
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}
 226
/* Signed saturating add, 64-bit. */
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    /* Overflow iff operands share a sign that the result lost. */
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        /* INT64_MAX when src1 >= 0, INT64_MIN when src1 < 0. */
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
 238
 239/* Unsigned saturating accumulate of signed value
 240 *
 241 * Op1/Rn is treated as signed
 242 * Op2/Rd is treated as unsigned
 243 *
 244 * Explicit casting is used to ensure the correct sign extension of
 245 * inputs. The result is treated as a unsigned value and saturated as such.
 246 *
 247 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 248 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 249 */
 250
/* Add the signed field of `a' to the unsigned field of `b' at bit
 * range [shift, shift+bits), saturate to the unsigned range of that
 * width (setting QC), and deposit the result into `r'.
 */
#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits);                                \
        vb = extract32(b, shift, bits);                                 \
        vr = va + vb;                                                   \
        if (vr > UINT##bits##_MAX) {                                    \
            SET_QC();                                                   \
            vr = UINT##bits##_MAX;                                      \
        } else if (vr < 0) {                                            \
            SET_QC();                                                   \
            vr = 0;                                                     \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
   } while (0)
 265
/* Unsigned saturating accumulate of signed value, four 8-bit lanes. */
uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int16_t covers the interim range (-128..127) + (0..255). */
    int16_t va, vb, vr;
    uint32_t r = 0;

    USATACC(8, 0);
    USATACC(8, 8);
    USATACC(8, 16);
    USATACC(8, 24);
    return r;
}
 277
 278uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
 279{
 280    int32_t va, vb, vr;
 281    uint64_t r = 0;
 282
 283    USATACC(16, 0);
 284    USATACC(16, 16);
 285    return r;
 286}
 287
 288#undef USATACC
 289
 290uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
 291{
 292    int64_t va = (int32_t)a;
 293    int64_t vb = (uint32_t)b;
 294    int64_t vr = va + vb;
 295    if (vr > UINT32_MAX) {
 296        SET_QC();
 297        vr = UINT32_MAX;
 298    } else if (vr < 0) {
 299        SET_QC();
 300        vr = 0;
 301    }
 302    return vr;
 303}
 304
/* Unsigned saturating accumulate of signed value, 64-bit: a is treated
 * as signed, b as unsigned, result saturated as unsigned.
 */
uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect
     * +ve/-ve saturation
     */
    if (~a & b & ~res & SIGNBIT64) {
        /* Sum exceeded 2^64 - 1: clamp high. */
        SET_QC();
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {
        /* Negative a not cancelled by b: clamp to zero. */
        SET_QC();
        res = 0;
    }
    return res;
}
 321
 322/* Signed saturating accumulate of unsigned value
 323 *
 324 * Op1/Rn is treated as unsigned
 325 * Op2/Rd is treated as signed
 326 *
 327 * The result is treated as a signed value and saturated as such
 328 *
 329 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 330 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 331 */
 332
/* Add the unsigned field of `a' to the signed field of `b' at bit
 * range [shift, shift+bits), saturate to the signed range of that
 * width (setting QC), and deposit the result into `r'.
 */
#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits);                                 \
        vb = sextract32(b, shift, bits);                                \
        vr = va + vb;                                                   \
        if (vr > INT##bits##_MAX) {                                     \
            SET_QC();                                                   \
            vr = INT##bits##_MAX;                                       \
        } else if (vr < INT##bits##_MIN) {                              \
            SET_QC();                                                   \
            vr = INT##bits##_MIN;                                       \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)
 347
/* Signed saturating accumulate of unsigned value, four 8-bit lanes. */
uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int16_t covers the interim range (0..255) + (-128..127). */
    int16_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(8, 0);
    SSATACC(8, 8);
    SSATACC(8, 16);
    SSATACC(8, 24);
    return r;
}
 359
/* Signed saturating accumulate of unsigned value, two 16-bit lanes. */
uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int32_t covers the interim range (0..65535) + (-32768..32767). */
    int32_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(16, 0);
    SSATACC(16, 16);

    return r;
}

#undef SSATACC
 372
 373uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 374{
 375    int64_t res;
 376    int64_t op1 = (uint32_t)a;
 377    int64_t op2 = (int32_t)b;
 378    res = op1 + op2;
 379    if (res > INT32_MAX) {
 380        SET_QC();
 381        res = INT32_MAX;
 382    } else if (res < INT32_MIN) {
 383        SET_QC();
 384        res = INT32_MIN;
 385    }
 386    return res;
 387}
 388
/* Signed saturating accumulate of unsigned value, 64-bit: a is treated
 * as unsigned, b as signed.  Only positive overflow is possible here
 * (the smallest sum, 0 + INT64_MIN, still fits), hence the single
 * clamp to INT64_MAX.
 */
uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    if (((a & res)
         | (~b & res)
         | (a & ~b)) & SIGNBIT64) {
        SET_QC();
        res = INT64_MAX;
    }
    return res;
}
 402
 403
/* Unsigned saturating subtract: do the subtract in 32 bits; a borrow
 * makes the wrapped value fail the (type) round-trip check, so the
 * result saturates to 0 and QC is set.
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
 419
 420uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 421{
 422    uint32_t res = a - b;
 423    if (res > a) {
 424        SET_QC();
 425        res = 0;
 426    }
 427    return res;
 428}
 429
 430uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 431{
 432    uint64_t res;
 433
 434    if (src1 < src2) {
 435        SET_QC();
 436        res = 0;
 437    } else {
 438        res = src1 - src2;
 439    }
 440    return res;
 441}
 442
/* Signed saturating subtract: 32-bit subtract, then saturate to the
 * extremes of `type'; the sign of src2 picks MAX or MIN (subtracting
 * a negative can only overflow upwards, and vice versa).
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
 462
/* Signed saturating subtract, 32-bit. */
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    /* Overflow iff the operands differ in sign and the result's sign
     * differs from a's; saturate towards the sign of a.
     */
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        /* 0x7fffffff when a >= 0, 0x80000000 when a < 0. */
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}
 472
/* Signed saturating subtract, 64-bit. */
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    /* Overflow iff operand signs differ and the result's sign differs
     * from src1's.
     */
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        /* INT64_MAX when src1 >= 0, INT64_MIN when src1 < 0. */
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
 484
/* Halving add: the sum is formed in int (the lane fields promote), so
 * it cannot overflow before the shift.
 */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
 491
 492int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
 493{
 494    int32_t dest;
 495
 496    dest = (src1 >> 1) + (src2 >> 1);
 497    if (src1 & src2 & 1)
 498        dest++;
 499    return dest;
 500}
 501
 502uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
 503{
 504    uint32_t dest;
 505
 506    dest = (src1 >> 1) + (src2 >> 1);
 507    if (src1 & src2 & 1)
 508        dest++;
 509    return dest;
 510}
 511
/* Rounding halving add: as hadd but rounds the half-result upward. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
 518
 519int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
 520{
 521    int32_t dest;
 522
 523    dest = (src1 >> 1) + (src2 >> 1);
 524    if ((src1 | src2) & 1)
 525        dest++;
 526    return dest;
 527}
 528
 529uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
 530{
 531    uint32_t dest;
 532
 533    dest = (src1 >> 1) + (src2 >> 1);
 534    if ((src1 | src2) & 1)
 535        dest++;
 536    return dest;
 537}
 538
/* Halving subtract: difference formed in int, so no overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
 545
 546int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
 547{
 548    int32_t dest;
 549
 550    dest = (src1 >> 1) - (src2 >> 1);
 551    if ((~src1) & src2 & 1)
 552        dest--;
 553    return dest;
 554}
 555
 556uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
 557{
 558    uint32_t dest;
 559
 560    dest = (src1 >> 1) - (src2 >> 1);
 561    if ((~src1) & src2 & 1)
 562        dest--;
 563    return dest;
 564}
 565
/* Pairwise minimum: each result lane is the smaller of two adjacent
 * source lanes (pairing as in NEON_PDOn).
 */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
 579
/* Variable shifts: the shift count is the signed low byte of the
 * second operand; do_{u,s}qrshl_bhs/_d come from vec_internal.h.
 * The numeric argument is the element width in bits, the bool selects
 * rounding, and the final pointer receives saturation flags -- NULL
 * here since these forms do not saturate.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Rounding variants (round == true). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
 609
/* Unsigned rounding shifts (no saturation: NULL qc pointer). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
 629
/* Unsigned saturating shifts: saturation is recorded through the
 * env->vfp.qc pointer passed to the helper.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
 649
/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
 669
/* Saturating shifts of signed input with unsigned saturation
 * (do_suqrshl_*, see vec_internal.h).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
 689
/* Unsigned rounding saturating shifts (round == true, qc recorded). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
 709
/* Signed rounding saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
 729
 730uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
 731{
 732    uint32_t mask;
 733    mask = (a ^ b) & 0x80808080u;
 734    a &= ~0x80808080u;
 735    b &= ~0x80808080u;
 736    return (a + b) ^ mask;
 737}
 738
 739uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
 740{
 741    uint32_t mask;
 742    mask = (a ^ b) & 0x80008000u;
 743    a &= ~0x80008000u;
 744    b &= ~0x80008000u;
 745    return (a + b) ^ mask;
 746}
 747
/* Pairwise add (lane-wide; wrap-around per lane field). */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply (low half of the product). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Test: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
 768
 769/* Count Leading Sign/Zero Bits.  */
 770static inline int do_clz8(uint8_t x)
 771{
 772    int n;
 773    for (n = 8; x; n--)
 774        x >>= 1;
 775    return n;
 776}
 777
 778static inline int do_clz16(uint16_t x)
 779{
 780    int n;
 781    for (n = 16; x; n--)
 782        x >>= 1;
 783    return n;
 784}
 785
/* Lane-wise count-leading-zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Count leading sign bits: complement negative lanes so sign bits
 * become zeros; the -1 excludes the sign bit itself from the count.
 */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
 801
 802uint32_t HELPER(neon_cls_s32)(uint32_t x)
 803{
 804    int count;
 805    if ((int32_t)x < 0)
 806        x = ~x;
 807    for (count = 32; x; count--)
 808        x = x >> 1;
 809    return count - 1;
 810}
 811
 812/* Bit count.  */
 813uint32_t HELPER(neon_cnt_u8)(uint32_t x)
 814{
 815    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
 816    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
 817    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
 818    return x;
 819}
 820
 821/* Reverse bits in each 8 bit word */
 822uint32_t HELPER(neon_rbit_u8)(uint32_t x)
 823{
 824    x =  ((x & 0xf0f0f0f0) >> 4)
 825       | ((x & 0x0f0f0f0f) << 4);
 826    x =  ((x & 0x88888888) >> 3)
 827       | ((x & 0x44444444) >> 1)
 828       | ((x & 0x22222222) << 1)
 829       | ((x & 0x11111111) << 3);
 830    return x;
 831}
 832
/* Saturating doubling multiply returning high half, 16-bit lanes:
 * tmp = 2 * src1 * src2 with saturation (the sign-flip test fires only
 * for -2^15 * -2^15); the rounding variant adds 1 << 15 with a second
 * overflow check before taking the top 16 bits.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
 858
/* As NEON_QDMULH16, for 32-bit lanes using 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
 884
 885uint32_t HELPER(neon_narrow_u8)(uint64_t x)
 886{
 887    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
 888           | ((x >> 24) & 0xff000000u);
 889}
 890
 891uint32_t HELPER(neon_narrow_u16)(uint64_t x)
 892{
 893    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
 894}
 895
 896uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
 897{
 898    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
 899            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
 900}
 901
 902uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
 903{
 904    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 905}
 906
/* Rounding narrow-high: add half (1 << 7) to each 16-bit element
 * before taking its high byte.  The pre-mask keeps a carry out of one
 * element from rippling into the next.
 */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}
 914
/* Rounding narrow-high for two 32-bit elements: add 1 << 15 before
 * taking the high halves; the mask stops cross-element carries.
 */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
 921
/* Narrow four signed 16-bit elements to unsigned bytes with
 * saturation: negative inputs clamp to 0 (the lane in res is simply
 * left zero), inputs above 0xff clamp to 0xff; QC set on saturation.
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
 948
/* Narrow four unsigned 16-bit elements to bytes, clamping to 0xff. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
 971
/* Narrow four signed 16-bit elements to signed bytes with saturation;
 * (s >> 15) ^ 0x7f yields 0x7f for positive overflow, 0x80 for
 * negative.
 */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
 994
/* Narrow two signed 32-bit elements to unsigned 16-bit with
 * saturation: negative inputs clamp to 0, inputs above 0xffff clamp
 * to 0xffff.
 */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
1017
1018uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
1019{
1020    uint32_t high;
1021    uint32_t low;
1022    low = x;
1023    if (low > 0xffff) {
1024        low = 0xffff;
1025        SET_QC();
1026    }
1027    high = x >> 32;
1028    if (high > 0xffff) {
1029        high = 0xffff;
1030        SET_QC();
1031    }
1032    return low | (high << 16);
1033}
1034
/* Narrow two signed 32-bit elements to signed 16-bit with saturation;
 * (v >> 31) ^ 0x7fff yields 0x7fff or 0x8000.
 */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Mask low to 16 bits so high's sign extension cannot clobber it. */
    return (uint16_t)low | (high << 16);
}
1051
1052uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
1053{
1054    if (x & 0x8000000000000000ull) {
1055        SET_QC();
1056        return 0;
1057    }
1058    if (x > 0xffffffffu) {
1059        SET_QC();
1060        return 0xffffffffu;
1061    }
1062    return x;
1063}
1064
1065uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
1066{
1067    if (x > 0xffffffffu) {
1068        SET_QC();
1069        return 0xffffffffu;
1070    }
1071    return x;
1072}
1073
/* Narrow a signed 64-bit value to signed 32 bits with saturation. */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        /* INT32_MAX for positive inputs, INT32_MIN for negative. */
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
1082
1083uint64_t HELPER(neon_widen_u8)(uint32_t x)
1084{
1085    uint64_t tmp;
1086    uint64_t ret;
1087    ret = (uint8_t)x;
1088    tmp = (uint8_t)(x >> 8);
1089    ret |= tmp << 16;
1090    tmp = (uint8_t)(x >> 16);
1091    ret |= tmp << 32;
1092    tmp = (uint8_t)(x >> 24);
1093    ret |= tmp << 48;
1094    return ret;
1095}
1096
/* Sign-extend four 8-bit lanes into 16-bit lanes.  The (int8_t) cast
 * does the sign extension; the (uint16_t) cast then clips it to one
 * lane so the OR-ed lanes cannot overlap.
 */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}
1110
1111uint64_t HELPER(neon_widen_u16)(uint32_t x)
1112{
1113    uint64_t high = (uint16_t)(x >> 16);
1114    return ((uint16_t)x) | (high << 32);
1115}
1116
1117uint64_t HELPER(neon_widen_s16)(uint32_t x)
1118{
1119    uint64_t high = (int16_t)(x >> 16);
1120    return ((uint32_t)(int16_t)x) | (high << 32);
1121}
1122
/* Add the four 16-bit lanes of a and b in parallel with per-lane
 * wraparound.  The top bit of each lane is cleared first so a carry
 * cannot cross into the next lane; the correct top bits are restored
 * by XORing back their carry-less sum (mask).
 */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}
1131
/* Add the two 32-bit lanes of a and b in parallel with per-lane
 * wraparound; same carry-isolation trick as neon_addl_u16 but with
 * one guard bit per 32-bit lane.
 */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}
1140
/* Pairwise add of 16-bit lanes: the result lanes are, low to high,
 * (a0+a1), (a2+a3), (b0+b1), (b2+b3), each truncated to 16 bits.
 * NOTE(review): despite the "l" in the name the lane sums wrap at
 * 16 bits here; presumably callers widen the inputs first so the
 * sums cannot overflow — confirm against the translator.
 */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;            /* lanes 0 and 2 of a */
    tmp += (a >> 16) & 0x0000ffff0000ffffull;   /* += lanes 1 and 3 */
    tmp2 = b & 0xffff0000ffff0000ull;           /* lanes 1 and 3 of b, kept high */
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;  /* += lanes 0 and 2 */
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}
1155
1156uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1157{
1158    uint32_t low = a + (a >> 32);
1159    uint32_t high = b + (b >> 32);
1160    return low + ((uint64_t)high << 32);
1161}
1162
/* Subtract the four 16-bit lanes of b from those of a in parallel
 * with per-lane wraparound.  Forcing the top bit of every a lane to
 * 1 and of every b lane to 0 keeps borrows from crossing lane
 * boundaries; mask restores the correct top bits afterwards.
 */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}
1171
/* Subtract the two 32-bit lanes of b from those of a in parallel;
 * same borrow-isolation trick as neon_subl_u16 with one guard bit
 * per 32-bit lane.
 */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
1180
/* Signed saturating add of the two 32-bit lanes of a and b; an
 * overflowing lane saturates to INT32_MIN/INT32_MAX and sets QC.
 */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    /* Signed overflow iff the operands have the same sign but the
     * result's sign differs from theirs.
     */
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        /* 0x7fffffff for positive overflow, 0x80000000 for negative. */
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}
1202
/* Signed saturating 64-bit add; overflow saturates to
 * INT64_MIN/INT64_MAX and sets QC.
 */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    /* Overflow iff operands share a sign that the result lacks. */
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
1214
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 *
 * DEST = |(intype)(x) - (intype)(y)|, evaluated in arithtype; the
 * intype cast extracts (and for signed variants sign-extends) the
 * lane from a shifted operand.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
1224
1225uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1226{
1227    uint64_t tmp;
1228    uint64_t result;
1229    DO_ABD(result, a, b, uint8_t, uint32_t);
1230    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
1231    result |= tmp << 16;
1232    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
1233    result |= tmp << 32;
1234    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
1235    result |= tmp << 48;
1236    return result;
1237}
1238
1239uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1240{
1241    uint64_t tmp;
1242    uint64_t result;
1243    DO_ABD(result, a, b, int8_t, int32_t);
1244    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
1245    result |= tmp << 16;
1246    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
1247    result |= tmp << 32;
1248    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
1249    result |= tmp << 48;
1250    return result;
1251}
1252
1253uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1254{
1255    uint64_t tmp;
1256    uint64_t result;
1257    DO_ABD(result, a, b, uint16_t, uint32_t);
1258    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1259    return result | (tmp << 32);
1260}
1261
1262uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1263{
1264    uint64_t tmp;
1265    uint64_t result;
1266    DO_ABD(result, a, b, int16_t, int32_t);
1267    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1268    return result | (tmp << 32);
1269}
1270
1271uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1272{
1273    uint64_t result;
1274    DO_ABD(result, a, b, uint32_t, uint64_t);
1275    return result;
1276}
1277
1278uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1279{
1280    uint64_t result;
1281    DO_ABD(result, a, b, int32_t, int64_t);
1282    return result;
1283}
1284#undef DO_ABD
1285
/* Widening multiply. Named type is the source type.  */
/* DEST = (type2)x * (type2)y, where type1 extracts the lane (and
 * sign-extends for signed variants) and type2 is the widened result
 * type the product wraps in.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
1292
1293uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1294{
1295    uint64_t tmp;
1296    uint64_t result;
1297
1298    DO_MULL(result, a, b, uint8_t, uint16_t);
1299    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1300    result |= tmp << 16;
1301    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1302    result |= tmp << 32;
1303    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1304    result |= tmp << 48;
1305    return result;
1306}
1307
1308uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1309{
1310    uint64_t tmp;
1311    uint64_t result;
1312
1313    DO_MULL(result, a, b, int8_t, uint16_t);
1314    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1315    result |= tmp << 16;
1316    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1317    result |= tmp << 32;
1318    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1319    result |= tmp << 48;
1320    return result;
1321}
1322
1323uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1324{
1325    uint64_t tmp;
1326    uint64_t result;
1327
1328    DO_MULL(result, a, b, uint16_t, uint32_t);
1329    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1330    return result | (tmp << 32);
1331}
1332
1333uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1334{
1335    uint64_t tmp;
1336    uint64_t result;
1337
1338    DO_MULL(result, a, b, int16_t, uint32_t);
1339    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1340    return result | (tmp << 32);
1341}
1342
1343uint64_t HELPER(neon_negl_u16)(uint64_t x)
1344{
1345    uint16_t tmp;
1346    uint64_t result;
1347    result = (uint16_t)-x;
1348    tmp = -(x >> 16);
1349    result |= (uint64_t)tmp << 16;
1350    tmp = -(x >> 32);
1351    result |= (uint64_t)tmp << 32;
1352    tmp = -(x >> 48);
1353    result |= (uint64_t)tmp << 48;
1354    return result;
1355}
1356
1357uint64_t HELPER(neon_negl_u32)(uint64_t x)
1358{
1359    uint32_t low = -x;
1360    uint32_t high = -(x >> 32);
1361    return low | ((uint64_t)high << 32);
1362}
1363
1364/* Saturating sign manipulation.  */
1365/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value of one int8 lane: -128 saturates to 127
 * and sets QC; other negative values are simply negated.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Apply DO_QABS8 to all four int8 lanes of x. */
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8
1385
/* Saturating negation of one int8 lane: -128 saturates to 127 and
 * sets QC; everything else is negated.
 */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Apply DO_QNEG8 to all four int8 lanes of x. */
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8
1405
/* Saturating absolute value of one int16 lane: -32768 saturates to
 * 32767 and sets QC; other negative values are negated.
 */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Apply DO_QABS16 to both int16 lanes of x. */
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16
1423
/* Saturating negation of one int16 lane: -32768 saturates to 32767
 * and sets QC; everything else is negated.
 */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Apply DO_QNEG16 to both int16 lanes of x. */
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
1441
1442uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1443{
1444    if (x == SIGNBIT) {
1445        SET_QC();
1446        x = ~SIGNBIT;
1447    } else if ((int32_t)x < 0) {
1448        x = -x;
1449    }
1450    return x;
1451}
1452
1453uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1454{
1455    if (x == SIGNBIT) {
1456        SET_QC();
1457        x = ~SIGNBIT;
1458    } else {
1459        x = -x;
1460    }
1461    return x;
1462}
1463
1464uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1465{
1466    if (x == SIGNBIT64) {
1467        SET_QC();
1468        x = ~SIGNBIT64;
1469    } else if ((int64_t)x < 0) {
1470        x = -x;
1471    }
1472    return x;
1473}
1474
1475uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1476{
1477    if (x == SIGNBIT64) {
1478        SET_QC();
1479        x = ~SIGNBIT64;
1480    } else {
1481        x = -x;
1482    }
1483    return x;
1484}
1485
1486/* NEON Float helpers.  */
1487
1488/* Floating point comparisons produce an integer result.
1489 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1490 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1491 */
1492uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1493{
1494    float_status *fpst = fpstp;
1495    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1496}
1497
1498uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1499{
1500    float_status *fpst = fpstp;
1501    return -float32_le(make_float32(b), make_float32(a), fpst);
1502}
1503
1504uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1505{
1506    float_status *fpst = fpstp;
1507    return -float32_lt(make_float32(b), make_float32(a), fpst);
1508}
1509
1510uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1511{
1512    float_status *fpst = fpstp;
1513    float32 f0 = float32_abs(make_float32(a));
1514    float32 f1 = float32_abs(make_float32(b));
1515    return -float32_le(f1, f0, fpst);
1516}
1517
1518uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1519{
1520    float_status *fpst = fpstp;
1521    float32 f0 = float32_abs(make_float32(a));
1522    float32 f1 = float32_abs(make_float32(b));
1523    return -float32_lt(f1, f0, fpst);
1524}
1525
1526uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1527{
1528    float_status *fpst = fpstp;
1529    float64 f0 = float64_abs(make_float64(a));
1530    float64 f1 = float64_abs(make_float64(b));
1531    return -float64_le(f1, f0, fpst);
1532}
1533
1534uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1535{
1536    float_status *fpst = fpstp;
1537    float64 f0 = float64_abs(make_float64(a));
1538    float64 f1 = float64_abs(make_float64(b));
1539    return -float64_lt(f1, f0, fpst);
1540}
1541
/* Extract element N (of width SIZE bits) from vector value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1543
/* Quadword byte unzip: d collects the even-indexed bytes of the
 * original {d, m} pair, m the odd-indexed bytes.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1572
/* Quadword halfword unzip: d collects the even-indexed halfwords of
 * the original {d, m} pair, m the odd-indexed ones.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1593
/* Quadword word unzip: d collects the even-indexed words of the
 * original {d, m} pair, m the odd-indexed ones.
 */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1610
/* Doubleword byte unzip: d gets the even-indexed bytes of {d, m},
 * m the odd-indexed bytes.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1628
/* Doubleword halfword unzip: d gets the even-indexed halfwords of
 * {d, m}, m the odd-indexed ones.
 */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1642
/* Quadword byte zip: d receives the bytes of the low halves of the
 * original d and m interleaved, m the interleaved high halves.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1671
/* Quadword halfword zip: d receives the halfwords of the low halves
 * of the original d and m interleaved, m the interleaved high halves.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1692
/* Quadword word zip: d receives the words of the low halves of the
 * original d and m interleaved, m the interleaved high halves.
 */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1709
/* Doubleword byte zip: d receives the interleaved low halves of the
 * original d and m, m the interleaved high halves.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1727
/* Doubleword halfword zip: d receives the interleaved low halves of
 * the original d and m, m the interleaved high halves.
 */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1741