qemu/target/arm/neon_helper.c
<<
>>
Prefs
   1/*
   2 * ARM NEON vector operations.
   3 *
   4 * Copyright (c) 2007, 2008 CodeSourcery.
   5 * Written by Paul Brook
   6 *
   7 * This code is licensed under the GNU GPL v2.
   8 */
   9#include "qemu/osdep.h"
  10
  11#include "cpu.h"
  12#include "exec/exec-all.h"
  13#include "exec/helper-proto.h"
  14#include "fpu/softfloat.h"
  15
  16#define SIGNBIT (uint32_t)0x80000000
  17#define SIGNBIT64 ((uint64_t)1 << 63)
  18
  19#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
  20
  21#define NEON_TYPE1(name, type) \
  22typedef struct \
  23{ \
  24    type v1; \
  25} neon_##name;
  26#ifdef HOST_WORDS_BIGENDIAN
  27#define NEON_TYPE2(name, type) \
  28typedef struct \
  29{ \
  30    type v2; \
  31    type v1; \
  32} neon_##name;
  33#define NEON_TYPE4(name, type) \
  34typedef struct \
  35{ \
  36    type v4; \
  37    type v3; \
  38    type v2; \
  39    type v1; \
  40} neon_##name;
  41#else
  42#define NEON_TYPE2(name, type) \
  43typedef struct \
  44{ \
  45    type v1; \
  46    type v2; \
  47} neon_##name;
  48#define NEON_TYPE4(name, type) \
  49typedef struct \
  50{ \
  51    type v1; \
  52    type v2; \
  53    type v3; \
  54    type v4; \
  55} neon_##name;
  56#endif
  57
  58NEON_TYPE4(s8, int8_t)
  59NEON_TYPE4(u8, uint8_t)
  60NEON_TYPE2(s16, int16_t)
  61NEON_TYPE2(u16, uint16_t)
  62NEON_TYPE1(s32, int32_t)
  63NEON_TYPE1(u32, uint32_t)
  64#undef NEON_TYPE4
  65#undef NEON_TYPE2
  66#undef NEON_TYPE1
  67
  68/* Copy from a uint32_t to a vector structure type.  */
  69#define NEON_UNPACK(vtype, dest, val) do { \
  70    union { \
  71        vtype v; \
  72        uint32_t i; \
  73    } conv_u; \
  74    conv_u.i = (val); \
  75    dest = conv_u.v; \
  76    } while(0)
  77
  78/* Copy from a vector structure type to a uint32_t.  */
  79#define NEON_PACK(vtype, dest, val) do { \
  80    union { \
  81        vtype v; \
  82        uint32_t i; \
  83    } conv_u; \
  84    conv_u.v = (val); \
  85    dest = conv_u.i; \
  86    } while(0)
  87
  88#define NEON_DO1 \
  89    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
  90#define NEON_DO2 \
  91    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  92    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
  93#define NEON_DO4 \
  94    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
  95    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
  96    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
  97    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
  98
  99#define NEON_VOP_BODY(vtype, n) \
 100{ \
 101    uint32_t res; \
 102    vtype vsrc1; \
 103    vtype vsrc2; \
 104    vtype vdest; \
 105    NEON_UNPACK(vtype, vsrc1, arg1); \
 106    NEON_UNPACK(vtype, vsrc2, arg2); \
 107    NEON_DO##n; \
 108    NEON_PACK(vtype, res, vdest); \
 109    return res; \
 110}
 111
 112#define NEON_VOP(name, vtype, n) \
 113uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
 114NEON_VOP_BODY(vtype, n)
 115
 116#define NEON_VOP_ENV(name, vtype, n) \
 117uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
 118NEON_VOP_BODY(vtype, n)
 119
 120/* Pairwise operations.  */
 121/* For 32-bit elements each segment only contains a single element, so
 122   the elementwise and pairwise operations are the same.  */
 123#define NEON_PDO2 \
 124    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
 125    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
 126#define NEON_PDO4 \
 127    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
 128    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
 129    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
 130    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
 131
 132#define NEON_POP(name, vtype, n) \
 133uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
 134{ \
 135    uint32_t res; \
 136    vtype vsrc1; \
 137    vtype vsrc2; \
 138    vtype vdest; \
 139    NEON_UNPACK(vtype, vsrc1, arg1); \
 140    NEON_UNPACK(vtype, vsrc2, arg2); \
 141    NEON_PDO##n; \
 142    NEON_PACK(vtype, res, vdest); \
 143    return res; \
 144}
 145
 146/* Unary operators.  */
 147#define NEON_VOP1(name, vtype, n) \
 148uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
 149{ \
 150    vtype vsrc1; \
 151    vtype vdest; \
 152    NEON_UNPACK(vtype, vsrc1, arg); \
 153    NEON_DO##n; \
 154    NEON_PACK(vtype, arg, vdest); \
 155    return arg; \
 156}
 157
 158
 159#define NEON_USAT(dest, src1, src2, type) do { \
 160    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
 161    if (tmp != (type)tmp) { \
 162        SET_QC(); \
 163        dest = ~0; \
 164    } else { \
 165        dest = tmp; \
 166    }} while(0)
 167#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
 168NEON_VOP_ENV(qadd_u8, neon_u8, 4)
 169#undef NEON_FN
 170#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
 171NEON_VOP_ENV(qadd_u16, neon_u16, 2)
 172#undef NEON_FN
 173#undef NEON_USAT
 174
 175uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 176{
 177    uint32_t res = a + b;
 178    if (res < a) {
 179        SET_QC();
 180        res = ~0;
 181    }
 182    return res;
 183}
 184
 185uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 186{
 187    uint64_t res;
 188
 189    res = src1 + src2;
 190    if (res < src1) {
 191        SET_QC();
 192        res = ~(uint64_t)0;
 193    }
 194    return res;
 195}
 196
 197#define NEON_SSAT(dest, src1, src2, type) do { \
 198    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
 199    if (tmp != (type)tmp) { \
 200        SET_QC(); \
 201        if (src2 > 0) { \
 202            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
 203        } else { \
 204            tmp = 1 << (sizeof(type) * 8 - 1); \
 205        } \
 206    } \
 207    dest = tmp; \
 208    } while(0)
 209#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
 210NEON_VOP_ENV(qadd_s8, neon_s8, 4)
 211#undef NEON_FN
 212#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
 213NEON_VOP_ENV(qadd_s16, neon_s16, 2)
 214#undef NEON_FN
 215#undef NEON_SSAT
 216
 217uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
 218{
 219    uint32_t res = a + b;
 220    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
 221        SET_QC();
 222        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
 223    }
 224    return res;
 225}
 226
 227uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 228{
 229    uint64_t res;
 230
 231    res = src1 + src2;
 232    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
 233        SET_QC();
 234        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
 235    }
 236    return res;
 237}
 238
 239/* Unsigned saturating accumulate of signed value
 240 *
 241 * Op1/Rn is treated as signed
 242 * Op2/Rd is treated as unsigned
 243 *
 244 * Explicit casting is used to ensure the correct sign extension of
 245 * inputs. The result is treated as a unsigned value and saturated as such.
 246 *
 247 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 248 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 249 */
 250
 251#define USATACC(bits, shift) \
 252    do { \
 253        va = sextract32(a, shift, bits);                                \
 254        vb = extract32(b, shift, bits);                                 \
 255        vr = va + vb;                                                   \
 256        if (vr > UINT##bits##_MAX) {                                    \
 257            SET_QC();                                                   \
 258            vr = UINT##bits##_MAX;                                      \
 259        } else if (vr < 0) {                                            \
 260            SET_QC();                                                   \
 261            vr = 0;                                                     \
 262        }                                                               \
 263        r = deposit32(r, shift, bits, vr);                              \
 264   } while (0)
 265
 266uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
 267{
 268    int16_t va, vb, vr;
 269    uint32_t r = 0;
 270
 271    USATACC(8, 0);
 272    USATACC(8, 8);
 273    USATACC(8, 16);
 274    USATACC(8, 24);
 275    return r;
 276}
 277
 278uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
 279{
 280    int32_t va, vb, vr;
 281    uint64_t r = 0;
 282
 283    USATACC(16, 0);
 284    USATACC(16, 16);
 285    return r;
 286}
 287
 288#undef USATACC
 289
 290uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
 291{
 292    int64_t va = (int32_t)a;
 293    int64_t vb = (uint32_t)b;
 294    int64_t vr = va + vb;
 295    if (vr > UINT32_MAX) {
 296        SET_QC();
 297        vr = UINT32_MAX;
 298    } else if (vr < 0) {
 299        SET_QC();
 300        vr = 0;
 301    }
 302    return vr;
 303}
 304
 305uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
 306{
 307    uint64_t res;
 308    res = a + b;
 309    /* We only need to look at the pattern of SIGN bits to detect
 310     * +ve/-ve saturation
 311     */
 312    if (~a & b & ~res & SIGNBIT64) {
 313        SET_QC();
 314        res = UINT64_MAX;
 315    } else if (a & ~b & res & SIGNBIT64) {
 316        SET_QC();
 317        res = 0;
 318    }
 319    return res;
 320}
 321
 322/* Signed saturating accumulate of unsigned value
 323 *
 324 * Op1/Rn is treated as unsigned
 325 * Op2/Rd is treated as signed
 326 *
 327 * The result is treated as a signed value and saturated as such
 328 *
 329 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 330 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 331 */
 332
 333#define SSATACC(bits, shift) \
 334    do { \
 335        va = extract32(a, shift, bits);                                 \
 336        vb = sextract32(b, shift, bits);                                \
 337        vr = va + vb;                                                   \
 338        if (vr > INT##bits##_MAX) {                                     \
 339            SET_QC();                                                   \
 340            vr = INT##bits##_MAX;                                       \
 341        } else if (vr < INT##bits##_MIN) {                              \
 342            SET_QC();                                                   \
 343            vr = INT##bits##_MIN;                                       \
 344        }                                                               \
 345        r = deposit32(r, shift, bits, vr);                              \
 346    } while (0)
 347
 348uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
 349{
 350    int16_t va, vb, vr;
 351    uint32_t r = 0;
 352
 353    SSATACC(8, 0);
 354    SSATACC(8, 8);
 355    SSATACC(8, 16);
 356    SSATACC(8, 24);
 357    return r;
 358}
 359
 360uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
 361{
 362    int32_t va, vb, vr;
 363    uint32_t r = 0;
 364
 365    SSATACC(16, 0);
 366    SSATACC(16, 16);
 367
 368    return r;
 369}
 370
 371#undef SSATACC
 372
 373uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 374{
 375    int64_t res;
 376    int64_t op1 = (uint32_t)a;
 377    int64_t op2 = (int32_t)b;
 378    res = op1 + op2;
 379    if (res > INT32_MAX) {
 380        SET_QC();
 381        res = INT32_MAX;
 382    } else if (res < INT32_MIN) {
 383        SET_QC();
 384        res = INT32_MIN;
 385    }
 386    return res;
 387}
 388
 389uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
 390{
 391    uint64_t res;
 392    res = a + b;
 393    /* We only need to look at the pattern of SIGN bits to detect an overflow */
 394    if (((a & res)
 395         | (~b & res)
 396         | (a & ~b)) & SIGNBIT64) {
 397        SET_QC();
 398        res = INT64_MAX;
 399    }
 400    return res;
 401}
 402
 403
 404#define NEON_USAT(dest, src1, src2, type) do { \
 405    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
 406    if (tmp != (type)tmp) { \
 407        SET_QC(); \
 408        dest = 0; \
 409    } else { \
 410        dest = tmp; \
 411    }} while(0)
 412#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
 413NEON_VOP_ENV(qsub_u8, neon_u8, 4)
 414#undef NEON_FN
 415#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
 416NEON_VOP_ENV(qsub_u16, neon_u16, 2)
 417#undef NEON_FN
 418#undef NEON_USAT
 419
 420uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
 421{
 422    uint32_t res = a - b;
 423    if (res > a) {
 424        SET_QC();
 425        res = 0;
 426    }
 427    return res;
 428}
 429
 430uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 431{
 432    uint64_t res;
 433
 434    if (src1 < src2) {
 435        SET_QC();
 436        res = 0;
 437    } else {
 438        res = src1 - src2;
 439    }
 440    return res;
 441}
 442
 443#define NEON_SSAT(dest, src1, src2, type) do { \
 444    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
 445    if (tmp != (type)tmp) { \
 446        SET_QC(); \
 447        if (src2 < 0) { \
 448            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
 449        } else { \
 450            tmp = 1 << (sizeof(type) * 8 - 1); \
 451        } \
 452    } \
 453    dest = tmp; \
 454    } while(0)
 455#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
 456NEON_VOP_ENV(qsub_s8, neon_s8, 4)
 457#undef NEON_FN
 458#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
 459NEON_VOP_ENV(qsub_s16, neon_s16, 2)
 460#undef NEON_FN
 461#undef NEON_SSAT
 462
 463uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
 464{
 465    uint32_t res = a - b;
 466    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
 467        SET_QC();
 468        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
 469    }
 470    return res;
 471}
 472
 473uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
 474{
 475    uint64_t res;
 476
 477    res = src1 - src2;
 478    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
 479        SET_QC();
 480        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
 481    }
 482    return res;
 483}
 484
 485#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
 486NEON_VOP(hadd_s8, neon_s8, 4)
 487NEON_VOP(hadd_u8, neon_u8, 4)
 488NEON_VOP(hadd_s16, neon_s16, 2)
 489NEON_VOP(hadd_u16, neon_u16, 2)
 490#undef NEON_FN
 491
 492int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
 493{
 494    int32_t dest;
 495
 496    dest = (src1 >> 1) + (src2 >> 1);
 497    if (src1 & src2 & 1)
 498        dest++;
 499    return dest;
 500}
 501
 502uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
 503{
 504    uint32_t dest;
 505
 506    dest = (src1 >> 1) + (src2 >> 1);
 507    if (src1 & src2 & 1)
 508        dest++;
 509    return dest;
 510}
 511
 512#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
 513NEON_VOP(rhadd_s8, neon_s8, 4)
 514NEON_VOP(rhadd_u8, neon_u8, 4)
 515NEON_VOP(rhadd_s16, neon_s16, 2)
 516NEON_VOP(rhadd_u16, neon_u16, 2)
 517#undef NEON_FN
 518
 519int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
 520{
 521    int32_t dest;
 522
 523    dest = (src1 >> 1) + (src2 >> 1);
 524    if ((src1 | src2) & 1)
 525        dest++;
 526    return dest;
 527}
 528
 529uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
 530{
 531    uint32_t dest;
 532
 533    dest = (src1 >> 1) + (src2 >> 1);
 534    if ((src1 | src2) & 1)
 535        dest++;
 536    return dest;
 537}
 538
 539#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
 540NEON_VOP(hsub_s8, neon_s8, 4)
 541NEON_VOP(hsub_u8, neon_u8, 4)
 542NEON_VOP(hsub_s16, neon_s16, 2)
 543NEON_VOP(hsub_u16, neon_u16, 2)
 544#undef NEON_FN
 545
 546int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
 547{
 548    int32_t dest;
 549
 550    dest = (src1 >> 1) - (src2 >> 1);
 551    if ((~src1) & src2 & 1)
 552        dest--;
 553    return dest;
 554}
 555
 556uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
 557{
 558    uint32_t dest;
 559
 560    dest = (src1 >> 1) - (src2 >> 1);
 561    if ((~src1) & src2 & 1)
 562        dest--;
 563    return dest;
 564}
 565
 566#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
 567NEON_VOP(cgt_s8, neon_s8, 4)
 568NEON_VOP(cgt_u8, neon_u8, 4)
 569NEON_VOP(cgt_s16, neon_s16, 2)
 570NEON_VOP(cgt_u16, neon_u16, 2)
 571NEON_VOP(cgt_s32, neon_s32, 1)
 572NEON_VOP(cgt_u32, neon_u32, 1)
 573#undef NEON_FN
 574
 575#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
 576NEON_VOP(cge_s8, neon_s8, 4)
 577NEON_VOP(cge_u8, neon_u8, 4)
 578NEON_VOP(cge_s16, neon_s16, 2)
 579NEON_VOP(cge_u16, neon_u16, 2)
 580NEON_VOP(cge_s32, neon_s32, 1)
 581NEON_VOP(cge_u32, neon_u32, 1)
 582#undef NEON_FN
 583
 584#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
 585NEON_VOP(min_s8, neon_s8, 4)
 586NEON_VOP(min_u8, neon_u8, 4)
 587NEON_VOP(min_s16, neon_s16, 2)
 588NEON_VOP(min_u16, neon_u16, 2)
 589NEON_VOP(min_s32, neon_s32, 1)
 590NEON_VOP(min_u32, neon_u32, 1)
 591NEON_POP(pmin_s8, neon_s8, 4)
 592NEON_POP(pmin_u8, neon_u8, 4)
 593NEON_POP(pmin_s16, neon_s16, 2)
 594NEON_POP(pmin_u16, neon_u16, 2)
 595#undef NEON_FN
 596
 597#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
 598NEON_VOP(max_s8, neon_s8, 4)
 599NEON_VOP(max_u8, neon_u8, 4)
 600NEON_VOP(max_s16, neon_s16, 2)
 601NEON_VOP(max_u16, neon_u16, 2)
 602NEON_VOP(max_s32, neon_s32, 1)
 603NEON_VOP(max_u32, neon_u32, 1)
 604NEON_POP(pmax_s8, neon_s8, 4)
 605NEON_POP(pmax_u8, neon_u8, 4)
 606NEON_POP(pmax_s16, neon_s16, 2)
 607NEON_POP(pmax_u16, neon_u16, 2)
 608#undef NEON_FN
 609
 610#define NEON_FN(dest, src1, src2) \
 611    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
 612NEON_VOP(abd_s8, neon_s8, 4)
 613NEON_VOP(abd_u8, neon_u8, 4)
 614NEON_VOP(abd_s16, neon_s16, 2)
 615NEON_VOP(abd_u16, neon_u16, 2)
 616NEON_VOP(abd_s32, neon_s32, 1)
 617NEON_VOP(abd_u32, neon_u32, 1)
 618#undef NEON_FN
 619
 620#define NEON_FN(dest, src1, src2) do { \
 621    int8_t tmp; \
 622    tmp = (int8_t)src2; \
 623    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
 624        tmp <= -(ssize_t)sizeof(src1) * 8) { \
 625        dest = 0; \
 626    } else if (tmp < 0) { \
 627        dest = src1 >> -tmp; \
 628    } else { \
 629        dest = src1 << tmp; \
 630    }} while (0)
 631NEON_VOP(shl_u8, neon_u8, 4)
 632NEON_VOP(shl_u16, neon_u16, 2)
 633NEON_VOP(shl_u32, neon_u32, 1)
 634#undef NEON_FN
 635
 636uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
 637{
 638    int8_t shift = (int8_t)shiftop;
 639    if (shift >= 64 || shift <= -64) {
 640        val = 0;
 641    } else if (shift < 0) {
 642        val >>= -shift;
 643    } else {
 644        val <<= shift;
 645    }
 646    return val;
 647}
 648
 649#define NEON_FN(dest, src1, src2) do { \
 650    int8_t tmp; \
 651    tmp = (int8_t)src2; \
 652    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
 653        dest = 0; \
 654    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
 655        dest = src1 >> (sizeof(src1) * 8 - 1); \
 656    } else if (tmp < 0) { \
 657        dest = src1 >> -tmp; \
 658    } else { \
 659        dest = src1 << tmp; \
 660    }} while (0)
 661NEON_VOP(shl_s8, neon_s8, 4)
 662NEON_VOP(shl_s16, neon_s16, 2)
 663NEON_VOP(shl_s32, neon_s32, 1)
 664#undef NEON_FN
 665
 666uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
 667{
 668    int8_t shift = (int8_t)shiftop;
 669    int64_t val = valop;
 670    if (shift >= 64) {
 671        val = 0;
 672    } else if (shift <= -64) {
 673        val >>= 63;
 674    } else if (shift < 0) {
 675        val >>= -shift;
 676    } else {
 677        val <<= shift;
 678    }
 679    return val;
 680}
 681
 682#define NEON_FN(dest, src1, src2) do { \
 683    int8_t tmp; \
 684    tmp = (int8_t)src2; \
 685    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
 686        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
 687        dest = 0; \
 688    } else if (tmp < 0) { \
 689        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
 690    } else { \
 691        dest = src1 << tmp; \
 692    }} while (0)
 693NEON_VOP(rshl_s8, neon_s8, 4)
 694NEON_VOP(rshl_s16, neon_s16, 2)
 695#undef NEON_FN
 696
 697/* The addition of the rounding constant may overflow, so we use an
 698 * intermediate 64 bit accumulator.  */
 699uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
 700{
 701    int32_t dest;
 702    int32_t val = (int32_t)valop;
 703    int8_t shift = (int8_t)shiftop;
 704    if ((shift >= 32) || (shift <= -32)) {
 705        dest = 0;
 706    } else if (shift < 0) {
 707        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
 708        dest = big_dest >> -shift;
 709    } else {
 710        dest = val << shift;
 711    }
 712    return dest;
 713}
 714
 715/* Handling addition overflow with 64 bit input values is more
 716 * tricky than with 32 bit values.  */
 717uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
 718{
 719    int8_t shift = (int8_t)shiftop;
 720    int64_t val = valop;
 721    if ((shift >= 64) || (shift <= -64)) {
 722        val = 0;
 723    } else if (shift < 0) {
 724        val >>= (-shift - 1);
 725        if (val == INT64_MAX) {
 726            /* In this case, it means that the rounding constant is 1,
 727             * and the addition would overflow. Return the actual
 728             * result directly.  */
 729            val = 0x4000000000000000LL;
 730        } else {
 731            val++;
 732            val >>= 1;
 733        }
 734    } else {
 735        val <<= shift;
 736    }
 737    return val;
 738}
 739
 740#define NEON_FN(dest, src1, src2) do { \
 741    int8_t tmp; \
 742    tmp = (int8_t)src2; \
 743    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
 744        tmp < -(ssize_t)sizeof(src1) * 8) { \
 745        dest = 0; \
 746    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
 747        dest = src1 >> (-tmp - 1); \
 748    } else if (tmp < 0) { \
 749        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
 750    } else { \
 751        dest = src1 << tmp; \
 752    }} while (0)
 753NEON_VOP(rshl_u8, neon_u8, 4)
 754NEON_VOP(rshl_u16, neon_u16, 2)
 755#undef NEON_FN
 756
 757/* The addition of the rounding constant may overflow, so we use an
 758 * intermediate 64 bit accumulator.  */
 759uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
 760{
 761    uint32_t dest;
 762    int8_t shift = (int8_t)shiftop;
 763    if (shift >= 32 || shift < -32) {
 764        dest = 0;
 765    } else if (shift == -32) {
 766        dest = val >> 31;
 767    } else if (shift < 0) {
 768        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
 769        dest = big_dest >> -shift;
 770    } else {
 771        dest = val << shift;
 772    }
 773    return dest;
 774}
 775
 776/* Handling addition overflow with 64 bit input values is more
 777 * tricky than with 32 bit values.  */
 778uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 779{
 780    int8_t shift = (uint8_t)shiftop;
 781    if (shift >= 64 || shift < -64) {
 782        val = 0;
 783    } else if (shift == -64) {
 784        /* Rounding a 1-bit result just preserves that bit.  */
 785        val >>= 63;
 786    } else if (shift < 0) {
 787        val >>= (-shift - 1);
 788        if (val == UINT64_MAX) {
 789            /* In this case, it means that the rounding constant is 1,
 790             * and the addition would overflow. Return the actual
 791             * result directly.  */
 792            val = 0x8000000000000000ULL;
 793        } else {
 794            val++;
 795            val >>= 1;
 796        }
 797    } else {
 798        val <<= shift;
 799    }
 800    return val;
 801}
 802
 803#define NEON_FN(dest, src1, src2) do { \
 804    int8_t tmp; \
 805    tmp = (int8_t)src2; \
 806    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
 807        if (src1) { \
 808            SET_QC(); \
 809            dest = ~0; \
 810        } else { \
 811            dest = 0; \
 812        } \
 813    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
 814        dest = 0; \
 815    } else if (tmp < 0) { \
 816        dest = src1 >> -tmp; \
 817    } else { \
 818        dest = src1 << tmp; \
 819        if ((dest >> tmp) != src1) { \
 820            SET_QC(); \
 821            dest = ~0; \
 822        } \
 823    }} while (0)
 824NEON_VOP_ENV(qshl_u8, neon_u8, 4)
 825NEON_VOP_ENV(qshl_u16, neon_u16, 2)
 826NEON_VOP_ENV(qshl_u32, neon_u32, 1)
 827#undef NEON_FN
 828
 829uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
 830{
 831    int8_t shift = (int8_t)shiftop;
 832    if (shift >= 64) {
 833        if (val) {
 834            val = ~(uint64_t)0;
 835            SET_QC();
 836        }
 837    } else if (shift <= -64) {
 838        val = 0;
 839    } else if (shift < 0) {
 840        val >>= -shift;
 841    } else {
 842        uint64_t tmp = val;
 843        val <<= shift;
 844        if ((val >> shift) != tmp) {
 845            SET_QC();
 846            val = ~(uint64_t)0;
 847        }
 848    }
 849    return val;
 850}
 851
 852#define NEON_FN(dest, src1, src2) do { \
 853    int8_t tmp; \
 854    tmp = (int8_t)src2; \
 855    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
 856        if (src1) { \
 857            SET_QC(); \
 858            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
 859            if (src1 > 0) { \
 860                dest--; \
 861            } \
 862        } else { \
 863            dest = src1; \
 864        } \
 865    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
 866        dest = src1 >> 31; \
 867    } else if (tmp < 0) { \
 868        dest = src1 >> -tmp; \
 869    } else { \
 870        dest = src1 << tmp; \
 871        if ((dest >> tmp) != src1) { \
 872            SET_QC(); \
 873            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
 874            if (src1 > 0) { \
 875                dest--; \
 876            } \
 877        } \
 878    }} while (0)
 879NEON_VOP_ENV(qshl_s8, neon_s8, 4)
 880NEON_VOP_ENV(qshl_s16, neon_s16, 2)
 881NEON_VOP_ENV(qshl_s32, neon_s32, 1)
 882#undef NEON_FN
 883
 884uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
 885{
 886    int8_t shift = (uint8_t)shiftop;
 887    int64_t val = valop;
 888    if (shift >= 64) {
 889        if (val) {
 890            SET_QC();
 891            val = (val >> 63) ^ ~SIGNBIT64;
 892        }
 893    } else if (shift <= -64) {
 894        val >>= 63;
 895    } else if (shift < 0) {
 896        val >>= -shift;
 897    } else {
 898        int64_t tmp = val;
 899        val <<= shift;
 900        if ((val >> shift) != tmp) {
 901            SET_QC();
 902            val = (tmp >> 63) ^ ~SIGNBIT64;
 903        }
 904    }
 905    return val;
 906}
 907
 908#define NEON_FN(dest, src1, src2) do { \
 909    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
 910        SET_QC(); \
 911        dest = 0; \
 912    } else { \
 913        int8_t tmp; \
 914        tmp = (int8_t)src2; \
 915        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
 916            if (src1) { \
 917                SET_QC(); \
 918                dest = ~0; \
 919            } else { \
 920                dest = 0; \
 921            } \
 922        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
 923            dest = 0; \
 924        } else if (tmp < 0) { \
 925            dest = src1 >> -tmp; \
 926        } else { \
 927            dest = src1 << tmp; \
 928            if ((dest >> tmp) != src1) { \
 929                SET_QC(); \
 930                dest = ~0; \
 931            } \
 932        } \
 933    }} while (0)
 934NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
 935NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
 936#undef NEON_FN
 937
 938uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
 939{
 940    if ((int32_t)valop < 0) {
 941        SET_QC();
 942        return 0;
 943    }
 944    return helper_neon_qshl_u32(env, valop, shiftop);
 945}
 946
 947uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
 948{
 949    if ((int64_t)valop < 0) {
 950        SET_QC();
 951        return 0;
 952    }
 953    return helper_neon_qshl_u64(env, valop, shiftop);
 954}
 955
 956#define NEON_FN(dest, src1, src2) do { \
 957    int8_t tmp; \
 958    tmp = (int8_t)src2; \
 959    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
 960        if (src1) { \
 961            SET_QC(); \
 962            dest = ~0; \
 963        } else { \
 964            dest = 0; \
 965        } \
 966    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
 967        dest = 0; \
 968    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
 969        dest = src1 >> (sizeof(src1) * 8 - 1); \
 970    } else if (tmp < 0) { \
 971        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
 972    } else { \
 973        dest = src1 << tmp; \
 974        if ((dest >> tmp) != src1) { \
 975            SET_QC(); \
 976            dest = ~0; \
 977        } \
 978    }} while (0)
 979NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
 980NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
 981#undef NEON_FN
 982
 983/* The addition of the rounding constant may overflow, so we use an
 984 * intermediate 64 bit accumulator.  */
 985uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
 986{
 987    uint32_t dest;
 988    int8_t shift = (int8_t)shiftop;
 989    if (shift >= 32) {
 990        if (val) {
 991            SET_QC();
 992            dest = ~0;
 993        } else {
 994            dest = 0;
 995        }
 996    } else if (shift < -32) {
 997        dest = 0;
 998    } else if (shift == -32) {
 999        dest = val >> 31;
1000    } else if (shift < 0) {
1001        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
1002        dest = big_dest >> -shift;
1003    } else {
1004        dest = val << shift;
1005        if ((dest >> shift) != val) {
1006            SET_QC();
1007            dest = ~0;
1008        }
1009    }
1010    return dest;
1011}
1012
1013/* Handling addition overflow with 64 bit input values is more
1014 * tricky than with 32 bit values.  */
1015uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
1016{
1017    int8_t shift = (int8_t)shiftop;
1018    if (shift >= 64) {
1019        if (val) {
1020            SET_QC();
1021            val = ~0;
1022        }
1023    } else if (shift < -64) {
1024        val = 0;
1025    } else if (shift == -64) {
1026        val >>= 63;
1027    } else if (shift < 0) {
1028        val >>= (-shift - 1);
1029        if (val == UINT64_MAX) {
1030            /* In this case, it means that the rounding constant is 1,
1031             * and the addition would overflow. Return the actual
1032             * result directly.  */
1033            val = 0x8000000000000000ULL;
1034        } else {
1035            val++;
1036            val >>= 1;
1037        }
1038    } else { \
1039        uint64_t tmp = val;
1040        val <<= shift;
1041        if ((val >> shift) != tmp) {
1042            SET_QC();
1043            val = ~0;
1044        }
1045    }
1046    return val;
1047}
1048
1049#define NEON_FN(dest, src1, src2) do { \
1050    int8_t tmp; \
1051    tmp = (int8_t)src2; \
1052    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
1053        if (src1) { \
1054            SET_QC(); \
1055            dest = (typeof(dest))(1 << (sizeof(src1) * 8 - 1)); \
1056            if (src1 > 0) { \
1057                dest--; \
1058            } \
1059        } else { \
1060            dest = 0; \
1061        } \
1062    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
1063        dest = 0; \
1064    } else if (tmp < 0) { \
1065        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
1066    } else { \
1067        dest = src1 << tmp; \
1068        if ((dest >> tmp) != src1) { \
1069            SET_QC(); \
1070            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
1071            if (src1 > 0) { \
1072                dest--; \
1073            } \
1074        } \
1075    }} while (0)
1076NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
1077NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
1078#undef NEON_FN
1079
1080/* The addition of the rounding constant may overflow, so we use an
1081 * intermediate 64 bit accumulator.  */
1082uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
1083{
1084    int32_t dest;
1085    int32_t val = (int32_t)valop;
1086    int8_t shift = (int8_t)shiftop;
1087    if (shift >= 32) {
1088        if (val) {
1089            SET_QC();
1090            dest = (val >> 31) ^ ~SIGNBIT;
1091        } else {
1092            dest = 0;
1093        }
1094    } else if (shift <= -32) {
1095        dest = 0;
1096    } else if (shift < 0) {
1097        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
1098        dest = big_dest >> -shift;
1099    } else {
1100        dest = val << shift;
1101        if ((dest >> shift) != val) {
1102            SET_QC();
1103            dest = (val >> 31) ^ ~SIGNBIT;
1104        }
1105    }
1106    return dest;
1107}
1108
1109/* Handling addition overflow with 64 bit input values is more
1110 * tricky than with 32 bit values.  */
1111uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
1112{
1113    int8_t shift = (uint8_t)shiftop;
1114    int64_t val = valop;
1115
1116    if (shift >= 64) {
1117        if (val) {
1118            SET_QC();
1119            val = (val >> 63) ^ ~SIGNBIT64;
1120        }
1121    } else if (shift <= -64) {
1122        val = 0;
1123    } else if (shift < 0) {
1124        val >>= (-shift - 1);
1125        if (val == INT64_MAX) {
1126            /* In this case, it means that the rounding constant is 1,
1127             * and the addition would overflow. Return the actual
1128             * result directly.  */
1129            val = 0x4000000000000000ULL;
1130        } else {
1131            val++;
1132            val >>= 1;
1133        }
1134    } else {
1135        int64_t tmp = val;
1136        val <<= shift;
1137        if ((val >> shift) != tmp) {
1138            SET_QC();
1139            val = (tmp >> 63) ^ ~SIGNBIT64;
1140        }
1141    }
1142    return val;
1143}
1144
1145uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
1146{
1147    uint32_t mask;
1148    mask = (a ^ b) & 0x80808080u;
1149    a &= ~0x80808080u;
1150    b &= ~0x80808080u;
1151    return (a + b) ^ mask;
1152}
1153
1154uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
1155{
1156    uint32_t mask;
1157    mask = (a ^ b) & 0x80008000u;
1158    a &= ~0x80008000u;
1159    b &= ~0x80008000u;
1160    return (a + b) ^ mask;
1161}
1162
1163#define NEON_FN(dest, src1, src2) dest = src1 + src2
1164NEON_POP(padd_u8, neon_u8, 4)
1165NEON_POP(padd_u16, neon_u16, 2)
1166#undef NEON_FN
1167
1168#define NEON_FN(dest, src1, src2) dest = src1 - src2
1169NEON_VOP(sub_u8, neon_u8, 4)
1170NEON_VOP(sub_u16, neon_u16, 2)
1171#undef NEON_FN
1172
1173#define NEON_FN(dest, src1, src2) dest = src1 * src2
1174NEON_VOP(mul_u8, neon_u8, 4)
1175NEON_VOP(mul_u16, neon_u16, 2)
1176#undef NEON_FN
1177
1178/* Polynomial multiplication is like integer multiplication except the
1179   partial products are XORed, not added.  */
1180uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
1181{
1182    uint32_t mask;
1183    uint32_t result;
1184    result = 0;
1185    while (op1) {
1186        mask = 0;
1187        if (op1 & 1)
1188            mask |= 0xff;
1189        if (op1 & (1 << 8))
1190            mask |= (0xff << 8);
1191        if (op1 & (1 << 16))
1192            mask |= (0xff << 16);
1193        if (op1 & (1 << 24))
1194            mask |= (0xff << 24);
1195        result ^= op2 & mask;
1196        op1 = (op1 >> 1) & 0x7f7f7f7f;
1197        op2 = (op2 << 1) & 0xfefefefe;
1198    }
1199    return result;
1200}
1201
1202uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1203{
1204    uint64_t result = 0;
1205    uint64_t mask;
1206    uint64_t op2ex = op2;
1207    op2ex = (op2ex & 0xff) |
1208        ((op2ex & 0xff00) << 8) |
1209        ((op2ex & 0xff0000) << 16) |
1210        ((op2ex & 0xff000000) << 24);
1211    while (op1) {
1212        mask = 0;
1213        if (op1 & 1) {
1214            mask |= 0xffff;
1215        }
1216        if (op1 & (1 << 8)) {
1217            mask |= (0xffffU << 16);
1218        }
1219        if (op1 & (1 << 16)) {
1220            mask |= (0xffffULL << 32);
1221        }
1222        if (op1 & (1 << 24)) {
1223            mask |= (0xffffULL << 48);
1224        }
1225        result ^= op2ex & mask;
1226        op1 = (op1 >> 1) & 0x7f7f7f7f;
1227        op2ex <<= 1;
1228    }
1229    return result;
1230}
1231
1232#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
1233NEON_VOP(tst_u8, neon_u8, 4)
1234NEON_VOP(tst_u16, neon_u16, 2)
1235NEON_VOP(tst_u32, neon_u32, 1)
1236#undef NEON_FN
1237
1238#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
1239NEON_VOP(ceq_u8, neon_u8, 4)
1240NEON_VOP(ceq_u16, neon_u16, 2)
1241NEON_VOP(ceq_u32, neon_u32, 1)
1242#undef NEON_FN
1243
1244#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1245NEON_VOP1(abs_s8, neon_s8, 4)
1246NEON_VOP1(abs_s16, neon_s16, 2)
1247#undef NEON_FN
1248
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in an 8 bit value (8 for zero).  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every shift that leaves x non-zero means one fewer leading zero.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1257
/* Number of leading zero bits in a 16 bit value (16 for zero).  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Every shift that leaves x non-zero means one fewer leading zero.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1265
1266#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
1267NEON_VOP1(clz_u8, neon_u8, 4)
1268#undef NEON_FN
1269
1270#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
1271NEON_VOP1(clz_u16, neon_u16, 2)
1272#undef NEON_FN
1273
1274#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
1275NEON_VOP1(cls_s8, neon_s8, 4)
1276#undef NEON_FN
1277
1278#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
1279NEON_VOP1(cls_s16, neon_s16, 2)
1280#undef NEON_FN
1281
1282uint32_t HELPER(neon_cls_s32)(uint32_t x)
1283{
1284    int count;
1285    if ((int32_t)x < 0)
1286        x = ~x;
1287    for (count = 32; x; count--)
1288        x = x >> 1;
1289    return count - 1;
1290}
1291
1292/* Bit count.  */
1293uint32_t HELPER(neon_cnt_u8)(uint32_t x)
1294{
1295    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
1296    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
1297    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
1298    return x;
1299}
1300
1301/* Reverse bits in each 8 bit word */
1302uint32_t HELPER(neon_rbit_u8)(uint32_t x)
1303{
1304    x =  ((x & 0xf0f0f0f0) >> 4)
1305       | ((x & 0x0f0f0f0f) << 4);
1306    x =  ((x & 0x88888888) >> 3)
1307       | ((x & 0x44444444) >> 1)
1308       | ((x & 0x22222222) << 1)
1309       | ((x & 0x11111111) << 3);
1310    return x;
1311}
1312
1313#define NEON_QDMULH16(dest, src1, src2, round) do { \
1314    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
1315    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
1316        SET_QC(); \
1317        tmp = (tmp >> 31) ^ ~SIGNBIT; \
1318    } else { \
1319        tmp <<= 1; \
1320    } \
1321    if (round) { \
1322        int32_t old = tmp; \
1323        tmp += 1 << 15; \
1324        if ((int32_t)tmp < old) { \
1325            SET_QC(); \
1326            tmp = SIGNBIT - 1; \
1327        } \
1328    } \
1329    dest = tmp >> 16; \
1330    } while(0)
1331#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
1332NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
1333#undef NEON_FN
1334#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
1335NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
1336#undef NEON_FN
1337#undef NEON_QDMULH16
1338
1339#define NEON_QDMULH32(dest, src1, src2, round) do { \
1340    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1341    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1342        SET_QC(); \
1343        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1344    } else { \
1345        tmp <<= 1; \
1346    } \
1347    if (round) { \
1348        int64_t old = tmp; \
1349        tmp += (int64_t)1 << 31; \
1350        if ((int64_t)tmp < old) { \
1351            SET_QC(); \
1352            tmp = SIGNBIT64 - 1; \
1353        } \
1354    } \
1355    dest = tmp >> 32; \
1356    } while(0)
1357#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1358NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
1359#undef NEON_FN
1360#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1361NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
1362#undef NEON_FN
1363#undef NEON_QDMULH32
1364
1365uint32_t HELPER(neon_narrow_u8)(uint64_t x)
1366{
1367    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
1368           | ((x >> 24) & 0xff000000u);
1369}
1370
1371uint32_t HELPER(neon_narrow_u16)(uint64_t x)
1372{
1373    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
1374}
1375
1376uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
1377{
1378    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1379            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1380}
1381
1382uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
1383{
1384    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1385}
1386
1387uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
1388{
1389    x &= 0xff80ff80ff80ff80ull;
1390    x += 0x0080008000800080ull;
1391    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1392            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1393}
1394
1395uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
1396{
1397    x &= 0xffff8000ffff8000ull;
1398    x += 0x0000800000008000ull;
1399    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1400}
1401
1402uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
1403{
1404    uint16_t s;
1405    uint8_t d;
1406    uint32_t res = 0;
1407#define SAT8(n) \
1408    s = x >> n; \
1409    if (s & 0x8000) { \
1410        SET_QC(); \
1411    } else { \
1412        if (s > 0xff) { \
1413            d = 0xff; \
1414            SET_QC(); \
1415        } else  { \
1416            d = s; \
1417        } \
1418        res |= (uint32_t)d << (n / 2); \
1419    }
1420
1421    SAT8(0);
1422    SAT8(16);
1423    SAT8(32);
1424    SAT8(48);
1425#undef SAT8
1426    return res;
1427}
1428
1429uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
1430{
1431    uint16_t s;
1432    uint8_t d;
1433    uint32_t res = 0;
1434#define SAT8(n) \
1435    s = x >> n; \
1436    if (s > 0xff) { \
1437        d = 0xff; \
1438        SET_QC(); \
1439    } else  { \
1440        d = s; \
1441    } \
1442    res |= (uint32_t)d << (n / 2);
1443
1444    SAT8(0);
1445    SAT8(16);
1446    SAT8(32);
1447    SAT8(48);
1448#undef SAT8
1449    return res;
1450}
1451
1452uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
1453{
1454    int16_t s;
1455    uint8_t d;
1456    uint32_t res = 0;
1457#define SAT8(n) \
1458    s = x >> n; \
1459    if (s != (int8_t)s) { \
1460        d = (s >> 15) ^ 0x7f; \
1461        SET_QC(); \
1462    } else  { \
1463        d = s; \
1464    } \
1465    res |= (uint32_t)d << (n / 2);
1466
1467    SAT8(0);
1468    SAT8(16);
1469    SAT8(32);
1470    SAT8(48);
1471#undef SAT8
1472    return res;
1473}
1474
1475uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
1476{
1477    uint32_t high;
1478    uint32_t low;
1479    low = x;
1480    if (low & 0x80000000) {
1481        low = 0;
1482        SET_QC();
1483    } else if (low > 0xffff) {
1484        low = 0xffff;
1485        SET_QC();
1486    }
1487    high = x >> 32;
1488    if (high & 0x80000000) {
1489        high = 0;
1490        SET_QC();
1491    } else if (high > 0xffff) {
1492        high = 0xffff;
1493        SET_QC();
1494    }
1495    return low | (high << 16);
1496}
1497
1498uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
1499{
1500    uint32_t high;
1501    uint32_t low;
1502    low = x;
1503    if (low > 0xffff) {
1504        low = 0xffff;
1505        SET_QC();
1506    }
1507    high = x >> 32;
1508    if (high > 0xffff) {
1509        high = 0xffff;
1510        SET_QC();
1511    }
1512    return low | (high << 16);
1513}
1514
1515uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
1516{
1517    int32_t low;
1518    int32_t high;
1519    low = x;
1520    if (low != (int16_t)low) {
1521        low = (low >> 31) ^ 0x7fff;
1522        SET_QC();
1523    }
1524    high = x >> 32;
1525    if (high != (int16_t)high) {
1526        high = (high >> 31) ^ 0x7fff;
1527        SET_QC();
1528    }
1529    return (uint16_t)low | (high << 16);
1530}
1531
1532uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
1533{
1534    if (x & 0x8000000000000000ull) {
1535        SET_QC();
1536        return 0;
1537    }
1538    if (x > 0xffffffffu) {
1539        SET_QC();
1540        return 0xffffffffu;
1541    }
1542    return x;
1543}
1544
1545uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
1546{
1547    if (x > 0xffffffffu) {
1548        SET_QC();
1549        return 0xffffffffu;
1550    }
1551    return x;
1552}
1553
1554uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
1555{
1556    if ((int64_t)x != (int32_t)x) {
1557        SET_QC();
1558        return ((int64_t)x >> 63) ^ 0x7fffffff;
1559    }
1560    return x;
1561}
1562
1563uint64_t HELPER(neon_widen_u8)(uint32_t x)
1564{
1565    uint64_t tmp;
1566    uint64_t ret;
1567    ret = (uint8_t)x;
1568    tmp = (uint8_t)(x >> 8);
1569    ret |= tmp << 16;
1570    tmp = (uint8_t)(x >> 16);
1571    ret |= tmp << 32;
1572    tmp = (uint8_t)(x >> 24);
1573    ret |= tmp << 48;
1574    return ret;
1575}
1576
1577uint64_t HELPER(neon_widen_s8)(uint32_t x)
1578{
1579    uint64_t tmp;
1580    uint64_t ret;
1581    ret = (uint16_t)(int8_t)x;
1582    tmp = (uint16_t)(int8_t)(x >> 8);
1583    ret |= tmp << 16;
1584    tmp = (uint16_t)(int8_t)(x >> 16);
1585    ret |= tmp << 32;
1586    tmp = (uint16_t)(int8_t)(x >> 24);
1587    ret |= tmp << 48;
1588    return ret;
1589}
1590
1591uint64_t HELPER(neon_widen_u16)(uint32_t x)
1592{
1593    uint64_t high = (uint16_t)(x >> 16);
1594    return ((uint16_t)x) | (high << 32);
1595}
1596
1597uint64_t HELPER(neon_widen_s16)(uint32_t x)
1598{
1599    uint64_t high = (int16_t)(x >> 16);
1600    return ((uint32_t)(int16_t)x) | (high << 32);
1601}
1602
1603uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1604{
1605    uint64_t mask;
1606    mask = (a ^ b) & 0x8000800080008000ull;
1607    a &= ~0x8000800080008000ull;
1608    b &= ~0x8000800080008000ull;
1609    return (a + b) ^ mask;
1610}
1611
1612uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1613{
1614    uint64_t mask;
1615    mask = (a ^ b) & 0x8000000080000000ull;
1616    a &= ~0x8000000080000000ull;
1617    b &= ~0x8000000080000000ull;
1618    return (a + b) ^ mask;
1619}
1620
1621uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1622{
1623    uint64_t tmp;
1624    uint64_t tmp2;
1625
1626    tmp = a & 0x0000ffff0000ffffull;
1627    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1628    tmp2 = b & 0xffff0000ffff0000ull;
1629    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1630    return    ( tmp         & 0xffff)
1631            | ((tmp  >> 16) & 0xffff0000ull)
1632            | ((tmp2 << 16) & 0xffff00000000ull)
1633            | ( tmp2        & 0xffff000000000000ull);
1634}
1635
1636uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1637{
1638    uint32_t low = a + (a >> 32);
1639    uint32_t high = b + (b >> 32);
1640    return low + ((uint64_t)high << 32);
1641}
1642
1643uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1644{
1645    uint64_t mask;
1646    mask = (a ^ ~b) & 0x8000800080008000ull;
1647    a |= 0x8000800080008000ull;
1648    b &= ~0x8000800080008000ull;
1649    return (a - b) ^ mask;
1650}
1651
1652uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1653{
1654    uint64_t mask;
1655    mask = (a ^ ~b) & 0x8000000080000000ull;
1656    a |= 0x8000000080000000ull;
1657    b &= ~0x8000000080000000ull;
1658    return (a - b) ^ mask;
1659}
1660
1661uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
1662{
1663    uint32_t x, y;
1664    uint32_t low, high;
1665
1666    x = a;
1667    y = b;
1668    low = x + y;
1669    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1670        SET_QC();
1671        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
1672    }
1673    x = a >> 32;
1674    y = b >> 32;
1675    high = x + y;
1676    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1677        SET_QC();
1678        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
1679    }
1680    return low | ((uint64_t)high << 32);
1681}
1682
1683uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
1684{
1685    uint64_t result;
1686
1687    result = a + b;
1688    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
1689        SET_QC();
1690        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
1691    }
1692    return result;
1693}
1694
/* Absolute difference of x and y, each first truncated to intype.
 *
 * We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype abd_lhs = (intype)(x);                          \
    arithtype abd_rhs = (intype)(y);                          \
    if (abd_lhs > abd_rhs) {                                  \
        dest = abd_lhs - abd_rhs;                             \
    } else {                                                  \
        dest = abd_rhs - abd_lhs;                             \
    }                                                         \
    } while(0)
1704
1705uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1706{
1707    uint64_t tmp;
1708    uint64_t result;
1709    DO_ABD(result, a, b, uint8_t, uint32_t);
1710    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
1711    result |= tmp << 16;
1712    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
1713    result |= tmp << 32;
1714    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
1715    result |= tmp << 48;
1716    return result;
1717}
1718
1719uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1720{
1721    uint64_t tmp;
1722    uint64_t result;
1723    DO_ABD(result, a, b, int8_t, int32_t);
1724    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
1725    result |= tmp << 16;
1726    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
1727    result |= tmp << 32;
1728    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
1729    result |= tmp << 48;
1730    return result;
1731}
1732
1733uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1734{
1735    uint64_t tmp;
1736    uint64_t result;
1737    DO_ABD(result, a, b, uint16_t, uint32_t);
1738    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1739    return result | (tmp << 32);
1740}
1741
1742uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1743{
1744    uint64_t tmp;
1745    uint64_t result;
1746    DO_ABD(result, a, b, int16_t, int32_t);
1747    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1748    return result | (tmp << 32);
1749}
1750
1751uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1752{
1753    uint64_t result;
1754    DO_ABD(result, a, b, uint32_t, uint64_t);
1755    return result;
1756}
1757
1758uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1759{
1760    uint64_t result;
1761    DO_ABD(result, a, b, int32_t, int64_t);
1762    return result;
1763}
1764#undef DO_ABD
1765
/* Widening multiply. Named type is the source type.
 *
 * x and y are truncated to type1, converted to type2 and multiplied; the
 * product is truncated to type2 and stored in dest.  The multiplication
 * itself is carried out in uint64_t: with type2 == uint16_t the operands
 * would otherwise promote to int, and 0xffff * 0xffff overflows a signed
 * int, which is undefined behaviour.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((uint64_t)(type2)tmp_x * (type2)tmp_y); \
    } while(0)
1772
1773uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1774{
1775    uint64_t tmp;
1776    uint64_t result;
1777
1778    DO_MULL(result, a, b, uint8_t, uint16_t);
1779    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1780    result |= tmp << 16;
1781    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1782    result |= tmp << 32;
1783    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1784    result |= tmp << 48;
1785    return result;
1786}
1787
1788uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1789{
1790    uint64_t tmp;
1791    uint64_t result;
1792
1793    DO_MULL(result, a, b, int8_t, uint16_t);
1794    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1795    result |= tmp << 16;
1796    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1797    result |= tmp << 32;
1798    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1799    result |= tmp << 48;
1800    return result;
1801}
1802
1803uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1804{
1805    uint64_t tmp;
1806    uint64_t result;
1807
1808    DO_MULL(result, a, b, uint16_t, uint32_t);
1809    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1810    return result | (tmp << 32);
1811}
1812
1813uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1814{
1815    uint64_t tmp;
1816    uint64_t result;
1817
1818    DO_MULL(result, a, b, int16_t, uint32_t);
1819    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1820    return result | (tmp << 32);
1821}
1822
1823uint64_t HELPER(neon_negl_u16)(uint64_t x)
1824{
1825    uint16_t tmp;
1826    uint64_t result;
1827    result = (uint16_t)-x;
1828    tmp = -(x >> 16);
1829    result |= (uint64_t)tmp << 16;
1830    tmp = -(x >> 32);
1831    result |= (uint64_t)tmp << 32;
1832    tmp = -(x >> 48);
1833    result |= (uint64_t)tmp << 48;
1834    return result;
1835}
1836
1837uint64_t HELPER(neon_negl_u32)(uint64_t x)
1838{
1839    uint32_t low = -x;
1840    uint32_t high = -(x >> 32);
1841    return low | ((uint64_t)high << 32);
1842}
1843
1844/* Saturating sign manipulation.  */
1845/* ??? Make these use NEON_VOP1 */
1846#define DO_QABS8(x) do { \
1847    if (x == (int8_t)0x80) { \
1848        x = 0x7f; \
1849        SET_QC(); \
1850    } else if (x < 0) { \
1851        x = -x; \
1852    }} while (0)
1853uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1854{
1855    neon_s8 vec;
1856    NEON_UNPACK(neon_s8, vec, x);
1857    DO_QABS8(vec.v1);
1858    DO_QABS8(vec.v2);
1859    DO_QABS8(vec.v3);
1860    DO_QABS8(vec.v4);
1861    NEON_PACK(neon_s8, x, vec);
1862    return x;
1863}
1864#undef DO_QABS8
1865
1866#define DO_QNEG8(x) do { \
1867    if (x == (int8_t)0x80) { \
1868        x = 0x7f; \
1869        SET_QC(); \
1870    } else { \
1871        x = -x; \
1872    }} while (0)
1873uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1874{
1875    neon_s8 vec;
1876    NEON_UNPACK(neon_s8, vec, x);
1877    DO_QNEG8(vec.v1);
1878    DO_QNEG8(vec.v2);
1879    DO_QNEG8(vec.v3);
1880    DO_QNEG8(vec.v4);
1881    NEON_PACK(neon_s8, x, vec);
1882    return x;
1883}
1884#undef DO_QNEG8
1885
1886#define DO_QABS16(x) do { \
1887    if (x == (int16_t)0x8000) { \
1888        x = 0x7fff; \
1889        SET_QC(); \
1890    } else if (x < 0) { \
1891        x = -x; \
1892    }} while (0)
1893uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1894{
1895    neon_s16 vec;
1896    NEON_UNPACK(neon_s16, vec, x);
1897    DO_QABS16(vec.v1);
1898    DO_QABS16(vec.v2);
1899    NEON_PACK(neon_s16, x, vec);
1900    return x;
1901}
1902#undef DO_QABS16
1903
1904#define DO_QNEG16(x) do { \
1905    if (x == (int16_t)0x8000) { \
1906        x = 0x7fff; \
1907        SET_QC(); \
1908    } else { \
1909        x = -x; \
1910    }} while (0)
1911uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1912{
1913    neon_s16 vec;
1914    NEON_UNPACK(neon_s16, vec, x);
1915    DO_QNEG16(vec.v1);
1916    DO_QNEG16(vec.v2);
1917    NEON_PACK(neon_s16, x, vec);
1918    return x;
1919}
1920#undef DO_QNEG16
1921
1922uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1923{
1924    if (x == SIGNBIT) {
1925        SET_QC();
1926        x = ~SIGNBIT;
1927    } else if ((int32_t)x < 0) {
1928        x = -x;
1929    }
1930    return x;
1931}
1932
1933uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1934{
1935    if (x == SIGNBIT) {
1936        SET_QC();
1937        x = ~SIGNBIT;
1938    } else {
1939        x = -x;
1940    }
1941    return x;
1942}
1943
1944uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1945{
1946    if (x == SIGNBIT64) {
1947        SET_QC();
1948        x = ~SIGNBIT64;
1949    } else if ((int64_t)x < 0) {
1950        x = -x;
1951    }
1952    return x;
1953}
1954
1955uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1956{
1957    if (x == SIGNBIT64) {
1958        SET_QC();
1959        x = ~SIGNBIT64;
1960    } else {
1961        x = -x;
1962    }
1963    return x;
1964}
1965
1966/* NEON Float helpers.  */
1967uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
1968{
1969    float_status *fpst = fpstp;
1970    float32 f0 = make_float32(a);
1971    float32 f1 = make_float32(b);
1972    return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
1973}
1974
1975/* Floating point comparisons produce an integer result.
1976 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1977 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1978 */
1979uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1980{
1981    float_status *fpst = fpstp;
1982    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1983}
1984
1985uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1986{
1987    float_status *fpst = fpstp;
1988    return -float32_le(make_float32(b), make_float32(a), fpst);
1989}
1990
1991uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1992{
1993    float_status *fpst = fpstp;
1994    return -float32_lt(make_float32(b), make_float32(a), fpst);
1995}
1996
1997uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1998{
1999    float_status *fpst = fpstp;
2000    float32 f0 = float32_abs(make_float32(a));
2001    float32 f1 = float32_abs(make_float32(b));
2002    return -float32_le(f1, f0, fpst);
2003}
2004
2005uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
2006{
2007    float_status *fpst = fpstp;
2008    float32 f0 = float32_abs(make_float32(a));
2009    float32 f1 = float32_abs(make_float32(b));
2010    return -float32_lt(f1, f0, fpst);
2011}
2012
2013uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
2014{
2015    float_status *fpst = fpstp;
2016    float64 f0 = float64_abs(make_float64(a));
2017    float64 f1 = float64_abs(make_float64(b));
2018    return -float64_le(f1, f0, fpst);
2019}
2020
2021uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
2022{
2023    float_status *fpst = fpstp;
2024    float64 f0 = float64_abs(make_float64(a));
2025    float64 f1 = float64_abs(make_float64(b));
2026    return -float64_lt(f1, f0, fpst);
2027}
2028
/* Extract element N (counting from the least significant end) of SIZE
   bits from V.  SIZE must be smaller than 64.  */
#define ELEM(V, N, SIZE) \
    (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
2030
2031void HELPER(neon_qunzip8)(void *vd, void *vm)
2032{
2033    uint64_t *rd = vd, *rm = vm;
2034    uint64_t zd0 = rd[0], zd1 = rd[1];
2035    uint64_t zm0 = rm[0], zm1 = rm[1];
2036
2037    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
2038        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
2039        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
2040        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
2041    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
2042        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
2043        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
2044        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
2045    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
2046        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
2047        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
2048        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
2049    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
2050        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
2051        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
2052        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
2053
2054    rm[0] = m0;
2055    rm[1] = m1;
2056    rd[0] = d0;
2057    rd[1] = d1;
2058}
2059
2060void HELPER(neon_qunzip16)(void *vd, void *vm)
2061{
2062    uint64_t *rd = vd, *rm = vm;
2063    uint64_t zd0 = rd[0], zd1 = rd[1];
2064    uint64_t zm0 = rm[0], zm1 = rm[1];
2065
2066    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
2067        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
2068    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
2069        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
2070    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
2071        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
2072    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
2073        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
2074
2075    rm[0] = m0;
2076    rm[1] = m1;
2077    rd[0] = d0;
2078    rd[1] = d1;
2079}
2080
2081void HELPER(neon_qunzip32)(void *vd, void *vm)
2082{
2083    uint64_t *rd = vd, *rm = vm;
2084    uint64_t zd0 = rd[0], zd1 = rd[1];
2085    uint64_t zm0 = rm[0], zm1 = rm[1];
2086
2087    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
2088    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
2089    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
2090    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
2091
2092    rm[0] = m0;
2093    rm[1] = m1;
2094    rd[0] = d0;
2095    rd[1] = d1;
2096}
2097
2098void HELPER(neon_unzip8)(void *vd, void *vm)
2099{
2100    uint64_t *rd = vd, *rm = vm;
2101    uint64_t zd = rd[0], zm = rm[0];
2102
2103    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
2104        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
2105        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
2106        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
2107    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
2108        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
2109        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
2110        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
2111
2112    rm[0] = m0;
2113    rd[0] = d0;
2114}
2115
2116void HELPER(neon_unzip16)(void *vd, void *vm)
2117{
2118    uint64_t *rd = vd, *rm = vm;
2119    uint64_t zd = rd[0], zm = rm[0];
2120
2121    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
2122        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
2123    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
2124        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
2125
2126    rm[0] = m0;
2127    rd[0] = d0;
2128}
2129
2130void HELPER(neon_qzip8)(void *vd, void *vm)
2131{
2132    uint64_t *rd = vd, *rm = vm;
2133    uint64_t zd0 = rd[0], zd1 = rd[1];
2134    uint64_t zm0 = rm[0], zm1 = rm[1];
2135
2136    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
2137        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
2138        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
2139        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
2140    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
2141        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
2142        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
2143        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
2144    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
2145        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
2146        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
2147        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
2148    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
2149        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
2150        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
2151        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
2152
2153    rm[0] = m0;
2154    rm[1] = m1;
2155    rd[0] = d0;
2156    rd[1] = d1;
2157}
2158
2159void HELPER(neon_qzip16)(void *vd, void *vm)
2160{
2161    uint64_t *rd = vd, *rm = vm;
2162    uint64_t zd0 = rd[0], zd1 = rd[1];
2163    uint64_t zm0 = rm[0], zm1 = rm[1];
2164
2165    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
2166        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
2167    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
2168        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
2169    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
2170        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
2171    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
2172        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
2173
2174    rm[0] = m0;
2175    rm[1] = m1;
2176    rd[0] = d0;
2177    rd[1] = d1;
2178}
2179
2180void HELPER(neon_qzip32)(void *vd, void *vm)
2181{
2182    uint64_t *rd = vd, *rm = vm;
2183    uint64_t zd0 = rd[0], zd1 = rd[1];
2184    uint64_t zm0 = rm[0], zm1 = rm[1];
2185
2186    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
2187    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
2188    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
2189    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
2190
2191    rm[0] = m0;
2192    rm[1] = m1;
2193    rd[0] = d0;
2194    rd[1] = d1;
2195}
2196
2197void HELPER(neon_zip8)(void *vd, void *vm)
2198{
2199    uint64_t *rd = vd, *rm = vm;
2200    uint64_t zd = rd[0], zm = rm[0];
2201
2202    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
2203        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
2204        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
2205        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
2206    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
2207        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
2208        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
2209        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
2210
2211    rm[0] = m0;
2212    rd[0] = d0;
2213}
2214
2215void HELPER(neon_zip16)(void *vd, void *vm)
2216{
2217    uint64_t *rd = vd, *rm = vm;
2218    uint64_t zd = rd[0], zm = rm[0];
2219
2220    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
2221        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
2222    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
2223        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
2224
2225    rm[0] = m0;
2226    rd[0] = d0;
2227}
2228
2229/* Helper function for 64 bit polynomial multiply case:
2230 * perform PolynomialMult(op1, op2) and return either the top or
2231 * bottom half of the 128 bit result.
2232 */
2233uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
2234{
2235    int bitnum;
2236    uint64_t res = 0;
2237
2238    for (bitnum = 0; bitnum < 64; bitnum++) {
2239        if (op1 & (1ULL << bitnum)) {
2240            res ^= op2 << bitnum;
2241        }
2242    }
2243    return res;
2244}
2245uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
2246{
2247    int bitnum;
2248    uint64_t res = 0;
2249
2250    /* bit 0 of op1 can't influence the high 64 bits at all */
2251    for (bitnum = 1; bitnum < 64; bitnum++) {
2252        if (op1 & (1ULL << bitnum)) {
2253            res ^= op2 >> (64 - bitnum);
2254        }
2255    }
2256    return res;
2257}
2258