qemu/target-arm/neon_helper.c
/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include "qemu/osdep.h"

#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
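
/* Illustration (assuming a little-endian host, i.e. the #else branch of
 * the type definitions above): unpacking the word 0x04030201 into a
 * neon_u8 gives v1 = 0x01, v2 = 0x02, v3 = 0x03, v4 = 0x04, so v1 is
 * always the least significant lane. NEON_PACK is the inverse copy
 * through the same union:
 *
 *     neon_u8 vec;
 *     uint32_t out;
 *     NEON_UNPACK(neon_u8, vec, 0x04030201);  // vec.v1 == 0x01 .. vec.v4 == 0x04
 *     NEON_PACK(neon_u8, out, vec);           // out == 0x04030201
 */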

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
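
/* As a rough sketch of what these generate (not an exact preprocessor
 * trace), NEON_VOP(hadd_u8, neon_u8, 4) used further below expands to a
 * helper that applies the current NEON_FN to each byte lane:
 *
 *     uint32_t HELPER(neon_hadd_u8)(uint32_t arg1, uint32_t arg2)
 *     {
 *         uint32_t res;
 *         neon_u8 vsrc1, vsrc2, vdest;
 *         NEON_UNPACK(neon_u8, vsrc1, arg1);
 *         NEON_UNPACK(neon_u8, vsrc2, arg2);
 *         NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
 *         ...and likewise for lanes v2..v4...
 *         NEON_PACK(neon_u8, res, vdest);
 *         return res;
 *     }
 */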

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
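
/* Pairwise illustration: with NEON_FN set to addition, NEON_POP(padd_u8,
 * neon_u8, 4) further below combines adjacent lanes of the concatenated
 * operands:
 *
 *     dest.v1 = src1.v1 + src1.v2;   dest.v3 = src2.v1 + src2.v2;
 *     dest.v2 = src1.v3 + src1.v4;   dest.v4 = src2.v3 + src2.v4;
 *
 * so (on a little-endian host) neon_padd_u8(0x04030201, 0x40302010)
 * returns 0x70300703: the lane sums 0x01+0x02, 0x03+0x04, 0x10+0x20
 * and 0x30+0x40.
 */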

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}
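
/* Worked example: in a byte lane of neon_qadd_u8, 0xF0 + 0x20 = 0x110
 * no longer fits in uint8_t, so the lane saturates to 0xFF and QC is
 * set, while 0x70 + 0x20 = 0x90 is stored unchanged. The u32/u64
 * variants catch the same condition via the wrap-around test
 * (res < a).
 */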

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
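
/* Worked example: in a signed byte lane of neon_qadd_s8,
 * 0x70 + 0x30 = 160 exceeds INT8_MAX and saturates to 0x7F with QC
 * set, while 0x90 + 0x80 (-112 + -128) underflows and saturates to
 * 0x80. The s32/s64 variants read this off the sign bits alone:
 * overflow is only possible when the operands have the same sign and
 * the raw sum's sign differs from them.
 */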

/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as an unsigned value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers va,
 * vb and vr for the interim calculation and an unsigned 32 bit result
 * value r.
 */

#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits);                                \
        vb = extract32(b, shift, bits);                                 \
        vr = va + vb;                                                   \
        if (vr > UINT##bits##_MAX) {                                    \
            SET_QC();                                                   \
            vr = UINT##bits##_MAX;                                      \
        } else if (vr < 0) {                                            \
            SET_QC();                                                   \
            vr = 0;                                                     \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)

uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int16_t va, vb, vr;
    uint32_t r = 0;

    USATACC(8, 0);
    USATACC(8, 8);
    USATACC(8, 16);
    USATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    uint32_t r = 0;

    USATACC(16, 0);
    USATACC(16, 16);
    return r;
}

#undef USATACC

uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t va = (int32_t)a;
    int64_t vb = (uint32_t)b;
    int64_t vr = va + vb;
    if (vr > UINT32_MAX) {
        SET_QC();
        vr = UINT32_MAX;
    } else if (vr < 0) {
        SET_QC();
        vr = 0;
    }
    return vr;
}

uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect
     * +ve/-ve saturation
     */
    if (~a & b & ~res & SIGNBIT64) {
        SET_QC();
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {
        SET_QC();
        res = 0;
    }
    return res;
}
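
/* Worked example: in neon_uqadd_s8, a signed lane of -1 (0xFF) added
 * to an unsigned lane of 0x00 gives vr = -1, clamped to 0 with QC set;
 * -1 added to 0x80 gives 0x7F with no saturation. The 64-bit variant
 * reads the same cases off the sign bits: ~a & b & ~res is a carry out
 * with a non-negative signed addend (clamp to UINT64_MAX), and
 * a & ~b & res is a negative addend whose magnitude exceeds b
 * (clamp to 0).
 */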

/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers va,
 * vb and vr for the interim calculation and an unsigned 32 bit result
 * value r.
 */

#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits);                                 \
        vb = sextract32(b, shift, bits);                                \
        vr = va + vb;                                                   \
        if (vr > INT##bits##_MAX) {                                     \
            SET_QC();                                                   \
            vr = INT##bits##_MAX;                                       \
        } else if (vr < INT##bits##_MIN) {                              \
            SET_QC();                                                   \
            vr = INT##bits##_MIN;                                       \
        }                                                               \
        r = deposit32(r, shift, bits, vr);                              \
    } while (0)

uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int16_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(8, 0);
    SSATACC(8, 8);
    SSATACC(8, 16);
    SSATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(16, 0);
    SSATACC(16, 16);

    return r;
}

#undef SSATACC

uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t res;
    int64_t op1 = (uint32_t)a;
    int64_t op2 = (int32_t)b;
    res = op1 + op2;
    if (res > INT32_MAX) {
        SET_QC();
        res = INT32_MAX;
    } else if (res < INT32_MIN) {
        SET_QC();
        res = INT32_MIN;
    }
    return res;
}

uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    if (((a & res)
         | (~b & res)
         | (a & ~b)) & SIGNBIT64) {
        SET_QC();
        res = INT64_MAX;
    }
    return res;
}


#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}
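
/* Worked example: computing (src1 >> 1) + (src2 >> 1) and adding back
 * the carry when both discarded low bits are set halves the sum
 * without needing a wider intermediate:
 * neon_hadd_u32(0xFFFFFFFF, 0xFFFFFFFF) = 0x7FFFFFFF + 0x7FFFFFFF + 1
 * = 0xFFFFFFFF, where (src1 + src2) >> 1 would have wrapped first.
 * The rounding variants below add the carry when either low bit is set.
 */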

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_u8, neon_u8, 4)
NEON_VOP(shl_u16, neon_u16, 2)
NEON_VOP(shl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64 || shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}
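
/* Shift-count semantics, by example: NEON shifts take the count from
 * the bottom byte of the shift operand as a signed value, so a
 * negative count shifts right: neon_shl_u64(0x80, 0xFF) uses
 * shift = -1 and returns 0x40, and any count of 64 or more in either
 * direction yields 0 for this unsigned variant.
 */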

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_s8, neon_s8, 4)
NEON_VOP(shl_s16, neon_s16, 2)
NEON_VOP(shl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        val = 0;
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator.  */
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if ((shift >= 32) || (shift <= -32)) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values.  */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if ((shift >= 64) || (shift <= -64)) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}
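
/* Worked example for the special case above: rshl_s64(INT64_MAX, -1)
 * first shifts by (-shift - 1) = 0, leaving INT64_MAX; adding the
 * rounding constant 1 would wrap to INT64_MIN, so the helper returns
 * the true rounded result (2^63 - 1 + 1) / 2 = 0x4000000000000000
 * directly.
 */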
 738
 739#define NEON_FN(dest, src1, src2) do { \
 740    int8_t tmp; \
 741    tmp = (int8_t)src2; \
 742    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
 743        tmp < -(ssize_t)sizeof(src1) * 8) { \
 744        dest = 0; \
 745    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
 746        dest = src1 >> (-tmp - 1); \
 747    } else if (tmp < 0) { \
 748        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
 749    } else { \
 750        dest = src1 << tmp; \
 751    }} while (0)
 752NEON_VOP(rshl_u8, neon_u8, 4)
 753NEON_VOP(rshl_u16, neon_u16, 2)
 754#undef NEON_FN
 755
 756/* The addition of the rounding constant may overflow, so we use an
 757 * intermediate 64 bit accumulator.  */
 758uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
 759{
 760    uint32_t dest;
 761    int8_t shift = (int8_t)shiftop;
 762    if (shift >= 32 || shift < -32) {
 763        dest = 0;
 764    } else if (shift == -32) {
 765        dest = val >> 31;
 766    } else if (shift < 0) {
 767        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
 768        dest = big_dest >> -shift;
 769    } else {
 770        dest = val << shift;
 771    }
 772    return dest;
 773}
 774
 775/* Handling addition overflow with 64 bit input values is more
 776 * tricky than with 32 bit values.  */
 777uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 778{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64 || shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Rounding a 1-bit result just preserves that bit.  */
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}
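
/* Worked example: neon_qshl_u64(0x8000000000000000, 1) shifts the top
 * bit out, so (val >> shift) != tmp, the result saturates to all ones
 * and QC is set; shifting the same value right by one (shiftop = 0xFF,
 * i.e. shift = -1) simply returns 0x4000000000000000.
 */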

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = src1; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> 31; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        int8_t tmp; \
        tmp = (int8_t)src2; \
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
            if (src1) { \
                SET_QC(); \
                dest = ~0; \
            } else { \
                dest = 0; \
            } \
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
            dest = 0; \
        } else if (tmp < 0) { \
            dest = src1 >> -tmp; \
        } else { \
            dest = src1 << tmp; \
            if ((dest >> tmp) != src1) { \
                SET_QC(); \
                dest = ~0; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u32(env, valop, shiftop);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u64(env, valop, shiftop);
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator.  */
uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = ~0;
        } else {
            dest = 0;
        }
    } else if (shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = ~0;
        }
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values.  */
uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = ~0;
        }
    } else if (shift < -64) {
        val = 0;
    } else if (shift == -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator.  */
uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        } else {
            dest = 0;
        }
    } else if (shift <= -32) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        }
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values.  */
uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly.  */
            val = 0x4000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}
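
/* Worked example: stripping the per-lane sign bits first guarantees
 * that the single 32-bit addition cannot carry across lane boundaries,
 * and the final XOR reconstructs each lane's top bit. For
 * a = 0x00FF0080 and b = 0x00010080: mask = 0x00800000, and
 * (0x007F0000 + 0x00010000) ^ 0x00800000 = 0x00000000, matching the
 * lane-wise sums 0xFF + 0x01 and 0x80 + 0x80 both wrapping to zero.
 */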

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Polynomial multiplication is like integer multiplication except the
   partial products are XORed, not added.  */
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
{
    uint32_t mask;
    uint32_t result;
    result = 0;
    while (op1) {
        mask = 0;
        if (op1 & 1)
            mask |= 0xff;
        if (op1 & (1 << 8))
            mask |= (0xff << 8);
        if (op1 & (1 << 16))
            mask |= (0xff << 16);
        if (op1 & (1 << 24))
            mask |= (0xff << 24);
        result ^= op2 & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2 = (op2 << 1) & 0xfefefefe;
    }
    return result;
}
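
/* Worked example: carry-less multiplication over GF(2) XORs the
 * shifted partial products, so in any byte lane 0x03 * 0x03 =
 * (x + 1)(x + 1) = x^2 + 1 = 0x05, not 0x09 as in integer arithmetic.
 * Each loop iteration consumes one bit of op1 in all four lanes at
 * once; the 0x7f7f7f7f and 0xfefefefe masks stop the per-lane shifts
 * from leaking bits into neighbouring lanes.
 */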

uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
{
    uint64_t result = 0;
    uint64_t mask;
    uint64_t op2ex = op2;
    op2ex = (op2ex & 0xff) |
        ((op2ex & 0xff00) << 8) |
        ((op2ex & 0xff0000) << 16) |
        ((op2ex & 0xff000000) << 24);
    while (op1) {
        mask = 0;
        if (op1 & 1) {
            mask |= 0xffff;
        }
        if (op1 & (1 << 8)) {
            mask |= (0xffffU << 16);
        }
        if (op1 & (1 << 16)) {
            mask |= (0xffffULL << 32);
        }
        if (op1 & (1 << 24)) {
            mask |= (0xffffULL << 48);
        }
        result ^= op2ex & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2ex <<= 1;
    }
    return result;
}

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    return x;
}
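
/* Worked example: this SWAR popcount sums bit counts in place within
 * each byte lane, so x = 0xFF00F00F gives 0x08000404, i.e. 8, 0, 4
 * and 4 set bits per byte. Summing pairs, then nibbles, then bytes
 * means no partial count can overflow into a neighbouring lane.
 */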

/* Reverse bits in each 8 bit word */
uint32_t HELPER(neon_rbit_u8)(uint32_t x)
{
    x =  ((x & 0xf0f0f0f0) >> 4)
       | ((x & 0x0f0f0f0f) << 4);
    x =  ((x & 0x88888888) >> 3)
       | ((x & 0x44444444) >> 1)
       | ((x & 0x22222222) << 1)
       | ((x & 0x11111111) << 3);
    return x;
}

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
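
/* Worked example: qdmulh_s16 returns the high half of the doubled
 * product, so in Q15 terms 0x4000 * 0x4000 (0.5 * 0.5) gives
 * (2 * 0x10000000) >> 16 = 0x2000 (0.25). The one overflow case is
 * 0x8000 * 0x8000: doubling 2^30 would flip the sign bit, so the
 * result saturates to 0x7FFF and QC is set. The rounding form adds
 * 1 << 15 to the doubled product before taking the high half.
 */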

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
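
/* Worked example: the rounded narrow-high helpers add half of the
 * discarded low part before taking the high half. For a 16-bit lane
 * of 0x0180 (1.5 in units of 256), masking keeps 0x0180, adding
 * 0x0080 gives 0x0200, and the returned byte is 0x02; a lane of
 * 0x0140 rounds down to 0x01. The initial mask only drops bits that
 * cannot influence the rounded high half.
 */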

uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
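
/* Worked example: neon_narrow_sat_s16 narrows each 32-bit half to 16
 * bits: a half of 0x00012345 exceeds INT16_MAX and saturates to
 * 0x7FFF with QC set, while 0xFFFF8000 is exactly INT16_MIN and
 * passes through as 0x8000. The (low >> 31) ^ 0x7fff trick picks
 * INT16_MAX or INT16_MIN according to the sign of the wide value.
 */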

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
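
/* Worked example: neon_abdl_s64(0x80000000, 0x7FFFFFFF) compares
 * INT32_MIN against INT32_MAX; their absolute difference 0xFFFFFFFF
 * (2^32 - 1) cannot be represented in 32 signed bits, but computing
 * in int64_t keeps it exact.
 */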
1764
1765/* Widening multiply. Named type is the source type.  */
1766#define DO_MULL(dest, x, y, type1, type2) do { \
1767    type1 tmp_x = x; \
1768    type1 tmp_y = y; \
1769    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
1770    } while(0)
1771
1772uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1773{
1774    uint64_t tmp;
1775    uint64_t result;
1776
1777    DO_MULL(result, a, b, uint8_t, uint16_t);
1778    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1779    result |= tmp << 16;
1780    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1781    result |= tmp << 32;
1782    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1783    result |= tmp << 48;
1784    return result;
1785}
1786
1787uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1788{
1789    uint64_t tmp;
1790    uint64_t result;
1791
1792    DO_MULL(result, a, b, int8_t, uint16_t);
1793    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1794    result |= tmp << 16;
1795    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1796    result |= tmp << 32;
1797    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1798    result |= tmp << 48;
1799    return result;
1800}
1801
1802uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1803{
1804    uint64_t tmp;
1805    uint64_t result;
1806
1807    DO_MULL(result, a, b, uint16_t, uint32_t);
1808    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1809    return result | (tmp << 32);
1810}
1811
1812uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1813{
1814    uint64_t tmp;
1815    uint64_t result;
1816
1817    DO_MULL(result, a, b, int16_t, uint32_t);
1818    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1819    return result | (tmp << 32);
1820}
1821
1822uint64_t HELPER(neon_negl_u16)(uint64_t x)
1823{
1824    uint16_t tmp;
1825    uint64_t result;
1826    result = (uint16_t)-x;
1827    tmp = -(x >> 16);
1828    result |= (uint64_t)tmp << 16;
1829    tmp = -(x >> 32);
1830    result |= (uint64_t)tmp << 32;
1831    tmp = -(x >> 48);
1832    result |= (uint64_t)tmp << 48;
1833    return result;
1834}
1835
1836uint64_t HELPER(neon_negl_u32)(uint64_t x)
1837{
1838    uint32_t low = -x;
1839    uint32_t high = -(x >> 32);
1840    return low | ((uint64_t)high << 32);
1841}
1842
1843/* Saturating sign manipulation.  */
1844/* ??? Make these use NEON_VOP1 */
1845#define DO_QABS8(x) do { \
1846    if (x == (int8_t)0x80) { \
1847        x = 0x7f; \
1848        SET_QC(); \
1849    } else if (x < 0) { \
1850        x = -x; \
1851    }} while (0)
1852uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1853{
1854    neon_s8 vec;
1855    NEON_UNPACK(neon_s8, vec, x);
1856    DO_QABS8(vec.v1);
1857    DO_QABS8(vec.v2);
1858    DO_QABS8(vec.v3);
1859    DO_QABS8(vec.v4);
1860    NEON_PACK(neon_s8, x, vec);
1861    return x;
1862}
1863#undef DO_QABS8
1864
1865#define DO_QNEG8(x) do { \
1866    if (x == (int8_t)0x80) { \
1867        x = 0x7f; \
1868        SET_QC(); \
1869    } else { \
1870        x = -x; \
1871    }} while (0)
1872uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1873{
1874    neon_s8 vec;
1875    NEON_UNPACK(neon_s8, vec, x);
1876    DO_QNEG8(vec.v1);
1877    DO_QNEG8(vec.v2);
1878    DO_QNEG8(vec.v3);
1879    DO_QNEG8(vec.v4);
1880    NEON_PACK(neon_s8, x, vec);
1881    return x;
1882}
1883#undef DO_QNEG8
1884
1885#define DO_QABS16(x) do { \
1886    if (x == (int16_t)0x8000) { \
1887        x = 0x7fff; \
1888        SET_QC(); \
1889    } else if (x < 0) { \
1890        x = -x; \
1891    }} while (0)
1892uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1893{
1894    neon_s16 vec;
1895    NEON_UNPACK(neon_s16, vec, x);
1896    DO_QABS16(vec.v1);
1897    DO_QABS16(vec.v2);
1898    NEON_PACK(neon_s16, x, vec);
1899    return x;
1900}
1901#undef DO_QABS16
1902
1903#define DO_QNEG16(x) do { \
1904    if (x == (int16_t)0x8000) { \
1905        x = 0x7fff; \
1906        SET_QC(); \
1907    } else { \
1908        x = -x; \
1909    }} while (0)
1910uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1911{
1912    neon_s16 vec;
1913    NEON_UNPACK(neon_s16, vec, x);
1914    DO_QNEG16(vec.v1);
1915    DO_QNEG16(vec.v2);
1916    NEON_PACK(neon_s16, x, vec);
1917    return x;
1918}
1919#undef DO_QNEG16
1920
1921uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1922{
1923    if (x == SIGNBIT) {
1924        SET_QC();
1925        x = ~SIGNBIT;
1926    } else if ((int32_t)x < 0) {
1927        x = -x;
1928    }
1929    return x;
1930}
1931
1932uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1933{
1934    if (x == SIGNBIT) {
1935        SET_QC();
1936        x = ~SIGNBIT;
1937    } else {
1938        x = -x;
1939    }
1940    return x;
1941}
1942
1943uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1944{
1945    if (x == SIGNBIT64) {
1946        SET_QC();
1947        x = ~SIGNBIT64;
1948    } else if ((int64_t)x < 0) {
1949        x = -x;
1950    }
1951    return x;
1952}
1953
1954uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1955{
1956    if (x == SIGNBIT64) {
1957        SET_QC();
1958        x = ~SIGNBIT64;
1959    } else {
1960        x = -x;
1961    }
1962    return x;
1963}

/* NEON Float helpers.  */
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = make_float32(a);
    float32 f1 = make_float32(b);
    return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
}
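
/*
 * Illustrative note (not part of the original file): this is the float
 * absolute difference |a - b|, computed entirely in softfloat so that
 * rounding and exception behaviour follow the supplied float_status.
 * e.g. with a = 1.0f (0x3f800000) and b = 2.5f (0x40200000) the result
 * is 1.5f (0x3fc00000).
 */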

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}
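
/*
 * Illustrative sketch (not part of the original file) of the pattern
 * shared by the helpers above: softfloat only provides "less than" and
 * "less or equal", so GE/GT are expressed with the operands swapped
 * (a >= b is equivalent to b <= a), and the 0/1 result is negated into
 * the all-zeroes/all-ones lane mask NEON defines:
 *
 *     int r = float32_le(b, a, fpst);    // 0 or 1
 *     return -r;                         // 0x00000000 or 0xffffffff
 *
 * The "ac" variants compare absolute values, implementing VACGE/VACGT.
 */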

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
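
/*
 * Worked example (illustrative, not part of the original file): ELEM
 * extracts lane N of width SIZE bits from a 64-bit register value:
 *
 *     ELEM(0x0123456789abcdef, 1, 16) == 0x89ab
 *
 * (shift right by 1 * 16 bits, then mask down to 16 bits).
 */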

/* Quad-register (128-bit) VUZP: de-interleave the even- and odd-numbered
 * elements of Qd and Qm; each Q register is a pair of D registers.  */
void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qunzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qunzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
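
/*
 * Worked example (illustrative, not part of the original file): writing
 * a quad register as four 32-bit lanes, VUZP.32 with Qd = [a0, a1, a2, a3]
 * and Qm = [b0, b1, b2, b3] leaves
 *
 *     Qd = [a0, a2, b0, b2]   (even-numbered lanes)
 *     Qm = [a1, a3, b1, b3]   (odd-numbered lanes)
 *
 * which is exactly what the d0/d1/m0/m1 expressions in neon_qunzip32
 * compute, one 64-bit half at a time.
 */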

/* Double-register (64-bit) VUZP.  */
void HELPER(neon_unzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_unzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
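
/*
 * Worked example (illustrative, not part of the original file): for the
 * 64-bit form with 16-bit lanes, VUZP.16 with Dd = [a0, a1, a2, a3] and
 * Dm = [b0, b1, b2, b3] leaves
 *
 *     Dd = [a0, a2, b0, b2]
 *     Dm = [a1, a3, b1, b3]
 *
 * There is no neon_unzip32 because the architecture defines VUZP.32 on
 * double registers to behave as VTRN.32, so no separate helper is
 * needed here.
 */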

/* Quad-register (128-bit) VZIP: interleave the elements of Qd and Qm.  */
void HELPER(neon_qzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

/* Double-register (64-bit) VZIP.  */
void HELPER(neon_zip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_zip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
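
/*
 * Worked example (illustrative, not part of the original file): VZIP is
 * the inverse of VUZP.  For the 64-bit form with 16-bit lanes, starting
 * from Dd = [a0, a1, a2, a3] and Dm = [b0, b1, b2, b3], neon_zip16
 * leaves
 *
 *     Dd = [a0, b0, a1, b1]
 *     Dm = [a2, b2, a3, b3]
 *
 * i.e. the lanes of the two inputs interleaved across the register pair.
 */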

/* Helper functions for the 64-bit polynomial multiply case:
 * perform PolynomialMult(op1, op2) and return either the top or
 * bottom half of the 128-bit result.
 */
uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
{
    int bitnum;
    uint64_t res = 0;

    for (bitnum = 0; bitnum < 64; bitnum++) {
        if (op1 & (1ULL << bitnum)) {
            res ^= op2 << bitnum;
        }
    }
    return res;
}

uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
{
    int bitnum;
    uint64_t res = 0;

    /* bit 0 of op1 can't influence the high 64 bits at all */
    for (bitnum = 1; bitnum < 64; bitnum++) {
        if (op1 & (1ULL << bitnum)) {
            res ^= op2 >> (64 - bitnum);
        }
    }
    return res;
}
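
/*
 * Worked example (illustrative, not part of the original file):
 * polynomial (carry-less) multiplication XORs shifted copies of op2
 * rather than adding them, so e.g. 0b11 * 0b11 = 0b101:
 *
 *     neon_pmull_64_lo(3, 3) == 5
 *     neon_pmull_64_hi(3, 3) == 0
 *
 * In GF(2) terms this is (x + 1) * (x + 1) = x^2 + 1: the two middle x
 * terms cancel under XOR.  The _hi helper collects the bits that the
 * shifts in the _lo helper pushed out of the low 64 bits.
 */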