qemu/target/i386/ops_sse.h
<<
>>
Prefs
   1/*
   2 *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
   3 *
   4 *  Copyright (c) 2005 Fabrice Bellard
   5 *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski@intel.com>
   6 *
   7 * This library is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU Lesser General Public
   9 * License as published by the Free Software Foundation; either
  10 * version 2.1 of the License, or (at your option) any later version.
  11 *
  12 * This library is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 * Lesser General Public License for more details.
  16 *
  17 * You should have received a copy of the GNU Lesser General Public
  18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21#include "crypto/aes.h"
  22
  23#if SHIFT == 0
  24#define Reg MMXReg
  25#define SIZE 8
  26#define XMM_ONLY(...)
  27#define B(n) MMX_B(n)
  28#define W(n) MMX_W(n)
  29#define L(n) MMX_L(n)
  30#define Q(n) MMX_Q(n)
  31#define SUFFIX _mmx
  32#else
  33#define Reg ZMMReg
  34#define SIZE 16
  35#define XMM_ONLY(...) __VA_ARGS__
  36#define B(n) ZMM_B(n)
  37#define W(n) ZMM_W(n)
  38#define L(n) ZMM_L(n)
  39#define Q(n) ZMM_Q(n)
  40#define SUFFIX _xmm
  41#endif
  42
  43/*
  44 * Copy the relevant parts of a Reg value around. In the case where
  45 * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
  46 * a 64 byte ZMMReg, so we must copy only those and keep the top bytes
   47 * untouched in the guest-visible destination register.
  48 * Note that the "lower bytes" are placed last in memory on big-endian
  49 * hosts, which store the vector backwards in memory.  In that case the
  50 * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
  51 * the little-endian case.
  52 */
  53#if HOST_BIG_ENDIAN
  54#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE)
  55#else
  56#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
  57#endif
  58
  59void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
  60{
  61    int shift;
  62
  63    if (s->Q(0) > 15) {
  64        d->Q(0) = 0;
  65#if SHIFT == 1
  66        d->Q(1) = 0;
  67#endif
  68    } else {
  69        shift = s->B(0);
  70        d->W(0) >>= shift;
  71        d->W(1) >>= shift;
  72        d->W(2) >>= shift;
  73        d->W(3) >>= shift;
  74#if SHIFT == 1
  75        d->W(4) >>= shift;
  76        d->W(5) >>= shift;
  77        d->W(6) >>= shift;
  78        d->W(7) >>= shift;
  79#endif
  80    }
  81}
  82
  83void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
  84{
  85    int shift;
  86
  87    if (s->Q(0) > 15) {
  88        shift = 15;
  89    } else {
  90        shift = s->B(0);
  91    }
  92    d->W(0) = (int16_t)d->W(0) >> shift;
  93    d->W(1) = (int16_t)d->W(1) >> shift;
  94    d->W(2) = (int16_t)d->W(2) >> shift;
  95    d->W(3) = (int16_t)d->W(3) >> shift;
  96#if SHIFT == 1
  97    d->W(4) = (int16_t)d->W(4) >> shift;
  98    d->W(5) = (int16_t)d->W(5) >> shift;
  99    d->W(6) = (int16_t)d->W(6) >> shift;
 100    d->W(7) = (int16_t)d->W(7) >> shift;
 101#endif
 102}
 103
 104void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 105{
 106    int shift;
 107
 108    if (s->Q(0) > 15) {
 109        d->Q(0) = 0;
 110#if SHIFT == 1
 111        d->Q(1) = 0;
 112#endif
 113    } else {
 114        shift = s->B(0);
 115        d->W(0) <<= shift;
 116        d->W(1) <<= shift;
 117        d->W(2) <<= shift;
 118        d->W(3) <<= shift;
 119#if SHIFT == 1
 120        d->W(4) <<= shift;
 121        d->W(5) <<= shift;
 122        d->W(6) <<= shift;
 123        d->W(7) <<= shift;
 124#endif
 125    }
 126}
 127
 128void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 129{
 130    int shift;
 131
 132    if (s->Q(0) > 31) {
 133        d->Q(0) = 0;
 134#if SHIFT == 1
 135        d->Q(1) = 0;
 136#endif
 137    } else {
 138        shift = s->B(0);
 139        d->L(0) >>= shift;
 140        d->L(1) >>= shift;
 141#if SHIFT == 1
 142        d->L(2) >>= shift;
 143        d->L(3) >>= shift;
 144#endif
 145    }
 146}
 147
 148void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 149{
 150    int shift;
 151
 152    if (s->Q(0) > 31) {
 153        shift = 31;
 154    } else {
 155        shift = s->B(0);
 156    }
 157    d->L(0) = (int32_t)d->L(0) >> shift;
 158    d->L(1) = (int32_t)d->L(1) >> shift;
 159#if SHIFT == 1
 160    d->L(2) = (int32_t)d->L(2) >> shift;
 161    d->L(3) = (int32_t)d->L(3) >> shift;
 162#endif
 163}
 164
 165void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 166{
 167    int shift;
 168
 169    if (s->Q(0) > 31) {
 170        d->Q(0) = 0;
 171#if SHIFT == 1
 172        d->Q(1) = 0;
 173#endif
 174    } else {
 175        shift = s->B(0);
 176        d->L(0) <<= shift;
 177        d->L(1) <<= shift;
 178#if SHIFT == 1
 179        d->L(2) <<= shift;
 180        d->L(3) <<= shift;
 181#endif
 182    }
 183}
 184
 185void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 186{
 187    int shift;
 188
 189    if (s->Q(0) > 63) {
 190        d->Q(0) = 0;
 191#if SHIFT == 1
 192        d->Q(1) = 0;
 193#endif
 194    } else {
 195        shift = s->B(0);
 196        d->Q(0) >>= shift;
 197#if SHIFT == 1
 198        d->Q(1) >>= shift;
 199#endif
 200    }
 201}
 202
 203void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 204{
 205    int shift;
 206
 207    if (s->Q(0) > 63) {
 208        d->Q(0) = 0;
 209#if SHIFT == 1
 210        d->Q(1) = 0;
 211#endif
 212    } else {
 213        shift = s->B(0);
 214        d->Q(0) <<= shift;
 215#if SHIFT == 1
 216        d->Q(1) <<= shift;
 217#endif
 218    }
 219}
 220
 221#if SHIFT == 1
 222void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 223{
 224    int shift, i;
 225
 226    shift = s->L(0);
 227    if (shift > 16) {
 228        shift = 16;
 229    }
 230    for (i = 0; i < 16 - shift; i++) {
 231        d->B(i) = d->B(i + shift);
 232    }
 233    for (i = 16 - shift; i < 16; i++) {
 234        d->B(i) = 0;
 235    }
 236}
 237
 238void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 239{
 240    int shift, i;
 241
 242    shift = s->L(0);
 243    if (shift > 16) {
 244        shift = 16;
 245    }
 246    for (i = 15; i >= shift; i--) {
 247        d->B(i) = d->B(i - shift);
 248    }
 249    for (i = 0; i < shift; i++) {
 250        d->B(i) = 0;
 251    }
 252}
 253#endif
 254
 255#define SSE_HELPER_B(name, F)                                   \
 256    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 257    {                                                           \
 258        d->B(0) = F(d->B(0), s->B(0));                          \
 259        d->B(1) = F(d->B(1), s->B(1));                          \
 260        d->B(2) = F(d->B(2), s->B(2));                          \
 261        d->B(3) = F(d->B(3), s->B(3));                          \
 262        d->B(4) = F(d->B(4), s->B(4));                          \
 263        d->B(5) = F(d->B(5), s->B(5));                          \
 264        d->B(6) = F(d->B(6), s->B(6));                          \
 265        d->B(7) = F(d->B(7), s->B(7));                          \
 266        XMM_ONLY(                                               \
 267                 d->B(8) = F(d->B(8), s->B(8));                 \
 268                 d->B(9) = F(d->B(9), s->B(9));                 \
 269                 d->B(10) = F(d->B(10), s->B(10));              \
 270                 d->B(11) = F(d->B(11), s->B(11));              \
 271                 d->B(12) = F(d->B(12), s->B(12));              \
 272                 d->B(13) = F(d->B(13), s->B(13));              \
 273                 d->B(14) = F(d->B(14), s->B(14));              \
 274                 d->B(15) = F(d->B(15), s->B(15));              \
 275                                                        )       \
 276            }
 277
 278#define SSE_HELPER_W(name, F)                                   \
 279    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 280    {                                                           \
 281        d->W(0) = F(d->W(0), s->W(0));                          \
 282        d->W(1) = F(d->W(1), s->W(1));                          \
 283        d->W(2) = F(d->W(2), s->W(2));                          \
 284        d->W(3) = F(d->W(3), s->W(3));                          \
 285        XMM_ONLY(                                               \
 286                 d->W(4) = F(d->W(4), s->W(4));                 \
 287                 d->W(5) = F(d->W(5), s->W(5));                 \
 288                 d->W(6) = F(d->W(6), s->W(6));                 \
 289                 d->W(7) = F(d->W(7), s->W(7));                 \
 290                                                        )       \
 291            }
 292
 293#define SSE_HELPER_L(name, F)                                   \
 294    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 295    {                                                           \
 296        d->L(0) = F(d->L(0), s->L(0));                          \
 297        d->L(1) = F(d->L(1), s->L(1));                          \
 298        XMM_ONLY(                                               \
 299                 d->L(2) = F(d->L(2), s->L(2));                 \
 300                 d->L(3) = F(d->L(3), s->L(3));                 \
 301                                                        )       \
 302            }
 303
 304#define SSE_HELPER_Q(name, F)                                   \
 305    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
 306    {                                                           \
 307        d->Q(0) = F(d->Q(0), s->Q(0));                          \
 308        XMM_ONLY(                                               \
 309                 d->Q(1) = F(d->Q(1), s->Q(1));                 \
 310                                                        )       \
 311            }
 312
 313#if SHIFT == 0
 314static inline int satub(int x)
 315{
 316    if (x < 0) {
 317        return 0;
 318    } else if (x > 255) {
 319        return 255;
 320    } else {
 321        return x;
 322    }
 323}
 324
 325static inline int satuw(int x)
 326{
 327    if (x < 0) {
 328        return 0;
 329    } else if (x > 65535) {
 330        return 65535;
 331    } else {
 332        return x;
 333    }
 334}
 335
 336static inline int satsb(int x)
 337{
 338    if (x < -128) {
 339        return -128;
 340    } else if (x > 127) {
 341        return 127;
 342    } else {
 343        return x;
 344    }
 345}
 346
 347static inline int satsw(int x)
 348{
 349    if (x < -32768) {
 350        return -32768;
 351    } else if (x > 32767) {
 352        return 32767;
 353    } else {
 354        return x;
 355    }
 356}
 357
 358#define FADD(a, b) ((a) + (b))
 359#define FADDUB(a, b) satub((a) + (b))
 360#define FADDUW(a, b) satuw((a) + (b))
 361#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
 362#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))
 363
 364#define FSUB(a, b) ((a) - (b))
 365#define FSUBUB(a, b) satub((a) - (b))
 366#define FSUBUW(a, b) satuw((a) - (b))
 367#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
 368#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
 369#define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
 370#define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
 371#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
 372#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
 373
 374#define FAND(a, b) ((a) & (b))
 375#define FANDN(a, b) ((~(a)) & (b))
 376#define FOR(a, b) ((a) | (b))
 377#define FXOR(a, b) ((a) ^ (b))
 378
 379#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
 380#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
 381#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
 382#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)
 383
 384#define FMULLW(a, b) ((a) * (b))
 385#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
 386#define FMULHUW(a, b) ((a) * (b) >> 16)
 387#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)
 388
 389#define FAVG(a, b) (((a) + (b) + 1) >> 1)
 390#endif
 391
 392SSE_HELPER_B(helper_paddb, FADD)
 393SSE_HELPER_W(helper_paddw, FADD)
 394SSE_HELPER_L(helper_paddl, FADD)
 395SSE_HELPER_Q(helper_paddq, FADD)
 396
 397SSE_HELPER_B(helper_psubb, FSUB)
 398SSE_HELPER_W(helper_psubw, FSUB)
 399SSE_HELPER_L(helper_psubl, FSUB)
 400SSE_HELPER_Q(helper_psubq, FSUB)
 401
 402SSE_HELPER_B(helper_paddusb, FADDUB)
 403SSE_HELPER_B(helper_paddsb, FADDSB)
 404SSE_HELPER_B(helper_psubusb, FSUBUB)
 405SSE_HELPER_B(helper_psubsb, FSUBSB)
 406
 407SSE_HELPER_W(helper_paddusw, FADDUW)
 408SSE_HELPER_W(helper_paddsw, FADDSW)
 409SSE_HELPER_W(helper_psubusw, FSUBUW)
 410SSE_HELPER_W(helper_psubsw, FSUBSW)
 411
 412SSE_HELPER_B(helper_pminub, FMINUB)
 413SSE_HELPER_B(helper_pmaxub, FMAXUB)
 414
 415SSE_HELPER_W(helper_pminsw, FMINSW)
 416SSE_HELPER_W(helper_pmaxsw, FMAXSW)
 417
 418SSE_HELPER_Q(helper_pand, FAND)
 419SSE_HELPER_Q(helper_pandn, FANDN)
 420SSE_HELPER_Q(helper_por, FOR)
 421SSE_HELPER_Q(helper_pxor, FXOR)
 422
 423SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
 424SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
 425SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
 426
 427SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
 428SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
 429SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
 430
 431SSE_HELPER_W(helper_pmullw, FMULLW)
 432#if SHIFT == 0
 433SSE_HELPER_W(helper_pmulhrw, FMULHRW)
 434#endif
 435SSE_HELPER_W(helper_pmulhuw, FMULHUW)
 436SSE_HELPER_W(helper_pmulhw, FMULHW)
 437
 438SSE_HELPER_B(helper_pavgb, FAVG)
 439SSE_HELPER_W(helper_pavgw, FAVG)
 440
 441void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 442{
 443    d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
 444#if SHIFT == 1
 445    d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
 446#endif
 447}
 448
 449void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 450{
 451    int i;
 452
 453    for (i = 0; i < (2 << SHIFT); i++) {
 454        d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) +
 455            (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1);
 456    }
 457}
 458
 459#if SHIFT == 0
 460static inline int abs1(int a)
 461{
 462    if (a < 0) {
 463        return -a;
 464    } else {
 465        return a;
 466    }
 467}
 468#endif
 469void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 470{
 471    unsigned int val;
 472
 473    val = 0;
 474    val += abs1(d->B(0) - s->B(0));
 475    val += abs1(d->B(1) - s->B(1));
 476    val += abs1(d->B(2) - s->B(2));
 477    val += abs1(d->B(3) - s->B(3));
 478    val += abs1(d->B(4) - s->B(4));
 479    val += abs1(d->B(5) - s->B(5));
 480    val += abs1(d->B(6) - s->B(6));
 481    val += abs1(d->B(7) - s->B(7));
 482    d->Q(0) = val;
 483#if SHIFT == 1
 484    val = 0;
 485    val += abs1(d->B(8) - s->B(8));
 486    val += abs1(d->B(9) - s->B(9));
 487    val += abs1(d->B(10) - s->B(10));
 488    val += abs1(d->B(11) - s->B(11));
 489    val += abs1(d->B(12) - s->B(12));
 490    val += abs1(d->B(13) - s->B(13));
 491    val += abs1(d->B(14) - s->B(14));
 492    val += abs1(d->B(15) - s->B(15));
 493    d->Q(1) = val;
 494#endif
 495}
 496
 497void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
 498                                  target_ulong a0)
 499{
 500    int i;
 501
 502    for (i = 0; i < (8 << SHIFT); i++) {
 503        if (s->B(i) & 0x80) {
 504            cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
 505        }
 506    }
 507}
 508
 509void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
 510{
 511    d->L(0) = val;
 512    d->L(1) = 0;
 513#if SHIFT == 1
 514    d->Q(1) = 0;
 515#endif
 516}
 517
 518#ifdef TARGET_X86_64
 519void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
 520{
 521    d->Q(0) = val;
 522#if SHIFT == 1
 523    d->Q(1) = 0;
 524#endif
 525}
 526#endif
 527
 528#if SHIFT == 0
 529void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
 530{
 531    Reg r;
 532
 533    r.W(0) = s->W(order & 3);
 534    r.W(1) = s->W((order >> 2) & 3);
 535    r.W(2) = s->W((order >> 4) & 3);
 536    r.W(3) = s->W((order >> 6) & 3);
 537    MOVE(*d, r);
 538}
 539#else
 540void helper_shufps(Reg *d, Reg *s, int order)
 541{
 542    Reg r;
 543
 544    r.L(0) = d->L(order & 3);
 545    r.L(1) = d->L((order >> 2) & 3);
 546    r.L(2) = s->L((order >> 4) & 3);
 547    r.L(3) = s->L((order >> 6) & 3);
 548    MOVE(*d, r);
 549}
 550
 551void helper_shufpd(Reg *d, Reg *s, int order)
 552{
 553    Reg r;
 554
 555    r.Q(0) = d->Q(order & 1);
 556    r.Q(1) = s->Q((order >> 1) & 1);
 557    MOVE(*d, r);
 558}
 559
 560void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
 561{
 562    Reg r;
 563
 564    r.L(0) = s->L(order & 3);
 565    r.L(1) = s->L((order >> 2) & 3);
 566    r.L(2) = s->L((order >> 4) & 3);
 567    r.L(3) = s->L((order >> 6) & 3);
 568    MOVE(*d, r);
 569}
 570
 571void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
 572{
 573    Reg r;
 574
 575    r.W(0) = s->W(order & 3);
 576    r.W(1) = s->W((order >> 2) & 3);
 577    r.W(2) = s->W((order >> 4) & 3);
 578    r.W(3) = s->W((order >> 6) & 3);
 579    r.Q(1) = s->Q(1);
 580    MOVE(*d, r);
 581}
 582
 583void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 584{
 585    Reg r;
 586
 587    r.Q(0) = s->Q(0);
 588    r.W(4) = s->W(4 + (order & 3));
 589    r.W(5) = s->W(4 + ((order >> 2) & 3));
 590    r.W(6) = s->W(4 + ((order >> 4) & 3));
 591    r.W(7) = s->W(4 + ((order >> 6) & 3));
 592    MOVE(*d, r);
 593}
 594#endif
 595
 596#if SHIFT == 1
 597/* FPU ops */
 598/* XXX: not accurate */
 599
 600#define SSE_HELPER_S(name, F)                                           \
 601    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
 602    {                                                                   \
 603        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
 604        d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));                  \
 605        d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));                  \
 606        d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));                  \
 607    }                                                                   \
 608                                                                        \
 609    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
 610    {                                                                   \
 611        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
 612    }                                                                   \
 613                                                                        \
 614    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
 615    {                                                                   \
 616        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
 617        d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));                  \
 618    }                                                                   \
 619                                                                        \
 620    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
 621    {                                                                   \
 622        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
 623    }
 624
 625#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
 626#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
 627#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
 628#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
 629#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
 630
 631/* Note that the choice of comparison op here is important to get the
 632 * special cases right: for min and max Intel specifies that (-0,0),
 633 * (NaN, anything) and (anything, NaN) return the second argument.
 634 */
 635#define FPU_MIN(size, a, b)                                     \
 636    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
 637#define FPU_MAX(size, a, b)                                     \
 638    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
 639
 640SSE_HELPER_S(add, FPU_ADD)
 641SSE_HELPER_S(sub, FPU_SUB)
 642SSE_HELPER_S(mul, FPU_MUL)
 643SSE_HELPER_S(div, FPU_DIV)
 644SSE_HELPER_S(min, FPU_MIN)
 645SSE_HELPER_S(max, FPU_MAX)
 646SSE_HELPER_S(sqrt, FPU_SQRT)
 647
 648
 649/* float to float conversions */
 650void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s)
 651{
 652    float32 s0, s1;
 653
 654    s0 = s->ZMM_S(0);
 655    s1 = s->ZMM_S(1);
 656    d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status);
 657    d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status);
 658}
 659
 660void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s)
 661{
 662    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
 663    d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status);
 664    d->Q(1) = 0;
 665}
 666
 667void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
 668{
 669    d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
 670}
 671
 672void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
 673{
 674    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
 675}
 676
 677/* integer to float */
 678void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s)
 679{
 680    d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), &env->sse_status);
 681    d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status);
 682    d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status);
 683    d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status);
 684}
 685
 686void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s)
 687{
 688    int32_t l0, l1;
 689
 690    l0 = (int32_t)s->ZMM_L(0);
 691    l1 = (int32_t)s->ZMM_L(1);
 692    d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status);
 693    d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status);
 694}
 695
 696void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
 697{
 698    d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
 699    d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
 700}
 701
 702void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
 703{
 704    d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
 705    d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
 706}
 707
 708void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
 709{
 710    d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
 711}
 712
 713void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
 714{
 715    d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
 716}
 717
 718#ifdef TARGET_X86_64
 719void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
 720{
 721    d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
 722}
 723
 724void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
 725{
 726    d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
 727}
 728#endif
 729
 730/* float to integer */
 731
 732/*
 733 * x86 mandates that we return the indefinite integer value for the result
 734 * of any float-to-integer conversion that raises the 'invalid' exception.
 735 * Wrap the softfloat functions to get this behaviour.
 736 */
 737#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
 738    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
 739    {                                                                   \
 740        int oldflags, newflags;                                         \
 741        RETTYPE r;                                                      \
 742                                                                        \
 743        oldflags = get_float_exception_flags(s);                        \
 744        set_float_exception_flags(0, s);                                \
 745        r = FN(a, s);                                                   \
 746        newflags = get_float_exception_flags(s);                        \
 747        if (newflags & float_flag_invalid) {                            \
 748            r = INDEFVALUE;                                             \
 749        }                                                               \
 750        set_float_exception_flags(newflags | oldflags, s);              \
 751        return r;                                                       \
 752    }
 753
 754WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
 755WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
 756WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
 757WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
 758WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
 759WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
 760WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
 761WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
 762
 763void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 764{
 765    d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
 766    d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
 767    d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status);
 768    d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status);
 769}
 770
 771void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 772{
 773    d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
 774    d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
 775    d->ZMM_Q(1) = 0;
 776}
 777
 778void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 779{
 780    d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
 781    d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
 782}
 783
 784void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 785{
 786    d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
 787    d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
 788}
 789
 790int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
 791{
 792    return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
 793}
 794
 795int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
 796{
 797    return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
 798}
 799
 800#ifdef TARGET_X86_64
 801int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
 802{
 803    return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
 804}
 805
 806int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
 807{
 808    return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
 809}
 810#endif
 811
 812/* float to integer truncated */
 813void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 814{
 815    d->ZMM_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
 816    d->ZMM_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
 817    d->ZMM_L(2) = x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status);
 818    d->ZMM_L(3) = x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status);
 819}
 820
 821void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 822{
 823    d->ZMM_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
 824    d->ZMM_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
 825    d->ZMM_Q(1) = 0;
 826}
 827
 828void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 829{
 830    d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
 831    d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
 832}
 833
 834void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 835{
 836    d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
 837    d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
 838}
 839
 840int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
 841{
 842    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
 843}
 844
 845int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
 846{
 847    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
 848}
 849
 850#ifdef TARGET_X86_64
 851int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
 852{
 853    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
 854}
 855
 856int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
 857{
 858    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
 859}
 860#endif
 861
 862void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 863{
 864    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
 865    d->ZMM_S(0) = float32_div(float32_one,
 866                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
 867                              &env->sse_status);
 868    d->ZMM_S(1) = float32_div(float32_one,
 869                              float32_sqrt(s->ZMM_S(1), &env->sse_status),
 870                              &env->sse_status);
 871    d->ZMM_S(2) = float32_div(float32_one,
 872                              float32_sqrt(s->ZMM_S(2), &env->sse_status),
 873                              &env->sse_status);
 874    d->ZMM_S(3) = float32_div(float32_one,
 875                              float32_sqrt(s->ZMM_S(3), &env->sse_status),
 876                              &env->sse_status);
 877    set_float_exception_flags(old_flags, &env->sse_status);
 878}
 879
 880void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 881{
 882    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
 883    d->ZMM_S(0) = float32_div(float32_one,
 884                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
 885                              &env->sse_status);
 886    set_float_exception_flags(old_flags, &env->sse_status);
 887}
 888
 889void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 890{
 891    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
 892    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
 893    d->ZMM_S(1) = float32_div(float32_one, s->ZMM_S(1), &env->sse_status);
 894    d->ZMM_S(2) = float32_div(float32_one, s->ZMM_S(2), &env->sse_status);
 895    d->ZMM_S(3) = float32_div(float32_one, s->ZMM_S(3), &env->sse_status);
 896    set_float_exception_flags(old_flags, &env->sse_status);
 897}
 898
 899void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 900{
 901    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
 902    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
 903    set_float_exception_flags(old_flags, &env->sse_status);
 904}
 905
 906static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
 907{
 908    uint64_t mask;
 909
 910    if (len == 0) {
 911        mask = ~0LL;
 912    } else {
 913        mask = (1ULL << len) - 1;
 914    }
 915    return (src >> shift) & mask;
 916}
 917
 918void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 919{
 920    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0));
 921}
 922
 923void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
 924{
 925    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
 926}
 927
 928static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
 929{
 930    uint64_t mask;
 931
 932    if (len == 0) {
 933        mask = ~0ULL;
 934    } else {
 935        mask = (1ULL << len) - 1;
 936    }
 937    return (src & ~(mask << shift)) | ((src & mask) << shift);
 938}
 939
 940void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 941{
 942    d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
 943}
 944
 945void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
 946{
 947    d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
 948}
 949
 950void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 951{
 952    ZMMReg r;
 953
 954    r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
 955    r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
 956    r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
 957    r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
 958    MOVE(*d, r);
 959}
 960
 961void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 962{
 963    ZMMReg r;
 964
 965    r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
 966    r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
 967    MOVE(*d, r);
 968}
 969
 970void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 971{
 972    ZMMReg r;
 973
 974    r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
 975    r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
 976    r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
 977    r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
 978    MOVE(*d, r);
 979}
 980
 981void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 982{
 983    ZMMReg r;
 984
 985    r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
 986    r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
 987    MOVE(*d, r);
 988}
 989
 990void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 991{
 992    d->ZMM_S(0) = float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
 993    d->ZMM_S(1) = float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
 994    d->ZMM_S(2) = float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
 995    d->ZMM_S(3) = float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
 996}
 997
 998void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 999{
1000    d->ZMM_D(0) = float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
1001    d->ZMM_D(1) = float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
1002}
1003
/* XXX: unordered */
/*
 * Define the four compare helpers helper_<name>{ps,ss,pd,sd}.
 *
 * F(size, a, b) is a predicate macro taking the float width (32 or 64) and
 * the two operands; its integer result is stored over the destination
 * lane (32-bit lanes for ps/ss, 64-bit lanes for pd/sd).  The ps and pd
 * forms process every lane of the 128-bit register, the ss and sd forms
 * only lane 0.  F is expanded inside the function body, so it may refer
 * to env.
 */
#define SSE_HELPER_CMP(name, F)                                         \
    void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
        d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1));                  \
        d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2));                  \
        d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
        d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    }
1029
/*
 * Comparison predicates for SSE_HELPER_CMP.  Each evaluates to -1 (all
 * bits set once stored in a lane) when the relation holds and 0 otherwise.
 * The N* and ORD variants are the exact negations of their counterparts:
 * they call the same softfloat function and swap the two results.
 * EQ/NEQ and UNORD/ORD use the *_quiet softfloat comparisons; LT/LE and
 * their negations use the non-quiet float*_lt / float*_le.
 * All of them reference env from the enclosing helper.
 */
#define FPU_CMPEQ(size, a, b)                                           \
    (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0)
#define FPU_CMPLT(size, a, b)                                           \
    (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0)
#define FPU_CMPLE(size, a, b)                                           \
    (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0)
#define FPU_CMPUNORD(size, a, b)                                        \
    (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0)
#define FPU_CMPNEQ(size, a, b)                                          \
    (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1)
#define FPU_CMPNLT(size, a, b)                                          \
    (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1)
#define FPU_CMPNLE(size, a, b)                                          \
    (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1)
#define FPU_CMPORD(size, a, b)                                          \
    (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1)

/* Instantiate helper_cmp<pred>{ps,ss,pd,sd} for each predicate. */
SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
SSE_HELPER_CMP(cmplt, FPU_CMPLT)
SSE_HELPER_CMP(cmple, FPU_CMPLE)
SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
SSE_HELPER_CMP(cmpord, FPU_CMPORD)
1055
/*
 * EFLAGS value for each outcome of the (U)COMISS/(U)COMISD helpers, indexed
 * by the softfloat comparison result plus one.
 * NOTE(review): presumably the entries are, in order, less / equal /
 * greater / unordered -- confirm against the FloatRelation definition.
 */
static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
1057
1058void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
1059{
1060    FloatRelation ret;
1061    float32 s0, s1;
1062
1063    s0 = d->ZMM_S(0);
1064    s1 = s->ZMM_S(0);
1065    ret = float32_compare_quiet(s0, s1, &env->sse_status);
1066    CC_SRC = comis_eflags[ret + 1];
1067}
1068
1069void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
1070{
1071    FloatRelation ret;
1072    float32 s0, s1;
1073
1074    s0 = d->ZMM_S(0);
1075    s1 = s->ZMM_S(0);
1076    ret = float32_compare(s0, s1, &env->sse_status);
1077    CC_SRC = comis_eflags[ret + 1];
1078}
1079
1080void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
1081{
1082    FloatRelation ret;
1083    float64 d0, d1;
1084
1085    d0 = d->ZMM_D(0);
1086    d1 = s->ZMM_D(0);
1087    ret = float64_compare_quiet(d0, d1, &env->sse_status);
1088    CC_SRC = comis_eflags[ret + 1];
1089}
1090
1091void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
1092{
1093    FloatRelation ret;
1094    float64 d0, d1;
1095
1096    d0 = d->ZMM_D(0);
1097    d1 = s->ZMM_D(0);
1098    ret = float64_compare(d0, d1, &env->sse_status);
1099    CC_SRC = comis_eflags[ret + 1];
1100}
1101
1102uint32_t helper_movmskps(CPUX86State *env, Reg *s)
1103{
1104    int b0, b1, b2, b3;
1105
1106    b0 = s->ZMM_L(0) >> 31;
1107    b1 = s->ZMM_L(1) >> 31;
1108    b2 = s->ZMM_L(2) >> 31;
1109    b3 = s->ZMM_L(3) >> 31;
1110    return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
1111}
1112
1113uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
1114{
1115    int b0, b1;
1116
1117    b0 = s->ZMM_L(1) >> 31;
1118    b1 = s->ZMM_L(3) >> 31;
1119    return b0 | (b1 << 1);
1120}
1121
1122#endif
1123
1124uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
1125{
1126    uint32_t val;
1127
1128    val = 0;
1129    val |= (s->B(0) >> 7);
1130    val |= (s->B(1) >> 6) & 0x02;
1131    val |= (s->B(2) >> 5) & 0x04;
1132    val |= (s->B(3) >> 4) & 0x08;
1133    val |= (s->B(4) >> 3) & 0x10;
1134    val |= (s->B(5) >> 2) & 0x20;
1135    val |= (s->B(6) >> 1) & 0x40;
1136    val |= (s->B(7)) & 0x80;
1137#if SHIFT == 1
1138    val |= (s->B(8) << 1) & 0x0100;
1139    val |= (s->B(9) << 2) & 0x0200;
1140    val |= (s->B(10) << 3) & 0x0400;
1141    val |= (s->B(11) << 4) & 0x0800;
1142    val |= (s->B(12) << 5) & 0x1000;
1143    val |= (s->B(13) << 6) & 0x2000;
1144    val |= (s->B(14) << 7) & 0x4000;
1145    val |= (s->B(15) << 8) & 0x8000;
1146#endif
1147    return val;
1148}
1149
1150void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1151{
1152    Reg r;
1153
1154    r.B(0) = satsb((int16_t)d->W(0));
1155    r.B(1) = satsb((int16_t)d->W(1));
1156    r.B(2) = satsb((int16_t)d->W(2));
1157    r.B(3) = satsb((int16_t)d->W(3));
1158#if SHIFT == 1
1159    r.B(4) = satsb((int16_t)d->W(4));
1160    r.B(5) = satsb((int16_t)d->W(5));
1161    r.B(6) = satsb((int16_t)d->W(6));
1162    r.B(7) = satsb((int16_t)d->W(7));
1163#endif
1164    r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
1165    r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
1166    r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
1167    r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
1168#if SHIFT == 1
1169    r.B(12) = satsb((int16_t)s->W(4));
1170    r.B(13) = satsb((int16_t)s->W(5));
1171    r.B(14) = satsb((int16_t)s->W(6));
1172    r.B(15) = satsb((int16_t)s->W(7));
1173#endif
1174    MOVE(*d, r);
1175}
1176
1177void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1178{
1179    Reg r;
1180
1181    r.B(0) = satub((int16_t)d->W(0));
1182    r.B(1) = satub((int16_t)d->W(1));
1183    r.B(2) = satub((int16_t)d->W(2));
1184    r.B(3) = satub((int16_t)d->W(3));
1185#if SHIFT == 1
1186    r.B(4) = satub((int16_t)d->W(4));
1187    r.B(5) = satub((int16_t)d->W(5));
1188    r.B(6) = satub((int16_t)d->W(6));
1189    r.B(7) = satub((int16_t)d->W(7));
1190#endif
1191    r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
1192    r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
1193    r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
1194    r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
1195#if SHIFT == 1
1196    r.B(12) = satub((int16_t)s->W(4));
1197    r.B(13) = satub((int16_t)s->W(5));
1198    r.B(14) = satub((int16_t)s->W(6));
1199    r.B(15) = satub((int16_t)s->W(7));
1200#endif
1201    MOVE(*d, r);
1202}
1203
1204void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1205{
1206    Reg r;
1207
1208    r.W(0) = satsw(d->L(0));
1209    r.W(1) = satsw(d->L(1));
1210#if SHIFT == 1
1211    r.W(2) = satsw(d->L(2));
1212    r.W(3) = satsw(d->L(3));
1213#endif
1214    r.W((2 << SHIFT) + 0) = satsw(s->L(0));
1215    r.W((2 << SHIFT) + 1) = satsw(s->L(1));
1216#if SHIFT == 1
1217    r.W(6) = satsw(s->L(2));
1218    r.W(7) = satsw(s->L(3));
1219#endif
1220    MOVE(*d, r);
1221}
1222
/*
 * Define the interleaving helpers helper_punpck<base_name>{bw,wd,dq} and,
 * for the XMM build only, helper_punpck<base_name>qdq.
 *
 * base selects which half of the inputs is consumed: 0 starts at element
 * 0, 1 starts at the middle of the register (the element offset is base
 * scaled by the number of elements per half).  Result elements alternate
 * d, s, d, s, ...  Each result is assembled in a temporary and copied to d
 * with MOVE, so d and s are fully read before d is written.
 */
#define UNPCK_OP(base_name, base)                                       \
                                                                        \
    void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *s) \
    {                                                                   \
        Reg r;                                                          \
                                                                        \
        r.B(0) = d->B((base << (SHIFT + 2)) + 0);                       \
        r.B(1) = s->B((base << (SHIFT + 2)) + 0);                       \
        r.B(2) = d->B((base << (SHIFT + 2)) + 1);                       \
        r.B(3) = s->B((base << (SHIFT + 2)) + 1);                       \
        r.B(4) = d->B((base << (SHIFT + 2)) + 2);                       \
        r.B(5) = s->B((base << (SHIFT + 2)) + 2);                       \
        r.B(6) = d->B((base << (SHIFT + 2)) + 3);                       \
        r.B(7) = s->B((base << (SHIFT + 2)) + 3);                       \
        XMM_ONLY(                                                       \
                 r.B(8) = d->B((base << (SHIFT + 2)) + 4);              \
                 r.B(9) = s->B((base << (SHIFT + 2)) + 4);              \
                 r.B(10) = d->B((base << (SHIFT + 2)) + 5);             \
                 r.B(11) = s->B((base << (SHIFT + 2)) + 5);             \
                 r.B(12) = d->B((base << (SHIFT + 2)) + 6);             \
                 r.B(13) = s->B((base << (SHIFT + 2)) + 6);             \
                 r.B(14) = d->B((base << (SHIFT + 2)) + 7);             \
                 r.B(15) = s->B((base << (SHIFT + 2)) + 7);             \
                                                                      ) \
        MOVE(*d, r);                                                    \
    }                                                                   \
                                                                        \
    void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *s) \
    {                                                                   \
        Reg r;                                                          \
                                                                        \
        r.W(0) = d->W((base << (SHIFT + 1)) + 0);                       \
        r.W(1) = s->W((base << (SHIFT + 1)) + 0);                       \
        r.W(2) = d->W((base << (SHIFT + 1)) + 1);                       \
        r.W(3) = s->W((base << (SHIFT + 1)) + 1);                       \
        XMM_ONLY(                                                       \
                 r.W(4) = d->W((base << (SHIFT + 1)) + 2);              \
                 r.W(5) = s->W((base << (SHIFT + 1)) + 2);              \
                 r.W(6) = d->W((base << (SHIFT + 1)) + 3);              \
                 r.W(7) = s->W((base << (SHIFT + 1)) + 3);              \
                                                                      ) \
            MOVE(*d, r);                                                \
    }                                                                   \
                                                                        \
    void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *s) \
    {                                                                   \
        Reg r;                                                          \
                                                                        \
        r.L(0) = d->L((base << SHIFT) + 0);                             \
        r.L(1) = s->L((base << SHIFT) + 0);                             \
        XMM_ONLY(                                                       \
                 r.L(2) = d->L((base << SHIFT) + 1);                    \
                 r.L(3) = s->L((base << SHIFT) + 1);                    \
                                                                      ) \
            MOVE(*d, r);                                                \
    }                                                                   \
                                                                        \
    XMM_ONLY(                                                           \
             void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \
                                                                  *env, \
                                                                  Reg *d, \
                                                                  Reg *s) \
             {                                                          \
                 Reg r;                                                 \
                                                                        \
                 r.Q(0) = d->Q(base);                                   \
                 r.Q(1) = s->Q(base);                                   \
                 MOVE(*d, r);                                           \
             }                                                          \
                                                                        )

/* "l" variants interleave the low halves, "h" variants the high halves. */
UNPCK_OP(l, 0)
UNPCK_OP(h, 1)
1299
1300/* 3DNow! float ops */
1301#if SHIFT == 0
1302void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
1303{
1304    d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1305    d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1306}
1307
1308void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
1309{
1310    d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1311    d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1312}
1313
1314void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
1315{
1316    d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1317    d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1318}
1319
1320void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
1321{
1322    d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1323                                                       &env->mmx_status));
1324    d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1325                                                       &env->mmx_status));
1326}
1327
1328void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1329{
1330    MMXReg r;
1331
1332    r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1333    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1334    MOVE(*d, r);
1335}
1336
1337void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
1338{
1339    d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1340    d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1341}
1342
1343void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
1344{
1345    d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1346                                   &env->mmx_status) ? -1 : 0;
1347    d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1348                                   &env->mmx_status) ? -1 : 0;
1349}
1350
1351void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
1352{
1353    d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1354                             &env->mmx_status) ? -1 : 0;
1355    d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1356                             &env->mmx_status) ? -1 : 0;
1357}
1358
1359void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
1360{
1361    d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1362                             &env->mmx_status) ? -1 : 0;
1363    d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1364                             &env->mmx_status) ? -1 : 0;
1365}
1366
1367void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
1368{
1369    if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
1370        d->MMX_S(0) = s->MMX_S(0);
1371    }
1372    if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
1373        d->MMX_S(1) = s->MMX_S(1);
1374    }
1375}
1376
1377void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
1378{
1379    if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
1380        d->MMX_S(0) = s->MMX_S(0);
1381    }
1382    if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
1383        d->MMX_S(1) = s->MMX_S(1);
1384    }
1385}
1386
1387void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
1388{
1389    d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1390    d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1391}
1392
1393void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1394{
1395    MMXReg r;
1396
1397    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1398    r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1399    MOVE(*d, r);
1400}
1401
1402void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
1403{
1404    MMXReg r;
1405
1406    r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1407    r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1408    MOVE(*d, r);
1409}
1410
1411void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
1412{
1413    d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
1414    d->MMX_S(1) = d->MMX_S(0);
1415}
1416
/*
 * Reciprocal square root of the magnitude of the low single of s, with the
 * source's sign bit re-applied; the result is written to both lanes of d.
 * The high lane of d is used as scratch while the value is built up.
 */
void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    /* Start from the source bits with the sign bit cleared. */
    d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
    d->MMX_S(1) = float32_div(float32_one,
                              float32_sqrt(d->MMX_S(1), &env->mmx_status),
                              &env->mmx_status);
    /* OR the source's sign bit into the result. */
    d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
    /* Duplicate the finished value into the low lane. */
    d->MMX_L(0) = d->MMX_L(1);
}
1426
1427void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
1428{
1429    d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1430    d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1431}
1432
1433void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
1434{
1435    d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1436    d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1437}
1438
1439void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
1440{
1441    MMXReg r;
1442
1443    r.MMX_L(0) = s->MMX_L(1);
1444    r.MMX_L(1) = s->MMX_L(0);
1445    MOVE(*d, r);
1446}
1447#endif
1448
1449/* SSSE3 op helpers */
1450void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1451{
1452    int i;
1453    Reg r;
1454
1455    for (i = 0; i < (8 << SHIFT); i++) {
1456        r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
1457    }
1458
1459    MOVE(*d, r);
1460}
1461
1462void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1463{
1464
1465    Reg r;
1466
1467    r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
1468    r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
1469    XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
1470    XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
1471    r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
1472    r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
1473    XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
1474    XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
1475
1476    MOVE(*d, r);
1477}
1478
1479void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1480{
1481    Reg r;
1482
1483    r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
1484    XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
1485    r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
1486    XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
1487
1488    MOVE(*d, r);
1489}
1490
1491void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1492{
1493    Reg r;
1494
1495    r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
1496    r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
1497    XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
1498    XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
1499    r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
1500    r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
1501    XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
1502    XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
1503
1504    MOVE(*d, r);
1505}
1506
1507void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1508{
1509    d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
1510                    (int8_t)s->B(1) * (uint8_t)d->B(1));
1511    d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
1512                    (int8_t)s->B(3) * (uint8_t)d->B(3));
1513    d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
1514                    (int8_t)s->B(5) * (uint8_t)d->B(5));
1515    d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
1516                    (int8_t)s->B(7) * (uint8_t)d->B(7));
1517#if SHIFT == 1
1518    d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
1519                    (int8_t)s->B(9) * (uint8_t)d->B(9));
1520    d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
1521                    (int8_t)s->B(11) * (uint8_t)d->B(11));
1522    d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
1523                    (int8_t)s->B(13) * (uint8_t)d->B(13));
1524    d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
1525                    (int8_t)s->B(15) * (uint8_t)d->B(15));
1526#endif
1527}
1528
1529void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1530{
1531    d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
1532    d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
1533    XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
1534    XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
1535    d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
1536    d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
1537    XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
1538    XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
1539}
1540
1541void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1542{
1543    d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
1544    XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
1545    d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
1546    XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
1547}
1548
1549void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1550{
1551    d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
1552    d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
1553    XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
1554    XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
1555    d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
1556    d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
1557    XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
1558    XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
1559}
1560
/*
 * Absolute value per element.  The first macro argument is ignored.  A
 * value above INTn_MAX is reinterpreted as a signed n-bit quantity and
 * negated; any other value is returned unchanged.
 * NOTE(review): this relies on x being an unsigned element so that
 * "x > INTn_MAX" means "sign bit set" -- confirm against SSE_HELPER_{B,W,L}.
 */
#define FABSB(_, x) (x > INT8_MAX  ? -(int8_t)x : x)
#define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x)
#define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x)
SSE_HELPER_B(helper_pabsb, FABSB)
SSE_HELPER_W(helper_pabsw, FABSW)
SSE_HELPER_L(helper_pabsd, FABSL)
1567
/*
 * Signed 16x16 multiply with rounding: add 0x4000 to the product and
 * shift right by 15.
 */
#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1570
/*
 * Apply the sign of s to d, per element: the result is d when s is
 * non-zero and not above INTn_MAX, 0 when s is zero, and the negation of d
 * (as a signed n-bit value) when s is above INTn_MAX.
 * NOTE(review): as with FABS*, "s > INTn_MAX" is read as "s is negative",
 * which assumes unsigned elements -- confirm against SSE_HELPER_{B,W,L}.
 */
#define FSIGNB(d, s) (s <= INT8_MAX  ? s ? d : 0 : -(int8_t)d)
#define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
#define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
SSE_HELPER_B(helper_psignb, FSIGNB)
SSE_HELPER_W(helper_psignw, FSIGNW)
SSE_HELPER_L(helper_psignd, FSIGNL)
1577
/*
 * Byte-align helper: treat d (upper part) and s (lower part) as one
 * double-width value, shift it right by @shift bytes and store the low
 * register-width part of the result in d.
 *
 * @shift is a byte count.  When it is at least twice the register size in
 * bytes (16 << SHIFT) the result is all zeroes.
 */
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  int32_t shift)
{
    Reg r;

    /* XXX could be checked during translation */
    if (shift >= (16 << SHIFT)) {
        r.Q(0) = 0;
        XMM_ONLY(r.Q(1) = 0);
    } else {
        /* From here on, shift is a bit count. */
        shift <<= 3;
/*
 * SHR(v, i): shift v right by i bits for positive i, left by -i bits for
 * zero or negative i, and yield 0 when |i| >= 64 so no C shift is ever out
 * of range.
 */
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
        r.Q(0) = SHR(s->Q(0), shift - 0) |
            SHR(d->Q(0), shift -  64);
#else
        /*
         * Each result quadword ORs together the contribution of every
         * input quadword; SHR() zeroes the ones that fall outside it.
         */
        r.Q(0) = SHR(s->Q(0), shift - 0) |
            SHR(s->Q(1), shift -  64) |
            SHR(d->Q(0), shift - 128) |
            SHR(d->Q(1), shift - 192);
        r.Q(1) = SHR(s->Q(0), shift + 64) |
            SHR(s->Q(1), shift -   0) |
            SHR(d->Q(0), shift -  64) |
            SHR(d->Q(1), shift - 128);
#endif
#undef SHR
    }

    MOVE(*d, r);
}
1608
/* Guest register xmm0; requires a variable named env in scope. */
#define XMM0 (env->xmm_regs[0])
1610
1611#if SHIFT == 1
/*
 * Define a helper whose third operand is XMM0: every lane of d becomes
 * F(d lane, s lane, XMM0 lane).
 *
 * elem is the lane accessor (B, W, L, Q, ...) and num the number of lanes
 * to process; the nested "num >" tests select 2, 4, 8 or 16 lanes.
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));           \
        d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));           \
        if (num > 2) {                                                  \
            d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));       \
            d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));       \
            if (num > 4) {                                              \
                d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));   \
                d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));   \
                d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));   \
                d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));   \
                if (num > 8) {                                          \
                    d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \
                    d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \
                    d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \
                    d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \
                    d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \
                    d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \
                    d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \
                    d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \
                }                                                       \
            }                                                           \
        }                                                               \
    }
1638
/*
 * Define a helper controlled by an immediate bit mask: every lane i of d
 * becomes F(d lane, s lane, bit i of imm).
 *
 * elem is the lane accessor (B, W, L, Q, ...) and num the number of lanes
 * to process; the nested "num >" tests select 2, 4, 8 or 16 lanes.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \
    {                                                                   \
        d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));       \
        d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));       \
        if (num > 2) {                                                  \
            d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));   \
            d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));   \
            if (num > 4) {                                              \
                d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1)); \
                d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1)); \
                d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1)); \
                d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1)); \
                if (num > 8) {                                          \
                    d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1)); \
                    d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1)); \
                    d->elem(10) = F(d->elem(10), s->elem(10),           \
                                    ((imm >> 10) & 1));                 \
                    d->elem(11) = F(d->elem(11), s->elem(11),           \
                                    ((imm >> 11) & 1));                 \
                    d->elem(12) = F(d->elem(12), s->elem(12),           \
                                    ((imm >> 12) & 1));                 \
                    d->elem(13) = F(d->elem(13), s->elem(13),           \
                                    ((imm >> 13) & 1));                 \
                    d->elem(14) = F(d->elem(14), s->elem(14),           \
                                    ((imm >> 14) & 1));                 \
                    d->elem(15) = F(d->elem(15), s->elem(15),           \
                                    ((imm >> 15) & 1));                 \
                }                                                       \
            }                                                           \
        }                                                               \
    }
1671
1672/* SSE4.1 op helpers */
1673#define FBLENDVB(d, s, m) ((m & 0x80) ? s : d)
1674#define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d)
1675#define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d)
1676SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
1677SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
1678SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1679
1680void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1681{
1682    uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
1683    uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
1684
1685    CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1686}
1687
/*
 * Define helper <name><SUFFIX>(env, d, s) that stores F(i) into element
 * i of d for the low `num` elements (2, 4 or 8), using accessor `elem`.
 *
 * F is pasted in front of "(i)", so it is normally an element accessor
 * expression on s, optionally preceded by a cast.  Elements are written
 * from the highest index down to 0; with the widening conversions
 * instantiated from this macro that keeps narrow source elements from
 * being overwritten before they are read when d and s are the same
 * register.
 */
#define SSE_HELPER_F(name, elem, num, F)                          \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)     \
    {                                                             \
        if (num > 4) {                                            \
            d->elem(7) = F(7);                                    \
            d->elem(6) = F(6);                                    \
            d->elem(5) = F(5);                                    \
            d->elem(4) = F(4);                                    \
        }                                                         \
        if (num > 2) {                                            \
            d->elem(3) = F(3);                                    \
            d->elem(2) = F(2);                                    \
        }                                                         \
        d->elem(1) = F(1);                                        \
        d->elem(0) = F(0);                                        \
    }
1704
1705SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
1706SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
1707SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
1708SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
1709SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
1710SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
1711SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
1712SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
1713SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
1714SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
1715SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
1716SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
1717
1718void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1719{
1720    d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0);
1721    d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2);
1722}
1723
1724#define FCMPEQQ(d, s) (d == s ? -1 : 0)
1725SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1726
1727void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1728{
1729    Reg r;
1730
1731    r.W(0) = satuw((int32_t) d->L(0));
1732    r.W(1) = satuw((int32_t) d->L(1));
1733    r.W(2) = satuw((int32_t) d->L(2));
1734    r.W(3) = satuw((int32_t) d->L(3));
1735    r.W(4) = satuw((int32_t) s->L(0));
1736    r.W(5) = satuw((int32_t) s->L(1));
1737    r.W(6) = satuw((int32_t) s->L(2));
1738    r.W(7) = satuw((int32_t) s->L(3));
1739    MOVE(*d, r);
1740}
1741
1742#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
1743#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
1744#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
1745#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
1746SSE_HELPER_B(helper_pminsb, FMINSB)
1747SSE_HELPER_L(helper_pminsd, FMINSD)
1748SSE_HELPER_W(helper_pminuw, MIN)
1749SSE_HELPER_L(helper_pminud, MIN)
1750SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1751SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1752SSE_HELPER_W(helper_pmaxuw, MAX)
1753SSE_HELPER_L(helper_pmaxud, MAX)
1754
1755#define FMULLD(d, s) ((int32_t)d * (int32_t)s)
1756SSE_HELPER_L(helper_pmulld, FMULLD)
1757
1758void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
1759{
1760    int idx = 0;
1761
1762    if (s->W(1) < s->W(idx)) {
1763        idx = 1;
1764    }
1765    if (s->W(2) < s->W(idx)) {
1766        idx = 2;
1767    }
1768    if (s->W(3) < s->W(idx)) {
1769        idx = 3;
1770    }
1771    if (s->W(4) < s->W(idx)) {
1772        idx = 4;
1773    }
1774    if (s->W(5) < s->W(idx)) {
1775        idx = 5;
1776    }
1777    if (s->W(6) < s->W(idx)) {
1778        idx = 6;
1779    }
1780    if (s->W(7) < s->W(idx)) {
1781        idx = 7;
1782    }
1783
1784    d->W(0) = s->W(idx);
1785    d->W(1) = idx;
1786    d->L(1) = 0;
1787    d->Q(1) = 0;
1788}
1789
1790void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1791                                  uint32_t mode)
1792{
1793    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1794    signed char prev_rounding_mode;
1795
1796    prev_rounding_mode = env->sse_status.float_rounding_mode;
1797    if (!(mode & (1 << 2))) {
1798        switch (mode & 3) {
1799        case 0:
1800            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1801            break;
1802        case 1:
1803            set_float_rounding_mode(float_round_down, &env->sse_status);
1804            break;
1805        case 2:
1806            set_float_rounding_mode(float_round_up, &env->sse_status);
1807            break;
1808        case 3:
1809            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1810            break;
1811        }
1812    }
1813
1814    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1815    d->ZMM_S(1) = float32_round_to_int(s->ZMM_S(1), &env->sse_status);
1816    d->ZMM_S(2) = float32_round_to_int(s->ZMM_S(2), &env->sse_status);
1817    d->ZMM_S(3) = float32_round_to_int(s->ZMM_S(3), &env->sse_status);
1818
1819    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1820        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1821                                  ~float_flag_inexact,
1822                                  &env->sse_status);
1823    }
1824    env->sse_status.float_rounding_mode = prev_rounding_mode;
1825}
1826
1827void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1828                                  uint32_t mode)
1829{
1830    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1831    signed char prev_rounding_mode;
1832
1833    prev_rounding_mode = env->sse_status.float_rounding_mode;
1834    if (!(mode & (1 << 2))) {
1835        switch (mode & 3) {
1836        case 0:
1837            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1838            break;
1839        case 1:
1840            set_float_rounding_mode(float_round_down, &env->sse_status);
1841            break;
1842        case 2:
1843            set_float_rounding_mode(float_round_up, &env->sse_status);
1844            break;
1845        case 3:
1846            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1847            break;
1848        }
1849    }
1850
1851    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1852    d->ZMM_D(1) = float64_round_to_int(s->ZMM_D(1), &env->sse_status);
1853
1854    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1855        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1856                                  ~float_flag_inexact,
1857                                  &env->sse_status);
1858    }
1859    env->sse_status.float_rounding_mode = prev_rounding_mode;
1860}
1861
1862void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1863                                  uint32_t mode)
1864{
1865    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1866    signed char prev_rounding_mode;
1867
1868    prev_rounding_mode = env->sse_status.float_rounding_mode;
1869    if (!(mode & (1 << 2))) {
1870        switch (mode & 3) {
1871        case 0:
1872            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1873            break;
1874        case 1:
1875            set_float_rounding_mode(float_round_down, &env->sse_status);
1876            break;
1877        case 2:
1878            set_float_rounding_mode(float_round_up, &env->sse_status);
1879            break;
1880        case 3:
1881            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1882            break;
1883        }
1884    }
1885
1886    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
1887
1888    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1889        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1890                                  ~float_flag_inexact,
1891                                  &env->sse_status);
1892    }
1893    env->sse_status.float_rounding_mode = prev_rounding_mode;
1894}
1895
1896void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1897                                  uint32_t mode)
1898{
1899    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
1900    signed char prev_rounding_mode;
1901
1902    prev_rounding_mode = env->sse_status.float_rounding_mode;
1903    if (!(mode & (1 << 2))) {
1904        switch (mode & 3) {
1905        case 0:
1906            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1907            break;
1908        case 1:
1909            set_float_rounding_mode(float_round_down, &env->sse_status);
1910            break;
1911        case 2:
1912            set_float_rounding_mode(float_round_up, &env->sse_status);
1913            break;
1914        case 3:
1915            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1916            break;
1917        }
1918    }
1919
1920    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
1921
1922    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
1923        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1924                                  ~float_flag_inexact,
1925                                  &env->sse_status);
1926    }
1927    env->sse_status.float_rounding_mode = prev_rounding_mode;
1928}
1929
1930#define FBLENDP(d, s, m) (m ? s : d)
1931SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
1932SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
1933SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
1934
1935void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
1936{
1937    float32 iresult = float32_zero;
1938
1939    if (mask & (1 << 4)) {
1940        iresult = float32_add(iresult,
1941                              float32_mul(d->ZMM_S(0), s->ZMM_S(0),
1942                                          &env->sse_status),
1943                              &env->sse_status);
1944    }
1945    if (mask & (1 << 5)) {
1946        iresult = float32_add(iresult,
1947                              float32_mul(d->ZMM_S(1), s->ZMM_S(1),
1948                                          &env->sse_status),
1949                              &env->sse_status);
1950    }
1951    if (mask & (1 << 6)) {
1952        iresult = float32_add(iresult,
1953                              float32_mul(d->ZMM_S(2), s->ZMM_S(2),
1954                                          &env->sse_status),
1955                              &env->sse_status);
1956    }
1957    if (mask & (1 << 7)) {
1958        iresult = float32_add(iresult,
1959                              float32_mul(d->ZMM_S(3), s->ZMM_S(3),
1960                                          &env->sse_status),
1961                              &env->sse_status);
1962    }
1963    d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero;
1964    d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
1965    d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
1966    d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
1967}
1968
1969void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
1970{
1971    float64 iresult = float64_zero;
1972
1973    if (mask & (1 << 4)) {
1974        iresult = float64_add(iresult,
1975                              float64_mul(d->ZMM_D(0), s->ZMM_D(0),
1976                                          &env->sse_status),
1977                              &env->sse_status);
1978    }
1979    if (mask & (1 << 5)) {
1980        iresult = float64_add(iresult,
1981                              float64_mul(d->ZMM_D(1), s->ZMM_D(1),
1982                                          &env->sse_status),
1983                              &env->sse_status);
1984    }
1985    d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
1986    d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
1987}
1988
1989void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1990                                  uint32_t offset)
1991{
1992    int s0 = (offset & 3) << 2;
1993    int d0 = (offset & 4) << 0;
1994    int i;
1995    Reg r;
1996
1997    for (i = 0; i < 8; i++, d0++) {
1998        r.W(i) = 0;
1999        r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
2000        r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
2001        r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
2002        r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
2003    }
2004
2005    MOVE(*d, r);
2006}
2007
2008/* SSE4.2 op helpers */
2009#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
2010SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
2011
2012static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
2013{
2014    target_long val, limit;
2015
2016    /* Presence of REX.W is indicated by a bit higher than 7 set */
2017    if (ctrl >> 8) {
2018        val = (target_long)env->regs[reg];
2019    } else {
2020        val = (int32_t)env->regs[reg];
2021    }
2022    if (ctrl & 1) {
2023        limit = 8;
2024    } else {
2025        limit = 16;
2026    }
2027    if ((val > limit) || (val < -limit)) {
2028        return limit;
2029    }
2030    return abs1(val);
2031}
2032
2033static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
2034{
2035    int val = 0;
2036
2037    if (ctrl & 1) {
2038        while (val < 8 && r->W(val)) {
2039            val++;
2040        }
2041    } else {
2042        while (val < 16 && r->B(val)) {
2043            val++;
2044        }
2045    }
2046
2047    return val;
2048}
2049
2050static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
2051{
2052    switch ((ctrl >> 0) & 3) {
2053    case 0:
2054        return r->B(i);
2055    case 1:
2056        return r->W(i);
2057    case 2:
2058        return (int8_t)r->B(i);
2059    case 3:
2060    default:
2061        return (int16_t)r->W(i);
2062    }
2063}
2064
2065static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
2066                                 int8_t ctrl, int valids, int validd)
2067{
2068    unsigned int res = 0;
2069    int v;
2070    int j, i;
2071    int upper = (ctrl & 1) ? 7 : 15;
2072
2073    valids--;
2074    validd--;
2075
2076    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
2077
2078    switch ((ctrl >> 2) & 3) {
2079    case 0:
2080        for (j = valids; j >= 0; j--) {
2081            res <<= 1;
2082            v = pcmp_val(s, ctrl, j);
2083            for (i = validd; i >= 0; i--) {
2084                res |= (v == pcmp_val(d, ctrl, i));
2085            }
2086        }
2087        break;
2088    case 1:
2089        for (j = valids; j >= 0; j--) {
2090            res <<= 1;
2091            v = pcmp_val(s, ctrl, j);
2092            for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
2093                res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2094                        pcmp_val(d, ctrl, i - 1) <= v);
2095            }
2096        }
2097        break;
2098    case 2:
2099        res = (1 << (upper - MAX(valids, validd))) - 1;
2100        res <<= MAX(valids, validd) - MIN(valids, validd);
2101        for (i = MIN(valids, validd); i >= 0; i--) {
2102            res <<= 1;
2103            v = pcmp_val(s, ctrl, i);
2104            res |= (v == pcmp_val(d, ctrl, i));
2105        }
2106        break;
2107    case 3:
2108        if (validd == -1) {
2109            res = (2 << upper) - 1;
2110            break;
2111        }
2112        for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
2113            res <<= 1;
2114            v = 1;
2115            for (i = MIN(valids - j, validd); i >= 0; i--) {
2116                v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
2117            }
2118            res |= v;
2119        }
2120        break;
2121    }
2122
2123    switch ((ctrl >> 4) & 3) {
2124    case 1:
2125        res ^= (2 << upper) - 1;
2126        break;
2127    case 3:
2128        res ^= (1 << (valids + 1)) - 1;
2129        break;
2130    }
2131
2132    if (res) {
2133        CC_SRC |= CC_C;
2134    }
2135    if (res & 1) {
2136        CC_SRC |= CC_O;
2137    }
2138
2139    return res;
2140}
2141
2142void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2143                                    uint32_t ctrl)
2144{
2145    unsigned int res = pcmpxstrx(env, d, s, ctrl,
2146                                 pcmp_elen(env, R_EDX, ctrl),
2147                                 pcmp_elen(env, R_EAX, ctrl));
2148
2149    if (res) {
2150        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2151    } else {
2152        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2153    }
2154}
2155
2156void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2157                                    uint32_t ctrl)
2158{
2159    int i;
2160    unsigned int res = pcmpxstrx(env, d, s, ctrl,
2161                                 pcmp_elen(env, R_EDX, ctrl),
2162                                 pcmp_elen(env, R_EAX, ctrl));
2163
2164    if ((ctrl >> 6) & 1) {
2165        if (ctrl & 1) {
2166            for (i = 0; i < 8; i++, res >>= 1) {
2167                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2168            }
2169        } else {
2170            for (i = 0; i < 16; i++, res >>= 1) {
2171                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2172            }
2173        }
2174    } else {
2175        env->xmm_regs[0].Q(1) = 0;
2176        env->xmm_regs[0].Q(0) = res;
2177    }
2178}
2179
2180void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2181                                    uint32_t ctrl)
2182{
2183    unsigned int res = pcmpxstrx(env, d, s, ctrl,
2184                                 pcmp_ilen(s, ctrl),
2185                                 pcmp_ilen(d, ctrl));
2186
2187    if (res) {
2188        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
2189    } else {
2190        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
2191    }
2192}
2193
2194void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2195                                    uint32_t ctrl)
2196{
2197    int i;
2198    unsigned int res = pcmpxstrx(env, d, s, ctrl,
2199                                 pcmp_ilen(s, ctrl),
2200                                 pcmp_ilen(d, ctrl));
2201
2202    if ((ctrl >> 6) & 1) {
2203        if (ctrl & 1) {
2204            for (i = 0; i < 8; i++, res >>= 1) {
2205                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
2206            }
2207        } else {
2208            for (i = 0; i < 16; i++, res >>= 1) {
2209                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
2210            }
2211        }
2212    } else {
2213        env->xmm_regs[0].Q(1) = 0;
2214        env->xmm_regs[0].Q(0) = res;
2215    }
2216}
2217
2218#define CRCPOLY        0x1edc6f41
2219#define CRCPOLY_BITREV 0x82f63b78
2220target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2221{
2222    target_ulong crc = (msg & ((target_ulong) -1 >>
2223                               (TARGET_LONG_BITS - len))) ^ crc1;
2224
2225    while (len--) {
2226        crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
2227    }
2228
2229    return crc;
2230}
2231
2232void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2233                                    uint32_t ctrl)
2234{
2235    uint64_t ah, al, b, resh, resl;
2236
2237    ah = 0;
2238    al = d->Q((ctrl & 1) != 0);
2239    b = s->Q((ctrl & 16) != 0);
2240    resh = resl = 0;
2241
2242    while (b) {
2243        if (b & 1) {
2244            resl ^= al;
2245            resh ^= ah;
2246        }
2247        ah = (ah << 1) | (al >> 63);
2248        al <<= 1;
2249        b >>= 1;
2250    }
2251
2252    d->Q(0) = resl;
2253    d->Q(1) = resh;
2254}
2255
2256void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2257{
2258    int i;
2259    Reg st = *d;
2260    Reg rk = *s;
2261
2262    for (i = 0 ; i < 4 ; i++) {
2263        d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
2264                                    AES_Td1[st.B(AES_ishifts[4*i+1])] ^
2265                                    AES_Td2[st.B(AES_ishifts[4*i+2])] ^
2266                                    AES_Td3[st.B(AES_ishifts[4*i+3])]);
2267    }
2268}
2269
2270void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2271{
2272    int i;
2273    Reg st = *d;
2274    Reg rk = *s;
2275
2276    for (i = 0; i < 16; i++) {
2277        d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
2278    }
2279}
2280
2281void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2282{
2283    int i;
2284    Reg st = *d;
2285    Reg rk = *s;
2286
2287    for (i = 0 ; i < 4 ; i++) {
2288        d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
2289                                    AES_Te1[st.B(AES_shifts[4*i+1])] ^
2290                                    AES_Te2[st.B(AES_shifts[4*i+2])] ^
2291                                    AES_Te3[st.B(AES_shifts[4*i+3])]);
2292    }
2293}
2294
2295void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2296{
2297    int i;
2298    Reg st = *d;
2299    Reg rk = *s;
2300
2301    for (i = 0; i < 16; i++) {
2302        d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
2303    }
2304
2305}
2306
2307void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2308{
2309    int i;
2310    Reg tmp = *s;
2311
2312    for (i = 0 ; i < 4 ; i++) {
2313        d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
2314                          AES_imc[tmp.B(4*i+1)][1] ^
2315                          AES_imc[tmp.B(4*i+2)][2] ^
2316                          AES_imc[tmp.B(4*i+3)][3]);
2317    }
2318}
2319
2320void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2321                                          uint32_t ctrl)
2322{
2323    int i;
2324    Reg tmp = *s;
2325
2326    for (i = 0 ; i < 4 ; i++) {
2327        d->B(i) = AES_sbox[tmp.B(i + 4)];
2328        d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
2329    }
2330    d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2331    d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2332}
2333#endif
2334
/*
 * Remove the per-SHIFT template macros defined at the top of this file,
 * together with SHIFT itself (presumably so that the file can be
 * included again with a different SHIFT value).
 */
#undef SHIFT
#undef Reg
#undef SIZE
#undef XMM_ONLY
#undef B
#undef W
#undef L
#undef Q
#undef SUFFIX
2344