qemu/target/i386/fpu_helper.c
<<
>>
Prefs
   1/*
   2 *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include <math.h>
  22#include "cpu.h"
  23#include "exec/helper-proto.h"
  24#include "qemu/host-utils.h"
  25#include "exec/exec-all.h"
  26#include "exec/cpu_ldst.h"
  27#include "fpu/softfloat.h"
  28#include "fpu/softfloat-macros.h"
  29
  30#ifdef CONFIG_SOFTMMU
  31#include "hw/irq.h"
  32#endif
  33
  34#define FPU_RC_MASK         0xc00
  35#define FPU_RC_NEAR         0x000
  36#define FPU_RC_DOWN         0x400
  37#define FPU_RC_UP           0x800
  38#define FPU_RC_CHOP         0xc00
  39
  40#define MAXTAN 9223372036854775808.0
  41
  42/* the following deal with x86 long double-precision numbers */
  43#define MAXEXPD 0x7fff
  44#define EXPBIAS 16383
  45#define EXPD(fp)        (fp.l.upper & 0x7fff)
  46#define SIGND(fp)       ((fp.l.upper) & 0x8000)
  47#define MANTD(fp)       (fp.l.lower)
  48#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  49
  50#define FPUS_IE (1 << 0)
  51#define FPUS_DE (1 << 1)
  52#define FPUS_ZE (1 << 2)
  53#define FPUS_OE (1 << 3)
  54#define FPUS_UE (1 << 4)
  55#define FPUS_PE (1 << 5)
  56#define FPUS_SF (1 << 6)
  57#define FPUS_SE (1 << 7)
  58#define FPUS_B  (1 << 15)
  59
  60#define FPUC_EM 0x3f
  61
/*
 * Constants loaded by the helper_fld*_ST0() functions below, as
 * (sign/exponent word, 64-bit significand) pairs.  The "_d" and "_u"
 * variants differ from the plain value by one unit in the last place;
 * the helpers pick "_d" for the DOWN and CHOP rounding-control settings
 * and "_u" for UP.  Only the "_d" variants of ln2 and pi are defined
 * here; the helpers take floatx80_ln2 and floatx80_pi from elsewhere.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  70
  71#if !defined(CONFIG_USER_ONLY)
  72static qemu_irq ferr_irq;
  73
  74void x86_register_ferr_irq(qemu_irq irq)
  75{
  76    ferr_irq = irq;
  77}
  78
  79static void cpu_clear_ignne(void)
  80{
  81    CPUX86State *env = &X86_CPU(first_cpu)->env;
  82    env->hflags2 &= ~HF2_IGNNE_MASK;
  83}
  84
  85void cpu_set_ignne(void)
  86{
  87    CPUX86State *env = &X86_CPU(first_cpu)->env;
  88    env->hflags2 |= HF2_IGNNE_MASK;
  89    /*
  90     * We get here in response to a write to port F0h.  The chipset should
  91     * deassert FP_IRQ and FERR# instead should stay signaled until FPSW_SE is
  92     * cleared, because FERR# and FP_IRQ are two separate pins on real
  93     * hardware.  However, we don't model FERR# as a qemu_irq, so we just
  94     * do directly what the chipset would do, i.e. deassert FP_IRQ.
  95     */
  96    qemu_irq_lower(ferr_irq);
  97}
  98#endif
  99
 100
 101static inline void fpush(CPUX86State *env)
 102{
 103    env->fpstt = (env->fpstt - 1) & 7;
 104    env->fptags[env->fpstt] = 0; /* validate stack entry */
 105}
 106
 107static inline void fpop(CPUX86State *env)
 108{
 109    env->fptags[env->fpstt] = 1; /* invalidate stack entry */
 110    env->fpstt = (env->fpstt + 1) & 7;
 111}
 112
 113static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
 114                                   uintptr_t retaddr)
 115{
 116    CPU_LDoubleU temp;
 117
 118    temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
 119    temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
 120    return temp.d;
 121}
 122
 123static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
 124                               uintptr_t retaddr)
 125{
 126    CPU_LDoubleU temp;
 127
 128    temp.d = f;
 129    cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 130    cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 131}
 132
 133/* x87 FPU helpers */
 134
 135static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 136{
 137    union {
 138        float64 f64;
 139        double d;
 140    } u;
 141
 142    u.f64 = floatx80_to_float64(a, &env->fp_status);
 143    return u.d;
 144}
 145
 146static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 147{
 148    union {
 149        float64 f64;
 150        double d;
 151    } u;
 152
 153    u.d = a;
 154    return float64_to_floatx80(u.f64, &env->fp_status);
 155}
 156
 157static void fpu_set_exception(CPUX86State *env, int mask)
 158{
 159    env->fpus |= mask;
 160    if (env->fpus & (~env->fpuc & FPUC_EM)) {
 161        env->fpus |= FPUS_SE | FPUS_B;
 162    }
 163}
 164
 165static inline uint8_t save_exception_flags(CPUX86State *env)
 166{
 167    uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 168    set_float_exception_flags(0, &env->fp_status);
 169    return old_flags;
 170}
 171
 172static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 173{
 174    uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 175    float_raise(old_flags, &env->fp_status);
 176    fpu_set_exception(env,
 177                      ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 178                       (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 179                       (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 180                       (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 181                       (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 182                       (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 183}
 184
 185static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 186{
 187    uint8_t old_flags = save_exception_flags(env);
 188    floatx80 ret = floatx80_div(a, b, &env->fp_status);
 189    merge_exception_flags(env, old_flags);
 190    return ret;
 191}
 192
 193static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 194{
 195    if (env->cr[0] & CR0_NE_MASK) {
 196        raise_exception_ra(env, EXCP10_COPR, retaddr);
 197    }
 198#if !defined(CONFIG_USER_ONLY)
 199    else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
 200        qemu_irq_raise(ferr_irq);
 201    }
 202#endif
 203}
 204
 205void helper_flds_FT0(CPUX86State *env, uint32_t val)
 206{
 207    uint8_t old_flags = save_exception_flags(env);
 208    union {
 209        float32 f;
 210        uint32_t i;
 211    } u;
 212
 213    u.i = val;
 214    FT0 = float32_to_floatx80(u.f, &env->fp_status);
 215    merge_exception_flags(env, old_flags);
 216}
 217
 218void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 219{
 220    uint8_t old_flags = save_exception_flags(env);
 221    union {
 222        float64 f;
 223        uint64_t i;
 224    } u;
 225
 226    u.i = val;
 227    FT0 = float64_to_floatx80(u.f, &env->fp_status);
 228    merge_exception_flags(env, old_flags);
 229}
 230
 231void helper_fildl_FT0(CPUX86State *env, int32_t val)
 232{
 233    FT0 = int32_to_floatx80(val, &env->fp_status);
 234}
 235
 236void helper_flds_ST0(CPUX86State *env, uint32_t val)
 237{
 238    uint8_t old_flags = save_exception_flags(env);
 239    int new_fpstt;
 240    union {
 241        float32 f;
 242        uint32_t i;
 243    } u;
 244
 245    new_fpstt = (env->fpstt - 1) & 7;
 246    u.i = val;
 247    env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 248    env->fpstt = new_fpstt;
 249    env->fptags[new_fpstt] = 0; /* validate stack entry */
 250    merge_exception_flags(env, old_flags);
 251}
 252
 253void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 254{
 255    uint8_t old_flags = save_exception_flags(env);
 256    int new_fpstt;
 257    union {
 258        float64 f;
 259        uint64_t i;
 260    } u;
 261
 262    new_fpstt = (env->fpstt - 1) & 7;
 263    u.i = val;
 264    env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 265    env->fpstt = new_fpstt;
 266    env->fptags[new_fpstt] = 0; /* validate stack entry */
 267    merge_exception_flags(env, old_flags);
 268}
 269
 270void helper_fildl_ST0(CPUX86State *env, int32_t val)
 271{
 272    int new_fpstt;
 273
 274    new_fpstt = (env->fpstt - 1) & 7;
 275    env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 276    env->fpstt = new_fpstt;
 277    env->fptags[new_fpstt] = 0; /* validate stack entry */
 278}
 279
 280void helper_fildll_ST0(CPUX86State *env, int64_t val)
 281{
 282    int new_fpstt;
 283
 284    new_fpstt = (env->fpstt - 1) & 7;
 285    env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 286    env->fpstt = new_fpstt;
 287    env->fptags[new_fpstt] = 0; /* validate stack entry */
 288}
 289
 290uint32_t helper_fsts_ST0(CPUX86State *env)
 291{
 292    uint8_t old_flags = save_exception_flags(env);
 293    union {
 294        float32 f;
 295        uint32_t i;
 296    } u;
 297
 298    u.f = floatx80_to_float32(ST0, &env->fp_status);
 299    merge_exception_flags(env, old_flags);
 300    return u.i;
 301}
 302
 303uint64_t helper_fstl_ST0(CPUX86State *env)
 304{
 305    uint8_t old_flags = save_exception_flags(env);
 306    union {
 307        float64 f;
 308        uint64_t i;
 309    } u;
 310
 311    u.f = floatx80_to_float64(ST0, &env->fp_status);
 312    merge_exception_flags(env, old_flags);
 313    return u.i;
 314}
 315
 316int32_t helper_fist_ST0(CPUX86State *env)
 317{
 318    uint8_t old_flags = save_exception_flags(env);
 319    int32_t val;
 320
 321    val = floatx80_to_int32(ST0, &env->fp_status);
 322    if (val != (int16_t)val) {
 323        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 324        val = -32768;
 325    }
 326    merge_exception_flags(env, old_flags);
 327    return val;
 328}
 329
 330int32_t helper_fistl_ST0(CPUX86State *env)
 331{
 332    uint8_t old_flags = save_exception_flags(env);
 333    int32_t val;
 334
 335    val = floatx80_to_int32(ST0, &env->fp_status);
 336    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 337        val = 0x80000000;
 338    }
 339    merge_exception_flags(env, old_flags);
 340    return val;
 341}
 342
 343int64_t helper_fistll_ST0(CPUX86State *env)
 344{
 345    uint8_t old_flags = save_exception_flags(env);
 346    int64_t val;
 347
 348    val = floatx80_to_int64(ST0, &env->fp_status);
 349    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 350        val = 0x8000000000000000ULL;
 351    }
 352    merge_exception_flags(env, old_flags);
 353    return val;
 354}
 355
 356int32_t helper_fistt_ST0(CPUX86State *env)
 357{
 358    uint8_t old_flags = save_exception_flags(env);
 359    int32_t val;
 360
 361    val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 362    if (val != (int16_t)val) {
 363        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 364        val = -32768;
 365    }
 366    merge_exception_flags(env, old_flags);
 367    return val;
 368}
 369
 370int32_t helper_fisttl_ST0(CPUX86State *env)
 371{
 372    uint8_t old_flags = save_exception_flags(env);
 373    int32_t val;
 374
 375    val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 376    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 377        val = 0x80000000;
 378    }
 379    merge_exception_flags(env, old_flags);
 380    return val;
 381}
 382
 383int64_t helper_fisttll_ST0(CPUX86State *env)
 384{
 385    uint8_t old_flags = save_exception_flags(env);
 386    int64_t val;
 387
 388    val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 389    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 390        val = 0x8000000000000000ULL;
 391    }
 392    merge_exception_flags(env, old_flags);
 393    return val;
 394}
 395
 396void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 397{
 398    int new_fpstt;
 399
 400    new_fpstt = (env->fpstt - 1) & 7;
 401    env->fpregs[new_fpstt].d = helper_fldt(env, ptr, GETPC());
 402    env->fpstt = new_fpstt;
 403    env->fptags[new_fpstt] = 0; /* validate stack entry */
 404}
 405
 406void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 407{
 408    helper_fstt(env, ST0, ptr, GETPC());
 409}
 410
 411void helper_fpush(CPUX86State *env)
 412{
 413    fpush(env);
 414}
 415
 416void helper_fpop(CPUX86State *env)
 417{
 418    fpop(env);
 419}
 420
 421void helper_fdecstp(CPUX86State *env)
 422{
 423    env->fpstt = (env->fpstt - 1) & 7;
 424    env->fpus &= ~0x4700;
 425}
 426
 427void helper_fincstp(CPUX86State *env)
 428{
 429    env->fpstt = (env->fpstt + 1) & 7;
 430    env->fpus &= ~0x4700;
 431}
 432
 433/* FPU move */
 434
 435void helper_ffree_STN(CPUX86State *env, int st_index)
 436{
 437    env->fptags[(env->fpstt + st_index) & 7] = 1;
 438}
 439
 440void helper_fmov_ST0_FT0(CPUX86State *env)
 441{
 442    ST0 = FT0;
 443}
 444
 445void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 446{
 447    FT0 = ST(st_index);
 448}
 449
 450void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 451{
 452    ST0 = ST(st_index);
 453}
 454
 455void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 456{
 457    ST(st_index) = ST0;
 458}
 459
 460void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 461{
 462    floatx80 tmp;
 463
 464    tmp = ST(st_index);
 465    ST(st_index) = ST0;
 466    ST0 = tmp;
 467}
 468
 469/* FPU operations */
 470
 471static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 472
 473void helper_fcom_ST0_FT0(CPUX86State *env)
 474{
 475    uint8_t old_flags = save_exception_flags(env);
 476    FloatRelation ret;
 477
 478    ret = floatx80_compare(ST0, FT0, &env->fp_status);
 479    env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 480    merge_exception_flags(env, old_flags);
 481}
 482
 483void helper_fucom_ST0_FT0(CPUX86State *env)
 484{
 485    uint8_t old_flags = save_exception_flags(env);
 486    FloatRelation ret;
 487
 488    ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 489    env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 490    merge_exception_flags(env, old_flags);
 491}
 492
 493static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 494
 495void helper_fcomi_ST0_FT0(CPUX86State *env)
 496{
 497    uint8_t old_flags = save_exception_flags(env);
 498    int eflags;
 499    FloatRelation ret;
 500
 501    ret = floatx80_compare(ST0, FT0, &env->fp_status);
 502    eflags = cpu_cc_compute_all(env, CC_OP);
 503    eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 504    CC_SRC = eflags;
 505    merge_exception_flags(env, old_flags);
 506}
 507
 508void helper_fucomi_ST0_FT0(CPUX86State *env)
 509{
 510    uint8_t old_flags = save_exception_flags(env);
 511    int eflags;
 512    FloatRelation ret;
 513
 514    ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 515    eflags = cpu_cc_compute_all(env, CC_OP);
 516    eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 517    CC_SRC = eflags;
 518    merge_exception_flags(env, old_flags);
 519}
 520
 521void helper_fadd_ST0_FT0(CPUX86State *env)
 522{
 523    uint8_t old_flags = save_exception_flags(env);
 524    ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 525    merge_exception_flags(env, old_flags);
 526}
 527
 528void helper_fmul_ST0_FT0(CPUX86State *env)
 529{
 530    uint8_t old_flags = save_exception_flags(env);
 531    ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 532    merge_exception_flags(env, old_flags);
 533}
 534
 535void helper_fsub_ST0_FT0(CPUX86State *env)
 536{
 537    uint8_t old_flags = save_exception_flags(env);
 538    ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 539    merge_exception_flags(env, old_flags);
 540}
 541
 542void helper_fsubr_ST0_FT0(CPUX86State *env)
 543{
 544    uint8_t old_flags = save_exception_flags(env);
 545    ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 546    merge_exception_flags(env, old_flags);
 547}
 548
 549void helper_fdiv_ST0_FT0(CPUX86State *env)
 550{
 551    ST0 = helper_fdiv(env, ST0, FT0);
 552}
 553
 554void helper_fdivr_ST0_FT0(CPUX86State *env)
 555{
 556    ST0 = helper_fdiv(env, FT0, ST0);
 557}
 558
 559/* fp operations between STN and ST0 */
 560
 561void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 562{
 563    uint8_t old_flags = save_exception_flags(env);
 564    ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 565    merge_exception_flags(env, old_flags);
 566}
 567
 568void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 569{
 570    uint8_t old_flags = save_exception_flags(env);
 571    ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 572    merge_exception_flags(env, old_flags);
 573}
 574
 575void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 576{
 577    uint8_t old_flags = save_exception_flags(env);
 578    ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 579    merge_exception_flags(env, old_flags);
 580}
 581
 582void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 583{
 584    uint8_t old_flags = save_exception_flags(env);
 585    ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 586    merge_exception_flags(env, old_flags);
 587}
 588
 589void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 590{
 591    floatx80 *p;
 592
 593    p = &ST(st_index);
 594    *p = helper_fdiv(env, *p, ST0);
 595}
 596
 597void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 598{
 599    floatx80 *p;
 600
 601    p = &ST(st_index);
 602    *p = helper_fdiv(env, ST0, *p);
 603}
 604
 605/* misc FPU operations */
 606void helper_fchs_ST0(CPUX86State *env)
 607{
 608    ST0 = floatx80_chs(ST0);
 609}
 610
 611void helper_fabs_ST0(CPUX86State *env)
 612{
 613    ST0 = floatx80_abs(ST0);
 614}
 615
 616void helper_fld1_ST0(CPUX86State *env)
 617{
 618    ST0 = floatx80_one;
 619}
 620
 621void helper_fldl2t_ST0(CPUX86State *env)
 622{
 623    switch (env->fpuc & FPU_RC_MASK) {
 624    case FPU_RC_UP:
 625        ST0 = floatx80_l2t_u;
 626        break;
 627    default:
 628        ST0 = floatx80_l2t;
 629        break;
 630    }
 631}
 632
 633void helper_fldl2e_ST0(CPUX86State *env)
 634{
 635    switch (env->fpuc & FPU_RC_MASK) {
 636    case FPU_RC_DOWN:
 637    case FPU_RC_CHOP:
 638        ST0 = floatx80_l2e_d;
 639        break;
 640    default:
 641        ST0 = floatx80_l2e;
 642        break;
 643    }
 644}
 645
 646void helper_fldpi_ST0(CPUX86State *env)
 647{
 648    switch (env->fpuc & FPU_RC_MASK) {
 649    case FPU_RC_DOWN:
 650    case FPU_RC_CHOP:
 651        ST0 = floatx80_pi_d;
 652        break;
 653    default:
 654        ST0 = floatx80_pi;
 655        break;
 656    }
 657}
 658
 659void helper_fldlg2_ST0(CPUX86State *env)
 660{
 661    switch (env->fpuc & FPU_RC_MASK) {
 662    case FPU_RC_DOWN:
 663    case FPU_RC_CHOP:
 664        ST0 = floatx80_lg2_d;
 665        break;
 666    default:
 667        ST0 = floatx80_lg2;
 668        break;
 669    }
 670}
 671
 672void helper_fldln2_ST0(CPUX86State *env)
 673{
 674    switch (env->fpuc & FPU_RC_MASK) {
 675    case FPU_RC_DOWN:
 676    case FPU_RC_CHOP:
 677        ST0 = floatx80_ln2_d;
 678        break;
 679    default:
 680        ST0 = floatx80_ln2;
 681        break;
 682    }
 683}
 684
 685void helper_fldz_ST0(CPUX86State *env)
 686{
 687    ST0 = floatx80_zero;
 688}
 689
 690void helper_fldz_FT0(CPUX86State *env)
 691{
 692    FT0 = floatx80_zero;
 693}
 694
 695uint32_t helper_fnstsw(CPUX86State *env)
 696{
 697    return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 698}
 699
 700uint32_t helper_fnstcw(CPUX86State *env)
 701{
 702    return env->fpuc;
 703}
 704
 705void update_fp_status(CPUX86State *env)
 706{
 707    int rnd_type;
 708
 709    /* set rounding mode */
 710    switch (env->fpuc & FPU_RC_MASK) {
 711    default:
 712    case FPU_RC_NEAR:
 713        rnd_type = float_round_nearest_even;
 714        break;
 715    case FPU_RC_DOWN:
 716        rnd_type = float_round_down;
 717        break;
 718    case FPU_RC_UP:
 719        rnd_type = float_round_up;
 720        break;
 721    case FPU_RC_CHOP:
 722        rnd_type = float_round_to_zero;
 723        break;
 724    }
 725    set_float_rounding_mode(rnd_type, &env->fp_status);
 726    switch ((env->fpuc >> 8) & 3) {
 727    case 0:
 728        rnd_type = 32;
 729        break;
 730    case 2:
 731        rnd_type = 64;
 732        break;
 733    case 3:
 734    default:
 735        rnd_type = 80;
 736        break;
 737    }
 738    set_floatx80_rounding_precision(rnd_type, &env->fp_status);
 739}
 740
 741void helper_fldcw(CPUX86State *env, uint32_t val)
 742{
 743    cpu_set_fpuc(env, val);
 744}
 745
 746void helper_fclex(CPUX86State *env)
 747{
 748    env->fpus &= 0x7f00;
 749}
 750
 751void helper_fwait(CPUX86State *env)
 752{
 753    if (env->fpus & FPUS_SE) {
 754        fpu_raise_exception(env, GETPC());
 755    }
 756}
 757
 758void helper_fninit(CPUX86State *env)
 759{
 760    env->fpus = 0;
 761    env->fpstt = 0;
 762    cpu_set_fpuc(env, 0x37f);
 763    env->fptags[0] = 1;
 764    env->fptags[1] = 1;
 765    env->fptags[2] = 1;
 766    env->fptags[3] = 1;
 767    env->fptags[4] = 1;
 768    env->fptags[5] = 1;
 769    env->fptags[6] = 1;
 770    env->fptags[7] = 1;
 771}
 772
 773/* BCD ops */
 774
 775void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 776{
 777    floatx80 tmp;
 778    uint64_t val;
 779    unsigned int v;
 780    int i;
 781
 782    val = 0;
 783    for (i = 8; i >= 0; i--) {
 784        v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 785        val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 786    }
 787    tmp = int64_to_floatx80(val, &env->fp_status);
 788    if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 789        tmp = floatx80_chs(tmp);
 790    }
 791    fpush(env);
 792    ST0 = tmp;
 793}
 794
 795void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 796{
 797    uint8_t old_flags = save_exception_flags(env);
 798    int v;
 799    target_ulong mem_ref, mem_end;
 800    int64_t val;
 801    CPU_LDoubleU temp;
 802
 803    temp.d = ST0;
 804
 805    val = floatx80_to_int64(ST0, &env->fp_status);
 806    mem_ref = ptr;
 807    if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 808        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 809        while (mem_ref < ptr + 7) {
 810            cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 811        }
 812        cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 813        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 814        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 815        merge_exception_flags(env, old_flags);
 816        return;
 817    }
 818    mem_end = mem_ref + 9;
 819    if (SIGND(temp)) {
 820        cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 821        val = -val;
 822    } else {
 823        cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 824    }
 825    while (mem_ref < mem_end) {
 826        if (val == 0) {
 827            break;
 828        }
 829        v = val % 100;
 830        val = val / 100;
 831        v = ((v / 10) << 4) | (v % 10);
 832        cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 833    }
 834    while (mem_ref < mem_end) {
 835        cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 836    }
 837    merge_exception_flags(env, old_flags);
 838}
 839
/*
 * 128-bit significand of log(2), split into two 64-bit halves.  The high
 * half has the same bits as the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * Coefficient 0 is given as two parts: f2xm1_coeff_0 and the much smaller
 * f2xm1_coeff_0_low, whose sign bit is set.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 857
/*
 * One entry of f2xm1_table: a reference point t together with 2^t and
 * 2^t - 1 at that point.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
 869
 870static const struct f2xm1_data f2xm1_table[65] = {
 871    { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 872      make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 873      make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 874    { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 875      make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 876      make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 877    { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 878      make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 879      make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 880    { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 881      make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 882      make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 883    { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 884      make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 885      make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 886    { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 887      make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 888      make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 889    { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 890      make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 891      make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 892    { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 893      make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 894      make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 895    { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 896      make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 897      make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 898    { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 899      make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 900      make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 901    { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 902      make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 903      make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 904    { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 905      make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 906      make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 907    { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 908      make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 909      make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 910    { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 911      make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 912      make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 913    { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 914      make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 915      make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 916    { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 917      make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 918      make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 919    { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 920      make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 921      make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 922    { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 923      make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 924      make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 925    { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 926      make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 927      make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 928    { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 929      make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 930      make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 931    { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 932      make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 933      make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 934    { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 935      make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 936      make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 937    { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 938      make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 939      make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 940    { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 941      make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 942      make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 943    { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 944      make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 945      make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 946    { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 947      make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 948      make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 949    { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 950      make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 951      make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 952    { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 953      make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 954      make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 955    { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 956      make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 957      make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 958    { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 959      make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 960      make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 961    { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 962      make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 963      make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 964    { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 965      make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 966      make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 967    { floatx80_zero_init,
 968      make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 969      floatx80_zero_init },
 970    { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 971      make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 972      make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 973    { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 974      make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 975      make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 976    { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 977      make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 978      make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 979    { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 980      make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 981      make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 982    { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 983      make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 984      make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 985    { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 986      make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 987      make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 988    { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 989      make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 990      make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 991    { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 992      make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
 993      make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
 994    { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
 995      make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
 996      make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
 997    { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
 998      make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
 999      make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1000    { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1001      make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1002      make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1003    { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1004      make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1005      make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1006    { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1007      make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1008      make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1009    { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1010      make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1011      make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1012    { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1013      make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1014      make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1015    { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1016      make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1017      make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1018    { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1019      make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1020      make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1021    { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1022      make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1023      make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1024    { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1025      make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1026      make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1027    { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1028      make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1029      make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1030    { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1031      make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1032      make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1033    { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1034      make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1035      make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1036    { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1037      make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1038      make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1039    { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1040      make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1041      make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1042    { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1043      make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1044      make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1045    { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1046      make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1047      make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1048    { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1049      make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1050      make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1051    { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1052      make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1053      make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1054    { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1055      make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1056      make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1057    { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1058      make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1059      make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1060    { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1061      make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1062      make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1063    { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1064      make_floatx80_init(0x4000, 0x8000000000000000ULL),
1065      make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1066};
1067
1068void helper_f2xm1(CPUX86State *env)
1069{
1070    uint8_t old_flags = save_exception_flags(env);
1071    uint64_t sig = extractFloatx80Frac(ST0);
1072    int32_t exp = extractFloatx80Exp(ST0);
1073    bool sign = extractFloatx80Sign(ST0);
1074
1075    if (floatx80_invalid_encoding(ST0)) {
1076        float_raise(float_flag_invalid, &env->fp_status);
1077        ST0 = floatx80_default_nan(&env->fp_status);
1078    } else if (floatx80_is_any_nan(ST0)) {
1079        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1080            float_raise(float_flag_invalid, &env->fp_status);
1081            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1082        }
1083    } else if (exp > 0x3fff ||
1084               (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1085        /* Out of range for the instruction, treat as invalid.  */
1086        float_raise(float_flag_invalid, &env->fp_status);
1087        ST0 = floatx80_default_nan(&env->fp_status);
1088    } else if (exp == 0x3fff) {
1089        /* Argument 1 or -1, exact result 1 or -0.5.  */
1090        if (sign) {
1091            ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1092        }
1093    } else if (exp < 0x3fb0) {
1094        if (!floatx80_is_zero(ST0)) {
1095            /*
1096             * Multiplying the argument by an extra-precision version
1097             * of log(2) is sufficiently precise.  Zero arguments are
1098             * returned unchanged.
1099             */
1100            uint64_t sig0, sig1, sig2;
1101            if (exp == 0) {
1102                normalizeFloatx80Subnormal(sig, &exp, &sig);
1103            }
1104            mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1105                            &sig2);
1106            /* This result is inexact.  */
1107            sig1 |= 1;
1108            ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1109                                                &env->fp_status);
1110        }
1111    } else {
1112        floatx80 tmp, y, accum;
1113        bool asign, bsign;
1114        int32_t n, aexp, bexp;
1115        uint64_t asig0, asig1, asig2, bsig0, bsig1;
1116        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1117        signed char save_prec = env->fp_status.floatx80_rounding_precision;
1118        env->fp_status.float_rounding_mode = float_round_nearest_even;
1119        env->fp_status.floatx80_rounding_precision = 80;
1120
1121        /* Find the nearest multiple of 1/32 to the argument.  */
1122        tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1123        n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1124        y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1125
1126        if (floatx80_is_zero(y)) {
1127            /*
1128             * Use the value of 2^t - 1 from the table, to avoid
1129             * needing to special-case zero as a result of
1130             * multiplication below.
1131             */
1132            ST0 = f2xm1_table[n].t;
1133            set_float_exception_flags(float_flag_inexact, &env->fp_status);
1134            env->fp_status.float_rounding_mode = save_mode;
1135        } else {
1136            /*
1137             * Compute the lower parts of a polynomial expansion for
1138             * (2^y - 1) / y.
1139             */
1140            accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1141            accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1142            accum = floatx80_mul(accum, y, &env->fp_status);
1143            accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1144            accum = floatx80_mul(accum, y, &env->fp_status);
1145            accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1146            accum = floatx80_mul(accum, y, &env->fp_status);
1147            accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1148            accum = floatx80_mul(accum, y, &env->fp_status);
1149            accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1150            accum = floatx80_mul(accum, y, &env->fp_status);
1151            accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1152            accum = floatx80_mul(accum, y, &env->fp_status);
1153            accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1154
1155            /*
1156             * The full polynomial expansion is f2xm1_coeff_0 + accum
1157             * (where accum has much lower magnitude, and so, in
1158             * particular, carry out of the addition is not possible).
1159             * (This expansion is only accurate to about 70 bits, not
1160             * 128 bits.)
1161             */
1162            aexp = extractFloatx80Exp(f2xm1_coeff_0);
1163            asign = extractFloatx80Sign(f2xm1_coeff_0);
1164            shift128RightJamming(extractFloatx80Frac(accum), 0,
1165                                 aexp - extractFloatx80Exp(accum),
1166                                 &asig0, &asig1);
1167            bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1168            bsig1 = 0;
1169            if (asign == extractFloatx80Sign(accum)) {
1170                add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1171            } else {
1172                sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1173            }
1174            /* And thus compute an approximation to 2^y - 1.  */
1175            mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1176                            &asig0, &asig1, &asig2);
1177            aexp += extractFloatx80Exp(y) - 0x3ffe;
1178            asign ^= extractFloatx80Sign(y);
1179            if (n != 32) {
1180                /*
1181                 * Multiply this by the precomputed value of 2^t and
1182                 * add that of 2^t - 1.
1183                 */
1184                mul128By64To192(asig0, asig1,
1185                                extractFloatx80Frac(f2xm1_table[n].exp2),
1186                                &asig0, &asig1, &asig2);
1187                aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1188                bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1189                bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1190                bsig1 = 0;
1191                if (bexp < aexp) {
1192                    shift128RightJamming(bsig0, bsig1, aexp - bexp,
1193                                         &bsig0, &bsig1);
1194                } else if (aexp < bexp) {
1195                    shift128RightJamming(asig0, asig1, bexp - aexp,
1196                                         &asig0, &asig1);
1197                    aexp = bexp;
1198                }
1199                /* The sign of 2^t - 1 is always that of the result.  */
1200                bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1201                if (asign == bsign) {
1202                    /* Avoid possible carry out of the addition.  */
1203                    shift128RightJamming(asig0, asig1, 1,
1204                                         &asig0, &asig1);
1205                    shift128RightJamming(bsig0, bsig1, 1,
1206                                         &bsig0, &bsig1);
1207                    ++aexp;
1208                    add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1209                } else {
1210                    sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1211                    asign = bsign;
1212                }
1213            }
1214            env->fp_status.float_rounding_mode = save_mode;
1215            /* This result is inexact.  */
1216            asig1 |= 1;
1217            ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1218                                                &env->fp_status);
1219        }
1220
1221        env->fp_status.floatx80_rounding_precision = save_prec;
1222    }
1223    merge_exception_flags(env, old_flags);
1224}
1225
1226void helper_fptan(CPUX86State *env)
1227{
1228    double fptemp = floatx80_to_double(env, ST0);
1229
1230    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1231        env->fpus |= 0x400;
1232    } else {
1233        fptemp = tan(fptemp);
1234        ST0 = double_to_floatx80(env, fptemp);
1235        fpush(env);
1236        ST0 = floatx80_one;
1237        env->fpus &= ~0x400; /* C2 <-- 0 */
1238        /* the above code is for |arg| < 2**52 only */
1239    }
1240}
1241
/*
 * pi/4, pi/2, 3pi/4 and pi as a biased exponent plus a 128-bit
 * significand split into high and low 64-bit words.
 *
 * pi/4, pi/2 and pi differ only by powers of two, so they share one
 * significand and differ only in their exponent; 3pi/4 needs its own.
 */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   0xc90fdaa22168c234ULL
#define pi_4_sig_low    0xc4c6628b80dc1cd1ULL

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_4_sig_high
#define pi_2_sig_low    pi_4_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL

#define pi_exp          0x4000
#define pi_sig_high     pi_4_sig_high
#define pi_sig_low      pi_4_sig_low
1255
/*
 * Coefficients of a minimax polynomial approximating atan(x) on
 * [-1/16, 1/16].  Only odd powers of x appear, so coefficient k
 * multiplies x^(2k+1).
 *
 * Other approximations in this file carry a separate low part for their
 * leading coefficient.  That is unnecessary here: the leading
 * coefficient of this approximation is so close to exactly 1 that plain
 * 1.0 is accurate enough.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL) /* 1 */
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* near -1/3 */
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* near 1/5 */
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* near -1/7 */
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* near 1/9 */
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* near -1/11 */
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* near 1/13 */
1271
1272struct fpatan_data {
1273    /* High and low parts of atan(x).  */
1274    floatx80 atan_high, atan_low;
1275};
1276
1277static const struct fpatan_data fpatan_table[9] = {
1278    { floatx80_zero_init,
1279      floatx80_zero_init },
1280    { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1281      make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1282    { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1283      make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1284    { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1285      make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1286    { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1287      make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1288    { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1289      make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1290    { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1291      make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1292    { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1293      make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1294    { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1295      make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1296};
1297
1298void helper_fpatan(CPUX86State *env)
1299{
1300    uint8_t old_flags = save_exception_flags(env);
1301    uint64_t arg0_sig = extractFloatx80Frac(ST0);
1302    int32_t arg0_exp = extractFloatx80Exp(ST0);
1303    bool arg0_sign = extractFloatx80Sign(ST0);
1304    uint64_t arg1_sig = extractFloatx80Frac(ST1);
1305    int32_t arg1_exp = extractFloatx80Exp(ST1);
1306    bool arg1_sign = extractFloatx80Sign(ST1);
1307
1308    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1309        float_raise(float_flag_invalid, &env->fp_status);
1310        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1311    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1312        float_raise(float_flag_invalid, &env->fp_status);
1313        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1314    } else if (floatx80_invalid_encoding(ST0) ||
1315               floatx80_invalid_encoding(ST1)) {
1316        float_raise(float_flag_invalid, &env->fp_status);
1317        ST1 = floatx80_default_nan(&env->fp_status);
1318    } else if (floatx80_is_any_nan(ST0)) {
1319        ST1 = ST0;
1320    } else if (floatx80_is_any_nan(ST1)) {
1321        /* Pass this NaN through.  */
1322    } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1323        /* Pass this zero through.  */
1324    } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1325                 arg0_exp - arg1_exp >= 80) &&
1326               !arg0_sign) {
1327        /*
1328         * Dividing ST1 by ST0 gives the correct result up to
1329         * rounding, and avoids spurious underflow exceptions that
1330         * might result from passing some small values through the
1331         * polynomial approximation, but if a finite nonzero result of
1332         * division is exact, the result of fpatan is still inexact
1333         * (and underflowing where appropriate).
1334         */
1335        signed char save_prec = env->fp_status.floatx80_rounding_precision;
1336        env->fp_status.floatx80_rounding_precision = 80;
1337        ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1338        env->fp_status.floatx80_rounding_precision = save_prec;
1339        if (!floatx80_is_zero(ST1) &&
1340            !(get_float_exception_flags(&env->fp_status) &
1341              float_flag_inexact)) {
1342            /*
1343             * The mathematical result is very slightly closer to zero
1344             * than this exact result.  Round a value with the
1345             * significand adjusted accordingly to get the correct
1346             * exceptions, and possibly an adjusted result depending
1347             * on the rounding mode.
1348             */
1349            uint64_t sig = extractFloatx80Frac(ST1);
1350            int32_t exp = extractFloatx80Exp(ST1);
1351            bool sign = extractFloatx80Sign(ST1);
1352            if (exp == 0) {
1353                normalizeFloatx80Subnormal(sig, &exp, &sig);
1354            }
1355            ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1356                                                -1, &env->fp_status);
1357        }
1358    } else {
1359        /* The result is inexact.  */
1360        bool rsign = arg1_sign;
1361        int32_t rexp;
1362        uint64_t rsig0, rsig1;
1363        if (floatx80_is_zero(ST1)) {
1364            /*
1365             * ST0 is negative.  The result is pi with the sign of
1366             * ST1.
1367             */
1368            rexp = pi_exp;
1369            rsig0 = pi_sig_high;
1370            rsig1 = pi_sig_low;
1371        } else if (floatx80_is_infinity(ST1)) {
1372            if (floatx80_is_infinity(ST0)) {
1373                if (arg0_sign) {
1374                    rexp = pi_34_exp;
1375                    rsig0 = pi_34_sig_high;
1376                    rsig1 = pi_34_sig_low;
1377                } else {
1378                    rexp = pi_4_exp;
1379                    rsig0 = pi_4_sig_high;
1380                    rsig1 = pi_4_sig_low;
1381                }
1382            } else {
1383                rexp = pi_2_exp;
1384                rsig0 = pi_2_sig_high;
1385                rsig1 = pi_2_sig_low;
1386            }
1387        } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1388            rexp = pi_2_exp;
1389            rsig0 = pi_2_sig_high;
1390            rsig1 = pi_2_sig_low;
1391        } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1392            /* ST0 is negative.  */
1393            rexp = pi_exp;
1394            rsig0 = pi_sig_high;
1395            rsig1 = pi_sig_low;
1396        } else {
1397            /*
1398             * ST0 and ST1 are finite, nonzero and with exponents not
1399             * too far apart.
1400             */
1401            int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1402            int32_t azexp, axexp;
1403            bool adj_sub, ysign, zsign;
1404            uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1405            uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1406            uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1407            uint64_t azsig0, azsig1;
1408            uint64_t azsig2, azsig3, axsig0, axsig1;
1409            floatx80 x8;
1410            FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1411            signed char save_prec = env->fp_status.floatx80_rounding_precision;
1412            env->fp_status.float_rounding_mode = float_round_nearest_even;
1413            env->fp_status.floatx80_rounding_precision = 80;
1414
1415            if (arg0_exp == 0) {
1416                normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1417            }
1418            if (arg1_exp == 0) {
1419                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1420            }
1421            if (arg0_exp > arg1_exp ||
1422                (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1423                /* Work with abs(ST1) / abs(ST0).  */
1424                num_exp = arg1_exp;
1425                num_sig = arg1_sig;
1426                den_exp = arg0_exp;
1427                den_sig = arg0_sig;
1428                if (arg0_sign) {
1429                    /* The result is subtracted from pi.  */
1430                    adj_exp = pi_exp;
1431                    adj_sig0 = pi_sig_high;
1432                    adj_sig1 = pi_sig_low;
1433                    adj_sub = true;
1434                } else {
1435                    /* The result is used as-is.  */
1436                    adj_exp = 0;
1437                    adj_sig0 = 0;
1438                    adj_sig1 = 0;
1439                    adj_sub = false;
1440                }
1441            } else {
1442                /* Work with abs(ST0) / abs(ST1).  */
1443                num_exp = arg0_exp;
1444                num_sig = arg0_sig;
1445                den_exp = arg1_exp;
1446                den_sig = arg1_sig;
1447                /* The result is added to or subtracted from pi/2.  */
1448                adj_exp = pi_2_exp;
1449                adj_sig0 = pi_2_sig_high;
1450                adj_sig1 = pi_2_sig_low;
1451                adj_sub = !arg0_sign;
1452            }
1453
1454            /*
1455             * Compute x = num/den, where 0 < x <= 1 and x is not too
1456             * small.
1457             */
1458            xexp = num_exp - den_exp + 0x3ffe;
1459            remsig0 = num_sig;
1460            remsig1 = 0;
1461            if (den_sig <= remsig0) {
1462                shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1463                ++xexp;
1464            }
1465            xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1466            mul64To128(den_sig, xsig0, &msig0, &msig1);
1467            sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1468            while ((int64_t) remsig0 < 0) {
1469                --xsig0;
1470                add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1471            }
1472            xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1473            /*
1474             * No need to correct any estimation error in xsig1; even
1475             * with such error, it is accurate enough.
1476             */
1477
1478            /*
1479             * Split x as x = t + y, where t = n/8 is the nearest
1480             * multiple of 1/8 to x.
1481             */
1482            x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1483                                               xsig1, &env->fp_status);
1484            n = floatx80_to_int32(x8, &env->fp_status);
1485            if (n == 0) {
1486                ysign = false;
1487                yexp = xexp;
1488                ysig0 = xsig0;
1489                ysig1 = xsig1;
1490                texp = 0;
1491                tsig = 0;
1492            } else {
1493                int shift = clz32(n) + 32;
1494                texp = 0x403b - shift;
1495                tsig = n;
1496                tsig <<= shift;
1497                if (texp == xexp) {
1498                    sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1499                    if ((int64_t) ysig0 >= 0) {
1500                        ysign = false;
1501                        if (ysig0 == 0) {
1502                            if (ysig1 == 0) {
1503                                yexp = 0;
1504                            } else {
1505                                shift = clz64(ysig1) + 64;
1506                                yexp = xexp - shift;
1507                                shift128Left(ysig0, ysig1, shift,
1508                                             &ysig0, &ysig1);
1509                            }
1510                        } else {
1511                            shift = clz64(ysig0);
1512                            yexp = xexp - shift;
1513                            shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1514                        }
1515                    } else {
1516                        ysign = true;
1517                        sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1518                        if (ysig0 == 0) {
1519                            shift = clz64(ysig1) + 64;
1520                        } else {
1521                            shift = clz64(ysig0);
1522                        }
1523                        yexp = xexp - shift;
1524                        shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1525                    }
1526                } else {
1527                    /*
1528                     * t's exponent must be greater than x's because t
1529                     * is positive and the nearest multiple of 1/8 to
1530                     * x, and if x has a greater exponent, the power
1531                     * of 2 with that exponent is also a multiple of
1532                     * 1/8.
1533                     */
1534                    uint64_t usig0, usig1;
1535                    shift128RightJamming(xsig0, xsig1, texp - xexp,
1536                                         &usig0, &usig1);
1537                    ysign = true;
1538                    sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1539                    if (ysig0 == 0) {
1540                        shift = clz64(ysig1) + 64;
1541                    } else {
1542                        shift = clz64(ysig0);
1543                    }
1544                    yexp = texp - shift;
1545                    shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1546                }
1547            }
1548
1549            /*
1550             * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1551             * arctan(z).
1552             */
1553            zsign = ysign;
1554            if (texp == 0 || yexp == 0) {
1555                zexp = yexp;
1556                zsig0 = ysig0;
1557                zsig1 = ysig1;
1558            } else {
1559                /*
1560                 * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1561                 */
1562                int32_t dexp = texp + xexp - 0x3ffe;
1563                uint64_t dsig0, dsig1, dsig2;
1564                mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1565                /*
1566                 * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1567                 * bit).  Add 1 to produce the denominator 1+tx.
1568                 */
1569                shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1570                                     &dsig0, &dsig1);
1571                dsig0 |= 0x8000000000000000ULL;
1572                zexp = yexp - 1;
1573                remsig0 = ysig0;
1574                remsig1 = ysig1;
1575                remsig2 = 0;
1576                if (dsig0 <= remsig0) {
1577                    shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1578                    ++zexp;
1579                }
1580                zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1581                mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1582                sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1583                       &remsig0, &remsig1, &remsig2);
1584                while ((int64_t) remsig0 < 0) {
1585                    --zsig0;
1586                    add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1587                           &remsig0, &remsig1, &remsig2);
1588                }
1589                zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1590                /* No need to correct any estimation error in zsig1.  */
1591            }
1592
1593            if (zexp == 0) {
1594                azexp = 0;
1595                azsig0 = 0;
1596                azsig1 = 0;
1597            } else {
1598                floatx80 z2, accum;
1599                uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1600                /* Compute z^2.  */
1601                mul128To256(zsig0, zsig1, zsig0, zsig1,
1602                            &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1603                z2 = normalizeRoundAndPackFloatx80(80, false,
1604                                                   zexp + zexp - 0x3ffe,
1605                                                   z2sig0, z2sig1,
1606                                                   &env->fp_status);
1607
1608                /* Compute the lower parts of the polynomial expansion.  */
1609                accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1610                accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1611                accum = floatx80_mul(accum, z2, &env->fp_status);
1612                accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1613                accum = floatx80_mul(accum, z2, &env->fp_status);
1614                accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1615                accum = floatx80_mul(accum, z2, &env->fp_status);
1616                accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1617                accum = floatx80_mul(accum, z2, &env->fp_status);
1618                accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1619                accum = floatx80_mul(accum, z2, &env->fp_status);
1620
1621                /*
1622                 * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1623                 * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1624                 */
1625                aexp = extractFloatx80Exp(fpatan_coeff_0);
1626                shift128RightJamming(extractFloatx80Frac(accum), 0,
1627                                     aexp - extractFloatx80Exp(accum),
1628                                     &asig0, &asig1);
1629                sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1630                       &asig0, &asig1);
1631                /* Multiply by z to compute arctan(z).  */
1632                azexp = aexp + zexp - 0x3ffe;
1633                mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1634                            &azsig2, &azsig3);
1635            }
1636
1637            /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1638            if (texp == 0) {
1639                /* z is positive.  */
1640                axexp = azexp;
1641                axsig0 = azsig0;
1642                axsig1 = azsig1;
1643            } else {
1644                bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1645                int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1646                uint64_t low_sig0 =
1647                    extractFloatx80Frac(fpatan_table[n].atan_low);
1648                uint64_t low_sig1 = 0;
1649                axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1650                axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1651                axsig1 = 0;
1652                shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1653                                     &low_sig0, &low_sig1);
1654                if (low_sign) {
1655                    sub128(axsig0, axsig1, low_sig0, low_sig1,
1656                           &axsig0, &axsig1);
1657                } else {
1658                    add128(axsig0, axsig1, low_sig0, low_sig1,
1659                           &axsig0, &axsig1);
1660                }
1661                if (azexp >= axexp) {
1662                    shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1663                                         &axsig0, &axsig1);
1664                    axexp = azexp + 1;
1665                    shift128RightJamming(azsig0, azsig1, 1,
1666                                         &azsig0, &azsig1);
1667                } else {
1668                    shift128RightJamming(axsig0, axsig1, 1,
1669                                         &axsig0, &axsig1);
1670                    shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1671                                         &azsig0, &azsig1);
1672                    ++axexp;
1673                }
1674                if (zsign) {
1675                    sub128(axsig0, axsig1, azsig0, azsig1,
1676                           &axsig0, &axsig1);
1677                } else {
1678                    add128(axsig0, axsig1, azsig0, azsig1,
1679                           &axsig0, &axsig1);
1680                }
1681            }
1682
1683            if (adj_exp == 0) {
1684                rexp = axexp;
1685                rsig0 = axsig0;
1686                rsig1 = axsig1;
1687            } else {
1688                /*
1689                 * Add or subtract arctan(x) (exponent axexp,
1690                 * significand axsig0 and axsig1, positive, not
1691                 * necessarily normalized) to the number given by
1692                 * adj_exp, adj_sig0 and adj_sig1, according to
1693                 * adj_sub.
1694                 */
1695                if (adj_exp >= axexp) {
1696                    shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1697                                         &axsig0, &axsig1);
1698                    rexp = adj_exp + 1;
1699                    shift128RightJamming(adj_sig0, adj_sig1, 1,
1700                                         &adj_sig0, &adj_sig1);
1701                } else {
1702                    shift128RightJamming(axsig0, axsig1, 1,
1703                                         &axsig0, &axsig1);
1704                    shift128RightJamming(adj_sig0, adj_sig1,
1705                                         axexp - adj_exp + 1,
1706                                         &adj_sig0, &adj_sig1);
1707                    rexp = axexp + 1;
1708                }
1709                if (adj_sub) {
1710                    sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1711                           &rsig0, &rsig1);
1712                } else {
1713                    add128(adj_sig0, adj_sig1, axsig0, axsig1,
1714                           &rsig0, &rsig1);
1715                }
1716            }
1717
1718            env->fp_status.float_rounding_mode = save_mode;
1719            env->fp_status.floatx80_rounding_precision = save_prec;
1720        }
1721        /* This result is inexact.  */
1722        rsig1 |= 1;
1723        ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1724                                            rsig0, rsig1, &env->fp_status);
1725    }
1726
1727    fpop(env);
1728    merge_exception_flags(env, old_flags);
1729}
1730
1731void helper_fxtract(CPUX86State *env)
1732{
1733    uint8_t old_flags = save_exception_flags(env);
1734    CPU_LDoubleU temp;
1735
1736    temp.d = ST0;
1737
1738    if (floatx80_is_zero(ST0)) {
1739        /* Easy way to generate -inf and raising division by 0 exception */
1740        ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1741                           &env->fp_status);
1742        fpush(env);
1743        ST0 = temp.d;
1744    } else if (floatx80_invalid_encoding(ST0)) {
1745        float_raise(float_flag_invalid, &env->fp_status);
1746        ST0 = floatx80_default_nan(&env->fp_status);
1747        fpush(env);
1748        ST0 = ST1;
1749    } else if (floatx80_is_any_nan(ST0)) {
1750        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1751            float_raise(float_flag_invalid, &env->fp_status);
1752            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1753        }
1754        fpush(env);
1755        ST0 = ST1;
1756    } else if (floatx80_is_infinity(ST0)) {
1757        fpush(env);
1758        ST0 = ST1;
1759        ST1 = floatx80_infinity;
1760    } else {
1761        int expdif;
1762
1763        if (EXPD(temp) == 0) {
1764            int shift = clz64(temp.l.lower);
1765            temp.l.lower <<= shift;
1766            expdif = 1 - EXPBIAS - shift;
1767            float_raise(float_flag_input_denormal, &env->fp_status);
1768        } else {
1769            expdif = EXPD(temp) - EXPBIAS;
1770        }
1771        /* DP exponent bias */
1772        ST0 = int32_to_floatx80(expdif, &env->fp_status);
1773        fpush(env);
1774        BIASEXPONENT(temp);
1775        ST0 = temp.d;
1776    }
1777    merge_exception_flags(env, old_flags);
1778}
1779
1780static void helper_fprem_common(CPUX86State *env, bool mod)
1781{
1782    uint8_t old_flags = save_exception_flags(env);
1783    uint64_t quotient;
1784    CPU_LDoubleU temp0, temp1;
1785    int exp0, exp1, expdiff;
1786
1787    temp0.d = ST0;
1788    temp1.d = ST1;
1789    exp0 = EXPD(temp0);
1790    exp1 = EXPD(temp1);
1791
1792    env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1793    if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1794        exp0 == 0x7fff || exp1 == 0x7fff ||
1795        floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1796        ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1797    } else {
1798        if (exp0 == 0) {
1799            exp0 = 1 - clz64(temp0.l.lower);
1800        }
1801        if (exp1 == 0) {
1802            exp1 = 1 - clz64(temp1.l.lower);
1803        }
1804        expdiff = exp0 - exp1;
1805        if (expdiff < 64) {
1806            ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1807            env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1808            env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1809            env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1810        } else {
1811            /*
1812             * Partial remainder.  This choice of how many bits to
1813             * process at once is specified in AMD instruction set
1814             * manuals, and empirically is followed by Intel
1815             * processors as well; it ensures that the final remainder
1816             * operation in a loop does produce the correct low three
1817             * bits of the quotient.  AMD manuals specify that the
1818             * flags other than C2 are cleared, and empirically Intel
1819             * processors clear them as well.
1820             */
1821            int n = 32 + (expdiff % 32);
1822            temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1823            ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1824            env->fpus |= 0x400;  /* C2 <-- 1 */
1825        }
1826    }
1827    merge_exception_flags(env, old_flags);
1828}
1829
1830void helper_fprem1(CPUX86State *env)
1831{
1832    helper_fprem_common(env, false);
1833}
1834
1835void helper_fprem(CPUX86State *env)
1836{
1837    helper_fprem_common(env, true);
1838}
1839
/*
 * 128-bit significand of log2(e), split into its high and low 64-bit
 * halves so it can be fed to the 128-bit multiply helpers.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * Each coefficient is given as make_floatx80(sign-and-exponent word,
 * 64-bit significand).  fyl2x_coeff_0_low (sign bit set, tiny exponent)
 * is a low-order correction that helper_fyl2x_common() adds to the
 * polynomial before combining it with fyl2x_coeff_0, so that the
 * leading coefficient is effectively held to more than 64 bits.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1861
1862/*
1863 * Compute an approximation of log2(1+arg), where 1+arg is in the
1864 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1865 * function is called, rounding precision is set to 80 and the
1866 * round-to-nearest mode is in effect.  arg must not be exactly zero,
1867 * and must not be so close to zero that underflow might occur.
1868 */
1869static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1870                                uint64_t *sig0, uint64_t *sig1)
1871{
1872    uint64_t arg0_sig = extractFloatx80Frac(arg);
1873    int32_t arg0_exp = extractFloatx80Exp(arg);
1874    bool arg0_sign = extractFloatx80Sign(arg);
1875    bool asign;
1876    int32_t dexp, texp, aexp;
1877    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1878    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1879    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1880    floatx80 t2, accum;
1881
1882    /*
1883     * Compute an approximation of arg/(2+arg), with extra precision,
1884     * as the argument to a polynomial approximation.  The extra
1885     * precision is only needed for the first term of the
1886     * approximation, with subsequent terms being significantly
1887     * smaller; the approximation only uses odd exponents, and the
1888     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1889     */
1890    if (arg0_sign) {
1891        dexp = 0x3fff;
1892        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1893        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1894    } else {
1895        dexp = 0x4000;
1896        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1897        dsig0 |= 0x8000000000000000ULL;
1898    }
1899    texp = arg0_exp - dexp + 0x3ffe;
1900    rsig0 = arg0_sig;
1901    rsig1 = 0;
1902    rsig2 = 0;
1903    if (dsig0 <= rsig0) {
1904        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1905        ++texp;
1906    }
1907    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1908    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1909    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1910           &rsig0, &rsig1, &rsig2);
1911    while ((int64_t) rsig0 < 0) {
1912        --tsig0;
1913        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1914               &rsig0, &rsig1, &rsig2);
1915    }
1916    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1917    /*
1918     * No need to correct any estimation error in tsig1; even with
1919     * such error, it is accurate enough.  Now compute the square of
1920     * that approximation.
1921     */
1922    mul128To256(tsig0, tsig1, tsig0, tsig1,
1923                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1924    t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
1925                                       t2sig0, t2sig1, &env->fp_status);
1926
1927    /* Compute the lower parts of the polynomial expansion.  */
1928    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1929    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1930    accum = floatx80_mul(accum, t2, &env->fp_status);
1931    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1932    accum = floatx80_mul(accum, t2, &env->fp_status);
1933    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1934    accum = floatx80_mul(accum, t2, &env->fp_status);
1935    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1936    accum = floatx80_mul(accum, t2, &env->fp_status);
1937    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1938    accum = floatx80_mul(accum, t2, &env->fp_status);
1939    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1940    accum = floatx80_mul(accum, t2, &env->fp_status);
1941    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1942    accum = floatx80_mul(accum, t2, &env->fp_status);
1943    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1944    accum = floatx80_mul(accum, t2, &env->fp_status);
1945    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1946
1947    /*
1948     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1949     * accum has much lower magnitude, and so, in particular, carry
1950     * out of the addition is not possible), multiplied by t.  (This
1951     * expansion is only accurate to about 70 bits, not 128 bits.)
1952     */
1953    aexp = extractFloatx80Exp(fyl2x_coeff_0);
1954    asign = extractFloatx80Sign(fyl2x_coeff_0);
1955    shift128RightJamming(extractFloatx80Frac(accum), 0,
1956                         aexp - extractFloatx80Exp(accum),
1957                         &asig0, &asig1);
1958    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1959    bsig1 = 0;
1960    if (asign == extractFloatx80Sign(accum)) {
1961        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1962    } else {
1963        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1964    }
1965    /* Multiply by t to compute the required result.  */
1966    mul128To256(asig0, asig1, tsig0, tsig1,
1967                &asig0, &asig1, &asig2, &asig3);
1968    aexp += texp - 0x3ffe;
1969    *exp = aexp;
1970    *sig0 = asig0;
1971    *sig1 = asig1;
1972}
1973
1974void helper_fyl2xp1(CPUX86State *env)
1975{
1976    uint8_t old_flags = save_exception_flags(env);
1977    uint64_t arg0_sig = extractFloatx80Frac(ST0);
1978    int32_t arg0_exp = extractFloatx80Exp(ST0);
1979    bool arg0_sign = extractFloatx80Sign(ST0);
1980    uint64_t arg1_sig = extractFloatx80Frac(ST1);
1981    int32_t arg1_exp = extractFloatx80Exp(ST1);
1982    bool arg1_sign = extractFloatx80Sign(ST1);
1983
1984    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1985        float_raise(float_flag_invalid, &env->fp_status);
1986        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1987    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1988        float_raise(float_flag_invalid, &env->fp_status);
1989        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1990    } else if (floatx80_invalid_encoding(ST0) ||
1991               floatx80_invalid_encoding(ST1)) {
1992        float_raise(float_flag_invalid, &env->fp_status);
1993        ST1 = floatx80_default_nan(&env->fp_status);
1994    } else if (floatx80_is_any_nan(ST0)) {
1995        ST1 = ST0;
1996    } else if (floatx80_is_any_nan(ST1)) {
1997        /* Pass this NaN through.  */
1998    } else if (arg0_exp > 0x3ffd ||
1999               (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2000                                                  0x95f619980c4336f7ULL :
2001                                                  0xd413cccfe7799211ULL))) {
2002        /*
2003         * Out of range for the instruction (ST0 must have absolute
2004         * value less than 1 - sqrt(2)/2 = 0.292..., according to
2005         * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2006         * to sqrt(2) - 1, which we allow here), treat as invalid.
2007         */
2008        float_raise(float_flag_invalid, &env->fp_status);
2009        ST1 = floatx80_default_nan(&env->fp_status);
2010    } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2011               arg1_exp == 0x7fff) {
2012        /*
2013         * One argument is zero, or multiplying by infinity; correct
2014         * result is exact and can be obtained by multiplying the
2015         * arguments.
2016         */
2017        ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2018    } else if (arg0_exp < 0x3fb0) {
2019        /*
2020         * Multiplying both arguments and an extra-precision version
2021         * of log2(e) is sufficiently precise.
2022         */
2023        uint64_t sig0, sig1, sig2;
2024        int32_t exp;
2025        if (arg0_exp == 0) {
2026            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2027        }
2028        if (arg1_exp == 0) {
2029            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2030        }
2031        mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2032                        &sig0, &sig1, &sig2);
2033        exp = arg0_exp + 1;
2034        mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2035        exp += arg1_exp - 0x3ffe;
2036        /* This result is inexact.  */
2037        sig1 |= 1;
2038        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2039                                            sig0, sig1, &env->fp_status);
2040    } else {
2041        int32_t aexp;
2042        uint64_t asig0, asig1, asig2;
2043        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2044        signed char save_prec = env->fp_status.floatx80_rounding_precision;
2045        env->fp_status.float_rounding_mode = float_round_nearest_even;
2046        env->fp_status.floatx80_rounding_precision = 80;
2047
2048        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2049        /*
2050         * Multiply by the second argument to compute the required
2051         * result.
2052         */
2053        if (arg1_exp == 0) {
2054            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2055        }
2056        mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2057        aexp += arg1_exp - 0x3ffe;
2058        /* This result is inexact.  */
2059        asig1 |= 1;
2060        env->fp_status.float_rounding_mode = save_mode;
2061        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2062                                            asig0, asig1, &env->fp_status);
2063        env->fp_status.floatx80_rounding_precision = save_prec;
2064    }
2065    fpop(env);
2066    merge_exception_flags(env, old_flags);
2067}
2068
2069void helper_fyl2x(CPUX86State *env)
2070{
2071    uint8_t old_flags = save_exception_flags(env);
2072    uint64_t arg0_sig = extractFloatx80Frac(ST0);
2073    int32_t arg0_exp = extractFloatx80Exp(ST0);
2074    bool arg0_sign = extractFloatx80Sign(ST0);
2075    uint64_t arg1_sig = extractFloatx80Frac(ST1);
2076    int32_t arg1_exp = extractFloatx80Exp(ST1);
2077    bool arg1_sign = extractFloatx80Sign(ST1);
2078
2079    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2080        float_raise(float_flag_invalid, &env->fp_status);
2081        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2082    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2083        float_raise(float_flag_invalid, &env->fp_status);
2084        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2085    } else if (floatx80_invalid_encoding(ST0) ||
2086               floatx80_invalid_encoding(ST1)) {
2087        float_raise(float_flag_invalid, &env->fp_status);
2088        ST1 = floatx80_default_nan(&env->fp_status);
2089    } else if (floatx80_is_any_nan(ST0)) {
2090        ST1 = ST0;
2091    } else if (floatx80_is_any_nan(ST1)) {
2092        /* Pass this NaN through.  */
2093    } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2094        float_raise(float_flag_invalid, &env->fp_status);
2095        ST1 = floatx80_default_nan(&env->fp_status);
2096    } else if (floatx80_is_infinity(ST1)) {
2097        FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2098                                             &env->fp_status);
2099        switch (cmp) {
2100        case float_relation_less:
2101            ST1 = floatx80_chs(ST1);
2102            break;
2103        case float_relation_greater:
2104            /* Result is infinity of the same sign as ST1.  */
2105            break;
2106        default:
2107            float_raise(float_flag_invalid, &env->fp_status);
2108            ST1 = floatx80_default_nan(&env->fp_status);
2109            break;
2110        }
2111    } else if (floatx80_is_infinity(ST0)) {
2112        if (floatx80_is_zero(ST1)) {
2113            float_raise(float_flag_invalid, &env->fp_status);
2114            ST1 = floatx80_default_nan(&env->fp_status);
2115        } else if (arg1_sign) {
2116            ST1 = floatx80_chs(ST0);
2117        } else {
2118            ST1 = ST0;
2119        }
2120    } else if (floatx80_is_zero(ST0)) {
2121        if (floatx80_is_zero(ST1)) {
2122            float_raise(float_flag_invalid, &env->fp_status);
2123            ST1 = floatx80_default_nan(&env->fp_status);
2124        } else {
2125            /* Result is infinity with opposite sign to ST1.  */
2126            float_raise(float_flag_divbyzero, &env->fp_status);
2127            ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2128                                0x8000000000000000ULL);
2129        }
2130    } else if (floatx80_is_zero(ST1)) {
2131        if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2132            ST1 = floatx80_chs(ST1);
2133        }
2134        /* Otherwise, ST1 is already the correct result.  */
2135    } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2136        if (arg1_sign) {
2137            ST1 = floatx80_chs(floatx80_zero);
2138        } else {
2139            ST1 = floatx80_zero;
2140        }
2141    } else {
2142        int32_t int_exp;
2143        floatx80 arg0_m1;
2144        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2145        signed char save_prec = env->fp_status.floatx80_rounding_precision;
2146        env->fp_status.float_rounding_mode = float_round_nearest_even;
2147        env->fp_status.floatx80_rounding_precision = 80;
2148
2149        if (arg0_exp == 0) {
2150            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2151        }
2152        if (arg1_exp == 0) {
2153            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2154        }
2155        int_exp = arg0_exp - 0x3fff;
2156        if (arg0_sig > 0xb504f333f9de6484ULL) {
2157            ++int_exp;
2158        }
2159        arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2160                                               &env->fp_status),
2161                               floatx80_one, &env->fp_status);
2162        if (floatx80_is_zero(arg0_m1)) {
2163            /* Exact power of 2; multiply by ST1.  */
2164            env->fp_status.float_rounding_mode = save_mode;
2165            ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2166                               ST1, &env->fp_status);
2167        } else {
2168            bool asign = extractFloatx80Sign(arg0_m1);
2169            int32_t aexp;
2170            uint64_t asig0, asig1, asig2;
2171            helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2172            if (int_exp != 0) {
2173                bool isign = (int_exp < 0);
2174                int32_t iexp;
2175                uint64_t isig;
2176                int shift;
2177                int_exp = isign ? -int_exp : int_exp;
2178                shift = clz32(int_exp) + 32;
2179                isig = int_exp;
2180                isig <<= shift;
2181                iexp = 0x403e - shift;
2182                shift128RightJamming(asig0, asig1, iexp - aexp,
2183                                     &asig0, &asig1);
2184                if (asign == isign) {
2185                    add128(isig, 0, asig0, asig1, &asig0, &asig1);
2186                } else {
2187                    sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2188                }
2189                aexp = iexp;
2190                asign = isign;
2191            }
2192            /*
2193             * Multiply by the second argument to compute the required
2194             * result.
2195             */
2196            if (arg1_exp == 0) {
2197                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2198            }
2199            mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2200            aexp += arg1_exp - 0x3ffe;
2201            /* This result is inexact.  */
2202            asig1 |= 1;
2203            env->fp_status.float_rounding_mode = save_mode;
2204            ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2205                                                asig0, asig1, &env->fp_status);
2206        }
2207
2208        env->fp_status.floatx80_rounding_precision = save_prec;
2209    }
2210    fpop(env);
2211    merge_exception_flags(env, old_flags);
2212}
2213
2214void helper_fsqrt(CPUX86State *env)
2215{
2216    uint8_t old_flags = save_exception_flags(env);
2217    if (floatx80_is_neg(ST0)) {
2218        env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2219        env->fpus |= 0x400;
2220    }
2221    ST0 = floatx80_sqrt(ST0, &env->fp_status);
2222    merge_exception_flags(env, old_flags);
2223}
2224
2225void helper_fsincos(CPUX86State *env)
2226{
2227    double fptemp = floatx80_to_double(env, ST0);
2228
2229    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2230        env->fpus |= 0x400;
2231    } else {
2232        ST0 = double_to_floatx80(env, sin(fptemp));
2233        fpush(env);
2234        ST0 = double_to_floatx80(env, cos(fptemp));
2235        env->fpus &= ~0x400;  /* C2 <-- 0 */
2236        /* the above code is for |arg| < 2**63 only */
2237    }
2238}
2239
2240void helper_frndint(CPUX86State *env)
2241{
2242    uint8_t old_flags = save_exception_flags(env);
2243    ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2244    merge_exception_flags(env, old_flags);
2245}
2246
2247void helper_fscale(CPUX86State *env)
2248{
2249    uint8_t old_flags = save_exception_flags(env);
2250    if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2251        float_raise(float_flag_invalid, &env->fp_status);
2252        ST0 = floatx80_default_nan(&env->fp_status);
2253    } else if (floatx80_is_any_nan(ST1)) {
2254        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2255            float_raise(float_flag_invalid, &env->fp_status);
2256        }
2257        ST0 = ST1;
2258        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2259            float_raise(float_flag_invalid, &env->fp_status);
2260            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2261        }
2262    } else if (floatx80_is_infinity(ST1) &&
2263               !floatx80_invalid_encoding(ST0) &&
2264               !floatx80_is_any_nan(ST0)) {
2265        if (floatx80_is_neg(ST1)) {
2266            if (floatx80_is_infinity(ST0)) {
2267                float_raise(float_flag_invalid, &env->fp_status);
2268                ST0 = floatx80_default_nan(&env->fp_status);
2269            } else {
2270                ST0 = (floatx80_is_neg(ST0) ?
2271                       floatx80_chs(floatx80_zero) :
2272                       floatx80_zero);
2273            }
2274        } else {
2275            if (floatx80_is_zero(ST0)) {
2276                float_raise(float_flag_invalid, &env->fp_status);
2277                ST0 = floatx80_default_nan(&env->fp_status);
2278            } else {
2279                ST0 = (floatx80_is_neg(ST0) ?
2280                       floatx80_chs(floatx80_infinity) :
2281                       floatx80_infinity);
2282            }
2283        }
2284    } else {
2285        int n;
2286        signed char save = env->fp_status.floatx80_rounding_precision;
2287        uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2288        set_float_exception_flags(0, &env->fp_status);
2289        n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2290        set_float_exception_flags(save_flags, &env->fp_status);
2291        env->fp_status.floatx80_rounding_precision = 80;
2292        ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2293        env->fp_status.floatx80_rounding_precision = save;
2294    }
2295    merge_exception_flags(env, old_flags);
2296}
2297
2298void helper_fsin(CPUX86State *env)
2299{
2300    double fptemp = floatx80_to_double(env, ST0);
2301
2302    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2303        env->fpus |= 0x400;
2304    } else {
2305        ST0 = double_to_floatx80(env, sin(fptemp));
2306        env->fpus &= ~0x400;  /* C2 <-- 0 */
2307        /* the above code is for |arg| < 2**53 only */
2308    }
2309}
2310
2311void helper_fcos(CPUX86State *env)
2312{
2313    double fptemp = floatx80_to_double(env, ST0);
2314
2315    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2316        env->fpus |= 0x400;
2317    } else {
2318        ST0 = double_to_floatx80(env, cos(fptemp));
2319        env->fpus &= ~0x400;  /* C2 <-- 0 */
2320        /* the above code is for |arg| < 2**63 only */
2321    }
2322}
2323
2324void helper_fxam_ST0(CPUX86State *env)
2325{
2326    CPU_LDoubleU temp;
2327    int expdif;
2328
2329    temp.d = ST0;
2330
2331    env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2332    if (SIGND(temp)) {
2333        env->fpus |= 0x200; /* C1 <-- 1 */
2334    }
2335
2336    if (env->fptags[env->fpstt]) {
2337        env->fpus |= 0x4100; /* Empty */
2338        return;
2339    }
2340
2341    expdif = EXPD(temp);
2342    if (expdif == MAXEXPD) {
2343        if (MANTD(temp) == 0x8000000000000000ULL) {
2344            env->fpus |= 0x500; /* Infinity */
2345        } else if (MANTD(temp) & 0x8000000000000000ULL) {
2346            env->fpus |= 0x100; /* NaN */
2347        }
2348    } else if (expdif == 0) {
2349        if (MANTD(temp) == 0) {
2350            env->fpus |=  0x4000; /* Zero */
2351        } else {
2352            env->fpus |= 0x4400; /* Denormal */
2353        }
2354    } else if (MANTD(temp) & 0x8000000000000000ULL) {
2355        env->fpus |= 0x400;
2356    }
2357}
2358
2359static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2360                      uintptr_t retaddr)
2361{
2362    int fpus, fptag, exp, i;
2363    uint64_t mant;
2364    CPU_LDoubleU tmp;
2365
2366    fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2367    fptag = 0;
2368    for (i = 7; i >= 0; i--) {
2369        fptag <<= 2;
2370        if (env->fptags[i]) {
2371            fptag |= 3;
2372        } else {
2373            tmp.d = env->fpregs[i].d;
2374            exp = EXPD(tmp);
2375            mant = MANTD(tmp);
2376            if (exp == 0 && mant == 0) {
2377                /* zero */
2378                fptag |= 1;
2379            } else if (exp == 0 || exp == MAXEXPD
2380                       || (mant & (1LL << 63)) == 0) {
2381                /* NaNs, infinity, denormal */
2382                fptag |= 2;
2383            }
2384        }
2385    }
2386    if (data32) {
2387        /* 32 bit */
2388        cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2389        cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2390        cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2391        cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2392        cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2393        cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2394        cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2395    } else {
2396        /* 16 bit */
2397        cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2398        cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2399        cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2400        cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2401        cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2402        cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2403        cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2404    }
2405}
2406
2407void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2408{
2409    do_fstenv(env, ptr, data32, GETPC());
2410}
2411
2412static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2413{
2414    env->fpstt = (fpus >> 11) & 7;
2415    env->fpus = fpus & ~0x3800 & ~FPUS_B;
2416    env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2417#if !defined(CONFIG_USER_ONLY)
2418    if (!(env->fpus & FPUS_SE)) {
2419        /*
2420         * Here the processor deasserts FERR#; in response, the chipset deasserts
2421         * IGNNE#.
2422         */
2423        cpu_clear_ignne();
2424    }
2425#endif
2426}
2427
2428static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2429                      uintptr_t retaddr)
2430{
2431    int i, fpus, fptag;
2432
2433    if (data32) {
2434        cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2435        fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2436        fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2437    } else {
2438        cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2439        fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2440        fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2441    }
2442    cpu_set_fpus(env, fpus);
2443    for (i = 0; i < 8; i++) {
2444        env->fptags[i] = ((fptag & 3) == 3);
2445        fptag >>= 2;
2446    }
2447}
2448
2449void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2450{
2451    do_fldenv(env, ptr, data32, GETPC());
2452}
2453
2454void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2455{
2456    floatx80 tmp;
2457    int i;
2458
2459    do_fstenv(env, ptr, data32, GETPC());
2460
2461    ptr += (14 << data32);
2462    for (i = 0; i < 8; i++) {
2463        tmp = ST(i);
2464        helper_fstt(env, tmp, ptr, GETPC());
2465        ptr += 10;
2466    }
2467
2468    /* fninit */
2469    env->fpus = 0;
2470    env->fpstt = 0;
2471    cpu_set_fpuc(env, 0x37f);
2472    env->fptags[0] = 1;
2473    env->fptags[1] = 1;
2474    env->fptags[2] = 1;
2475    env->fptags[3] = 1;
2476    env->fptags[4] = 1;
2477    env->fptags[5] = 1;
2478    env->fptags[6] = 1;
2479    env->fptags[7] = 1;
2480}
2481
2482void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2483{
2484    floatx80 tmp;
2485    int i;
2486
2487    do_fldenv(env, ptr, data32, GETPC());
2488    ptr += (14 << data32);
2489
2490    for (i = 0; i < 8; i++) {
2491        tmp = helper_fldt(env, ptr, GETPC());
2492        ST(i) = tmp;
2493        ptr += 10;
2494    }
2495}
2496
#ifdef CONFIG_USER_ONLY
/* User-only entry point: forwards unchanged to helper_fsave(). */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    helper_fsave(env, ptr, data32);
}

/* User-only entry point: forwards unchanged to helper_frstor(). */
void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    helper_frstor(env, ptr, data32);
}
#endif /* CONFIG_USER_ONLY */
2508
/* Byte offset of member FIELD inside X86XSaveArea. */
#define XO(FIELD)  offsetof(X86XSaveArea, FIELD)
2510
2511static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2512{
2513    int fpus, fptag, i;
2514    target_ulong addr;
2515
2516    fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2517    fptag = 0;
2518    for (i = 0; i < 8; i++) {
2519        fptag |= (env->fptags[i] << i);
2520    }
2521
2522    cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2523    cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2524    cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2525
2526    /* In 32-bit mode this is eip, sel, dp, sel.
2527       In 64-bit mode this is rip, rdp.
2528       But in either case we don't write actual data, just zeros.  */
2529    cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2530    cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2531
2532    addr = ptr + XO(legacy.fpregs);
2533    for (i = 0; i < 8; i++) {
2534        floatx80 tmp = ST(i);
2535        helper_fstt(env, tmp, addr, ra);
2536        addr += 16;
2537    }
2538}
2539
2540static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2541{
2542    update_mxcsr_from_sse_status(env);
2543    cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2544    cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2545}
2546
2547static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2548{
2549    int i, nb_xmm_regs;
2550    target_ulong addr;
2551
2552    if (env->hflags & HF_CS64_MASK) {
2553        nb_xmm_regs = 16;
2554    } else {
2555        nb_xmm_regs = 8;
2556    }
2557
2558    addr = ptr + XO(legacy.xmm_regs);
2559    for (i = 0; i < nb_xmm_regs; i++) {
2560        cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2561        cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2562        addr += 16;
2563    }
2564}
2565
2566static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2567{
2568    target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2569    int i;
2570
2571    for (i = 0; i < 4; i++, addr += 16) {
2572        cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2573        cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2574    }
2575}
2576
2577static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2578{
2579    cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2580                    env->bndcs_regs.cfgu, ra);
2581    cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2582                    env->bndcs_regs.sts, ra);
2583}
2584
2585static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2586{
2587    cpu_stq_data_ra(env, ptr, env->pkru, ra);
2588}
2589
2590void helper_fxsave(CPUX86State *env, target_ulong ptr)
2591{
2592    uintptr_t ra = GETPC();
2593
2594    /* The operand must be 16 byte aligned */
2595    if (ptr & 0xf) {
2596        raise_exception_ra(env, EXCP0D_GPF, ra);
2597    }
2598
2599    do_xsave_fpu(env, ptr, ra);
2600
2601    if (env->cr[4] & CR4_OSFXSR_MASK) {
2602        do_xsave_mxcsr(env, ptr, ra);
2603        /* Fast FXSAVE leaves out the XMM registers */
2604        if (!(env->efer & MSR_EFER_FFXSR)
2605            || (env->hflags & HF_CPL_MASK)
2606            || !(env->hflags & HF_LMA_MASK)) {
2607            do_xsave_sse(env, ptr, ra);
2608        }
2609    }
2610}
2611
2612static uint64_t get_xinuse(CPUX86State *env)
2613{
2614    uint64_t inuse = -1;
2615
2616    /* For the most part, we don't track XINUSE.  We could calculate it
2617       here for all components, but it's probably less work to simply
2618       indicate in use.  That said, the state of BNDREGS is important
2619       enough to track in HFLAGS, so we might as well use that here.  */
2620    if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2621       inuse &= ~XSTATE_BNDREGS_MASK;
2622    }
2623    return inuse;
2624}
2625
2626static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2627                     uint64_t inuse, uint64_t opt, uintptr_t ra)
2628{
2629    uint64_t old_bv, new_bv;
2630
2631    /* The OS must have enabled XSAVE.  */
2632    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2633        raise_exception_ra(env, EXCP06_ILLOP, ra);
2634    }
2635
2636    /* The operand must be 64 byte aligned.  */
2637    if (ptr & 63) {
2638        raise_exception_ra(env, EXCP0D_GPF, ra);
2639    }
2640
2641    /* Never save anything not enabled by XCR0.  */
2642    rfbm &= env->xcr0;
2643    opt &= rfbm;
2644
2645    if (opt & XSTATE_FP_MASK) {
2646        do_xsave_fpu(env, ptr, ra);
2647    }
2648    if (rfbm & XSTATE_SSE_MASK) {
2649        /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2650        do_xsave_mxcsr(env, ptr, ra);
2651    }
2652    if (opt & XSTATE_SSE_MASK) {
2653        do_xsave_sse(env, ptr, ra);
2654    }
2655    if (opt & XSTATE_BNDREGS_MASK) {
2656        do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2657    }
2658    if (opt & XSTATE_BNDCSR_MASK) {
2659        do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2660    }
2661    if (opt & XSTATE_PKRU_MASK) {
2662        do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2663    }
2664
2665    /* Update the XSTATE_BV field.  */
2666    old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2667    new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2668    cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2669}
2670
2671void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2672{
2673    do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2674}
2675
2676void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2677{
2678    uint64_t inuse = get_xinuse(env);
2679    do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2680}
2681
2682static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2683{
2684    int i, fpuc, fpus, fptag;
2685    target_ulong addr;
2686
2687    fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2688    fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2689    fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2690    cpu_set_fpuc(env, fpuc);
2691    cpu_set_fpus(env, fpus);
2692    fptag ^= 0xff;
2693    for (i = 0; i < 8; i++) {
2694        env->fptags[i] = ((fptag >> i) & 1);
2695    }
2696
2697    addr = ptr + XO(legacy.fpregs);
2698    for (i = 0; i < 8; i++) {
2699        floatx80 tmp = helper_fldt(env, addr, ra);
2700        ST(i) = tmp;
2701        addr += 16;
2702    }
2703}
2704
2705static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2706{
2707    cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2708}
2709
2710static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2711{
2712    int i, nb_xmm_regs;
2713    target_ulong addr;
2714
2715    if (env->hflags & HF_CS64_MASK) {
2716        nb_xmm_regs = 16;
2717    } else {
2718        nb_xmm_regs = 8;
2719    }
2720
2721    addr = ptr + XO(legacy.xmm_regs);
2722    for (i = 0; i < nb_xmm_regs; i++) {
2723        env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2724        env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2725        addr += 16;
2726    }
2727}
2728
2729static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2730{
2731    target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2732    int i;
2733
2734    for (i = 0; i < 4; i++, addr += 16) {
2735        env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2736        env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2737    }
2738}
2739
2740static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2741{
2742    /* FIXME: Extend highest implemented bit of linear address.  */
2743    env->bndcs_regs.cfgu
2744        = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2745    env->bndcs_regs.sts
2746        = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2747}
2748
2749static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2750{
2751    env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2752}
2753
2754void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2755{
2756    uintptr_t ra = GETPC();
2757
2758    /* The operand must be 16 byte aligned */
2759    if (ptr & 0xf) {
2760        raise_exception_ra(env, EXCP0D_GPF, ra);
2761    }
2762
2763    do_xrstor_fpu(env, ptr, ra);
2764
2765    if (env->cr[4] & CR4_OSFXSR_MASK) {
2766        do_xrstor_mxcsr(env, ptr, ra);
2767        /* Fast FXRSTOR leaves out the XMM registers */
2768        if (!(env->efer & MSR_EFER_FFXSR)
2769            || (env->hflags & HF_CPL_MASK)
2770            || !(env->hflags & HF_LMA_MASK)) {
2771            do_xrstor_sse(env, ptr, ra);
2772        }
2773    }
2774}
2775
#ifdef CONFIG_USER_ONLY
/* User-only entry point: forwards unchanged to helper_fxsave(). */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    helper_fxsave(env, ptr);
}

/* User-only entry point: forwards unchanged to helper_fxrstor(). */
void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    helper_fxrstor(env, ptr);
}
#endif /* CONFIG_USER_ONLY */
2787
2788void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2789{
2790    uintptr_t ra = GETPC();
2791    uint64_t xstate_bv, xcomp_bv, reserve0;
2792
2793    rfbm &= env->xcr0;
2794
2795    /* The OS must have enabled XSAVE.  */
2796    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2797        raise_exception_ra(env, EXCP06_ILLOP, ra);
2798    }
2799
2800    /* The operand must be 64 byte aligned.  */
2801    if (ptr & 63) {
2802        raise_exception_ra(env, EXCP0D_GPF, ra);
2803    }
2804
2805    xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2806
2807    if ((int64_t)xstate_bv < 0) {
2808        /* FIXME: Compact form.  */
2809        raise_exception_ra(env, EXCP0D_GPF, ra);
2810    }
2811
2812    /* Standard form.  */
2813
2814    /* The XSTATE_BV field must not set bits not present in XCR0.  */
2815    if (xstate_bv & ~env->xcr0) {
2816        raise_exception_ra(env, EXCP0D_GPF, ra);
2817    }
2818
2819    /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2820       revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2821       describes only XCOMP_BV, but the description of the standard form
2822       of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2823       includes the next 64-bit field.  */
2824    xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2825    reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2826    if (xcomp_bv || reserve0) {
2827        raise_exception_ra(env, EXCP0D_GPF, ra);
2828    }
2829
2830    if (rfbm & XSTATE_FP_MASK) {
2831        if (xstate_bv & XSTATE_FP_MASK) {
2832            do_xrstor_fpu(env, ptr, ra);
2833        } else {
2834            helper_fninit(env);
2835            memset(env->fpregs, 0, sizeof(env->fpregs));
2836        }
2837    }
2838    if (rfbm & XSTATE_SSE_MASK) {
2839        /* Note that the standard form of XRSTOR loads MXCSR from memory
2840           whether or not the XSTATE_BV bit is set.  */
2841        do_xrstor_mxcsr(env, ptr, ra);
2842        if (xstate_bv & XSTATE_SSE_MASK) {
2843            do_xrstor_sse(env, ptr, ra);
2844        } else {
2845            /* ??? When AVX is implemented, we may have to be more
2846               selective in the clearing.  */
2847            memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2848        }
2849    }
2850    if (rfbm & XSTATE_BNDREGS_MASK) {
2851        if (xstate_bv & XSTATE_BNDREGS_MASK) {
2852            do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2853            env->hflags |= HF_MPX_IU_MASK;
2854        } else {
2855            memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2856            env->hflags &= ~HF_MPX_IU_MASK;
2857        }
2858    }
2859    if (rfbm & XSTATE_BNDCSR_MASK) {
2860        if (xstate_bv & XSTATE_BNDCSR_MASK) {
2861            do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2862        } else {
2863            memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2864        }
2865        cpu_sync_bndcs_hflags(env);
2866    }
2867    if (rfbm & XSTATE_PKRU_MASK) {
2868        uint64_t old_pkru = env->pkru;
2869        if (xstate_bv & XSTATE_PKRU_MASK) {
2870            do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2871        } else {
2872            env->pkru = 0;
2873        }
2874        if (env->pkru != old_pkru) {
2875            CPUState *cs = env_cpu(env);
2876            tlb_flush(cs);
2877        }
2878    }
2879}
2880
2881#undef XO
2882
2883uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2884{
2885    /* The OS must have enabled XSAVE.  */
2886    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2887        raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2888    }
2889
2890    switch (ecx) {
2891    case 0:
2892        return env->xcr0;
2893    case 1:
2894        if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2895            return env->xcr0 & get_xinuse(env);
2896        }
2897        break;
2898    }
2899    raise_exception_ra(env, EXCP0D_GPF, GETPC());
2900}
2901
2902void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2903{
2904    uint32_t dummy, ena_lo, ena_hi;
2905    uint64_t ena;
2906
2907    /* The OS must have enabled XSAVE.  */
2908    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2909        raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2910    }
2911
2912    /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2913    if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2914        goto do_gpf;
2915    }
2916
2917    /* Disallow enabling unimplemented features.  */
2918    cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2919    ena = ((uint64_t)ena_hi << 32) | ena_lo;
2920    if (mask & ~ena) {
2921        goto do_gpf;
2922    }
2923
2924    /* Disallow enabling only half of MPX.  */
2925    if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2926        & XSTATE_BNDCSR_MASK) {
2927        goto do_gpf;
2928    }
2929
2930    env->xcr0 = mask;
2931    cpu_sync_bndcs_hflags(env);
2932    return;
2933
2934 do_gpf:
2935    raise_exception_ra(env, EXCP0D_GPF, GETPC());
2936}
2937
2938/* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
2940
/* MXCSR control fields used by this file. */
#define SSE_DAZ             (1 << 6)    /* denormals are zero */
#define SSE_RC_MASK         (3 << 13)   /* rounding-control field */
#define SSE_RC_NEAR         (0 << 13)
#define SSE_RC_DOWN         (1 << 13)
#define SSE_RC_UP           (2 << 13)
#define SSE_RC_CHOP         (3 << 13)
#define SSE_FZ              (1 << 15)   /* flush to zero */
2948
2949void update_mxcsr_status(CPUX86State *env)
2950{
2951    uint32_t mxcsr = env->mxcsr;
2952    int rnd_type;
2953
2954    /* set rounding mode */
2955    switch (mxcsr & SSE_RC_MASK) {
2956    default:
2957    case SSE_RC_NEAR:
2958        rnd_type = float_round_nearest_even;
2959        break;
2960    case SSE_RC_DOWN:
2961        rnd_type = float_round_down;
2962        break;
2963    case SSE_RC_UP:
2964        rnd_type = float_round_up;
2965        break;
2966    case SSE_RC_CHOP:
2967        rnd_type = float_round_to_zero;
2968        break;
2969    }
2970    set_float_rounding_mode(rnd_type, &env->sse_status);
2971
2972    /* Set exception flags.  */
2973    set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2974                              (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2975                              (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2976                              (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2977                              (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2978                              &env->sse_status);
2979
2980    /* set denormals are zero */
2981    set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2982
2983    /* set flush to zero */
2984    set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2985}
2986
2987void update_mxcsr_from_sse_status(CPUX86State *env)
2988{
2989    if (tcg_enabled()) {
2990        uint8_t flags = get_float_exception_flags(&env->sse_status);
2991        /*
2992         * The MXCSR denormal flag has opposite semantics to
2993         * float_flag_input_denormal (the softfloat code sets that flag
2994         * only when flushing input denormals to zero, but SSE sets it
2995         * only when not flushing them to zero), so is not converted
2996         * here.
2997         */
2998        env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
2999                       (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3000                       (flags & float_flag_overflow ? FPUS_OE : 0) |
3001                       (flags & float_flag_underflow ? FPUS_UE : 0) |
3002                       (flags & float_flag_inexact ? FPUS_PE : 0) |
3003                       (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3004                        0));
3005    }
3006}
3007
3008void helper_update_mxcsr(CPUX86State *env)
3009{
3010    update_mxcsr_from_sse_status(env);
3011}
3012
3013void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3014{
3015    cpu_set_mxcsr(env, val);
3016}
3017
3018void helper_enter_mmx(CPUX86State *env)
3019{
3020    env->fpstt = 0;
3021    *(uint32_t *)(env->fptags) = 0;
3022    *(uint32_t *)(env->fptags + 4) = 0;
3023}
3024
3025void helper_emms(CPUX86State *env)
3026{
3027    /* set to empty state */
3028    *(uint32_t *)(env->fptags) = 0x01010101;
3029    *(uint32_t *)(env->fptags + 4) = 0x01010101;
3030}
3031
3032/* XXX: suppress */
3033void helper_movq(CPUX86State *env, void *d, void *s)
3034{
3035    *(uint64_t *)d = *(uint64_t *)s;
3036}
3037
3038#define SHIFT 0
3039#include "ops_sse.h"
3040
3041#define SHIFT 1
3042#include "ops_sse.h"
3043