qemu/target/i386/tcg/fpu_helper.c
<<
>>
Prefs
   1/*
   2 *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include <math.h>
  22#include "cpu.h"
  23#include "tcg-cpu.h"
  24#include "exec/helper-proto.h"
  25#include "fpu/softfloat.h"
  26#include "fpu/softfloat-macros.h"
  27#include "helper-tcg.h"
  28
  29/* float macros */
  30#define FT0    (env->ft0)
  31#define ST0    (env->fpregs[env->fpstt].d)
  32#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
  33#define ST1    ST(1)
  34
  35#define FPU_RC_MASK         0xc00
  36#define FPU_RC_NEAR         0x000
  37#define FPU_RC_DOWN         0x400
  38#define FPU_RC_UP           0x800
  39#define FPU_RC_CHOP         0xc00
  40
  41#define MAXTAN 9223372036854775808.0
  42
  43/* the following deal with x86 long double-precision numbers */
  44#define MAXEXPD 0x7fff
  45#define EXPBIAS 16383
  46#define EXPD(fp)        (fp.l.upper & 0x7fff)
  47#define SIGND(fp)       ((fp.l.upper) & 0x8000)
  48#define MANTD(fp)       (fp.l.lower)
  49#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  50
  51#define FPUS_IE (1 << 0)
  52#define FPUS_DE (1 << 1)
  53#define FPUS_ZE (1 << 2)
  54#define FPUS_OE (1 << 3)
  55#define FPUS_UE (1 << 4)
  56#define FPUS_PE (1 << 5)
  57#define FPUS_SF (1 << 6)
  58#define FPUS_SE (1 << 7)
  59#define FPUS_B  (1 << 15)
  60
  61#define FPUC_EM 0x3f
  62
  63#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
  64#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
  65#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
  66#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
  67#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
  68#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
  69#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
  70#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  71
  72static inline void fpush(CPUX86State *env)
  73{
  74    env->fpstt = (env->fpstt - 1) & 7;
  75    env->fptags[env->fpstt] = 0; /* validate stack entry */
  76}
  77
  78static inline void fpop(CPUX86State *env)
  79{
  80    env->fptags[env->fpstt] = 1; /* invalidate stack entry */
  81    env->fpstt = (env->fpstt + 1) & 7;
  82}
  83
  84static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
  85{
  86    CPU_LDoubleU temp;
  87
  88    temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
  89    temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
  90    return temp.d;
  91}
  92
  93static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
  94                    uintptr_t retaddr)
  95{
  96    CPU_LDoubleU temp;
  97
  98    temp.d = f;
  99    cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 100    cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 101}
 102
 103/* x87 FPU helpers */
 104
 105static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 106{
 107    union {
 108        float64 f64;
 109        double d;
 110    } u;
 111
 112    u.f64 = floatx80_to_float64(a, &env->fp_status);
 113    return u.d;
 114}
 115
 116static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 117{
 118    union {
 119        float64 f64;
 120        double d;
 121    } u;
 122
 123    u.d = a;
 124    return float64_to_floatx80(u.f64, &env->fp_status);
 125}
 126
 127static void fpu_set_exception(CPUX86State *env, int mask)
 128{
 129    env->fpus |= mask;
 130    if (env->fpus & (~env->fpuc & FPUC_EM)) {
 131        env->fpus |= FPUS_SE | FPUS_B;
 132    }
 133}
 134
 135static inline uint8_t save_exception_flags(CPUX86State *env)
 136{
 137    uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 138    set_float_exception_flags(0, &env->fp_status);
 139    return old_flags;
 140}
 141
 142static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 143{
 144    uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 145    float_raise(old_flags, &env->fp_status);
 146    fpu_set_exception(env,
 147                      ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 148                       (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 149                       (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 150                       (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 151                       (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 152                       (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 153}
 154
 155static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 156{
 157    uint8_t old_flags = save_exception_flags(env);
 158    floatx80 ret = floatx80_div(a, b, &env->fp_status);
 159    merge_exception_flags(env, old_flags);
 160    return ret;
 161}
 162
 163static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 164{
 165    if (env->cr[0] & CR0_NE_MASK) {
 166        raise_exception_ra(env, EXCP10_COPR, retaddr);
 167    }
 168#if !defined(CONFIG_USER_ONLY)
 169    else {
 170        fpu_check_raise_ferr_irq(env);
 171    }
 172#endif
 173}
 174
 175void helper_flds_FT0(CPUX86State *env, uint32_t val)
 176{
 177    uint8_t old_flags = save_exception_flags(env);
 178    union {
 179        float32 f;
 180        uint32_t i;
 181    } u;
 182
 183    u.i = val;
 184    FT0 = float32_to_floatx80(u.f, &env->fp_status);
 185    merge_exception_flags(env, old_flags);
 186}
 187
 188void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 189{
 190    uint8_t old_flags = save_exception_flags(env);
 191    union {
 192        float64 f;
 193        uint64_t i;
 194    } u;
 195
 196    u.i = val;
 197    FT0 = float64_to_floatx80(u.f, &env->fp_status);
 198    merge_exception_flags(env, old_flags);
 199}
 200
 201void helper_fildl_FT0(CPUX86State *env, int32_t val)
 202{
 203    FT0 = int32_to_floatx80(val, &env->fp_status);
 204}
 205
 206void helper_flds_ST0(CPUX86State *env, uint32_t val)
 207{
 208    uint8_t old_flags = save_exception_flags(env);
 209    int new_fpstt;
 210    union {
 211        float32 f;
 212        uint32_t i;
 213    } u;
 214
 215    new_fpstt = (env->fpstt - 1) & 7;
 216    u.i = val;
 217    env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 218    env->fpstt = new_fpstt;
 219    env->fptags[new_fpstt] = 0; /* validate stack entry */
 220    merge_exception_flags(env, old_flags);
 221}
 222
 223void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 224{
 225    uint8_t old_flags = save_exception_flags(env);
 226    int new_fpstt;
 227    union {
 228        float64 f;
 229        uint64_t i;
 230    } u;
 231
 232    new_fpstt = (env->fpstt - 1) & 7;
 233    u.i = val;
 234    env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 235    env->fpstt = new_fpstt;
 236    env->fptags[new_fpstt] = 0; /* validate stack entry */
 237    merge_exception_flags(env, old_flags);
 238}
 239
 240void helper_fildl_ST0(CPUX86State *env, int32_t val)
 241{
 242    int new_fpstt;
 243
 244    new_fpstt = (env->fpstt - 1) & 7;
 245    env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 246    env->fpstt = new_fpstt;
 247    env->fptags[new_fpstt] = 0; /* validate stack entry */
 248}
 249
 250void helper_fildll_ST0(CPUX86State *env, int64_t val)
 251{
 252    int new_fpstt;
 253
 254    new_fpstt = (env->fpstt - 1) & 7;
 255    env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 256    env->fpstt = new_fpstt;
 257    env->fptags[new_fpstt] = 0; /* validate stack entry */
 258}
 259
 260uint32_t helper_fsts_ST0(CPUX86State *env)
 261{
 262    uint8_t old_flags = save_exception_flags(env);
 263    union {
 264        float32 f;
 265        uint32_t i;
 266    } u;
 267
 268    u.f = floatx80_to_float32(ST0, &env->fp_status);
 269    merge_exception_flags(env, old_flags);
 270    return u.i;
 271}
 272
 273uint64_t helper_fstl_ST0(CPUX86State *env)
 274{
 275    uint8_t old_flags = save_exception_flags(env);
 276    union {
 277        float64 f;
 278        uint64_t i;
 279    } u;
 280
 281    u.f = floatx80_to_float64(ST0, &env->fp_status);
 282    merge_exception_flags(env, old_flags);
 283    return u.i;
 284}
 285
 286int32_t helper_fist_ST0(CPUX86State *env)
 287{
 288    uint8_t old_flags = save_exception_flags(env);
 289    int32_t val;
 290
 291    val = floatx80_to_int32(ST0, &env->fp_status);
 292    if (val != (int16_t)val) {
 293        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 294        val = -32768;
 295    }
 296    merge_exception_flags(env, old_flags);
 297    return val;
 298}
 299
 300int32_t helper_fistl_ST0(CPUX86State *env)
 301{
 302    uint8_t old_flags = save_exception_flags(env);
 303    int32_t val;
 304
 305    val = floatx80_to_int32(ST0, &env->fp_status);
 306    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 307        val = 0x80000000;
 308    }
 309    merge_exception_flags(env, old_flags);
 310    return val;
 311}
 312
 313int64_t helper_fistll_ST0(CPUX86State *env)
 314{
 315    uint8_t old_flags = save_exception_flags(env);
 316    int64_t val;
 317
 318    val = floatx80_to_int64(ST0, &env->fp_status);
 319    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 320        val = 0x8000000000000000ULL;
 321    }
 322    merge_exception_flags(env, old_flags);
 323    return val;
 324}
 325
 326int32_t helper_fistt_ST0(CPUX86State *env)
 327{
 328    uint8_t old_flags = save_exception_flags(env);
 329    int32_t val;
 330
 331    val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 332    if (val != (int16_t)val) {
 333        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 334        val = -32768;
 335    }
 336    merge_exception_flags(env, old_flags);
 337    return val;
 338}
 339
 340int32_t helper_fisttl_ST0(CPUX86State *env)
 341{
 342    uint8_t old_flags = save_exception_flags(env);
 343    int32_t val;
 344
 345    val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 346    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 347        val = 0x80000000;
 348    }
 349    merge_exception_flags(env, old_flags);
 350    return val;
 351}
 352
 353int64_t helper_fisttll_ST0(CPUX86State *env)
 354{
 355    uint8_t old_flags = save_exception_flags(env);
 356    int64_t val;
 357
 358    val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 359    if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 360        val = 0x8000000000000000ULL;
 361    }
 362    merge_exception_flags(env, old_flags);
 363    return val;
 364}
 365
 366void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 367{
 368    int new_fpstt;
 369
 370    new_fpstt = (env->fpstt - 1) & 7;
 371    env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
 372    env->fpstt = new_fpstt;
 373    env->fptags[new_fpstt] = 0; /* validate stack entry */
 374}
 375
 376void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 377{
 378    do_fstt(env, ST0, ptr, GETPC());
 379}
 380
 381void helper_fpush(CPUX86State *env)
 382{
 383    fpush(env);
 384}
 385
 386void helper_fpop(CPUX86State *env)
 387{
 388    fpop(env);
 389}
 390
 391void helper_fdecstp(CPUX86State *env)
 392{
 393    env->fpstt = (env->fpstt - 1) & 7;
 394    env->fpus &= ~0x4700;
 395}
 396
 397void helper_fincstp(CPUX86State *env)
 398{
 399    env->fpstt = (env->fpstt + 1) & 7;
 400    env->fpus &= ~0x4700;
 401}
 402
 403/* FPU move */
 404
 405void helper_ffree_STN(CPUX86State *env, int st_index)
 406{
 407    env->fptags[(env->fpstt + st_index) & 7] = 1;
 408}
 409
 410void helper_fmov_ST0_FT0(CPUX86State *env)
 411{
 412    ST0 = FT0;
 413}
 414
 415void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 416{
 417    FT0 = ST(st_index);
 418}
 419
 420void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 421{
 422    ST0 = ST(st_index);
 423}
 424
 425void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 426{
 427    ST(st_index) = ST0;
 428}
 429
 430void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 431{
 432    floatx80 tmp;
 433
 434    tmp = ST(st_index);
 435    ST(st_index) = ST0;
 436    ST0 = tmp;
 437}
 438
 439/* FPU operations */
 440
 441static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 442
 443void helper_fcom_ST0_FT0(CPUX86State *env)
 444{
 445    uint8_t old_flags = save_exception_flags(env);
 446    FloatRelation ret;
 447
 448    ret = floatx80_compare(ST0, FT0, &env->fp_status);
 449    env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 450    merge_exception_flags(env, old_flags);
 451}
 452
 453void helper_fucom_ST0_FT0(CPUX86State *env)
 454{
 455    uint8_t old_flags = save_exception_flags(env);
 456    FloatRelation ret;
 457
 458    ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 459    env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 460    merge_exception_flags(env, old_flags);
 461}
 462
 463static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 464
 465void helper_fcomi_ST0_FT0(CPUX86State *env)
 466{
 467    uint8_t old_flags = save_exception_flags(env);
 468    int eflags;
 469    FloatRelation ret;
 470
 471    ret = floatx80_compare(ST0, FT0, &env->fp_status);
 472    eflags = cpu_cc_compute_all(env, CC_OP);
 473    eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 474    CC_SRC = eflags;
 475    merge_exception_flags(env, old_flags);
 476}
 477
 478void helper_fucomi_ST0_FT0(CPUX86State *env)
 479{
 480    uint8_t old_flags = save_exception_flags(env);
 481    int eflags;
 482    FloatRelation ret;
 483
 484    ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 485    eflags = cpu_cc_compute_all(env, CC_OP);
 486    eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 487    CC_SRC = eflags;
 488    merge_exception_flags(env, old_flags);
 489}
 490
 491void helper_fadd_ST0_FT0(CPUX86State *env)
 492{
 493    uint8_t old_flags = save_exception_flags(env);
 494    ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 495    merge_exception_flags(env, old_flags);
 496}
 497
 498void helper_fmul_ST0_FT0(CPUX86State *env)
 499{
 500    uint8_t old_flags = save_exception_flags(env);
 501    ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 502    merge_exception_flags(env, old_flags);
 503}
 504
 505void helper_fsub_ST0_FT0(CPUX86State *env)
 506{
 507    uint8_t old_flags = save_exception_flags(env);
 508    ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 509    merge_exception_flags(env, old_flags);
 510}
 511
 512void helper_fsubr_ST0_FT0(CPUX86State *env)
 513{
 514    uint8_t old_flags = save_exception_flags(env);
 515    ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 516    merge_exception_flags(env, old_flags);
 517}
 518
 519void helper_fdiv_ST0_FT0(CPUX86State *env)
 520{
 521    ST0 = helper_fdiv(env, ST0, FT0);
 522}
 523
 524void helper_fdivr_ST0_FT0(CPUX86State *env)
 525{
 526    ST0 = helper_fdiv(env, FT0, ST0);
 527}
 528
 529/* fp operations between STN and ST0 */
 530
 531void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 532{
 533    uint8_t old_flags = save_exception_flags(env);
 534    ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 535    merge_exception_flags(env, old_flags);
 536}
 537
 538void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 539{
 540    uint8_t old_flags = save_exception_flags(env);
 541    ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 542    merge_exception_flags(env, old_flags);
 543}
 544
 545void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 546{
 547    uint8_t old_flags = save_exception_flags(env);
 548    ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 549    merge_exception_flags(env, old_flags);
 550}
 551
 552void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 553{
 554    uint8_t old_flags = save_exception_flags(env);
 555    ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 556    merge_exception_flags(env, old_flags);
 557}
 558
 559void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 560{
 561    floatx80 *p;
 562
 563    p = &ST(st_index);
 564    *p = helper_fdiv(env, *p, ST0);
 565}
 566
 567void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 568{
 569    floatx80 *p;
 570
 571    p = &ST(st_index);
 572    *p = helper_fdiv(env, ST0, *p);
 573}
 574
 575/* misc FPU operations */
 576void helper_fchs_ST0(CPUX86State *env)
 577{
 578    ST0 = floatx80_chs(ST0);
 579}
 580
 581void helper_fabs_ST0(CPUX86State *env)
 582{
 583    ST0 = floatx80_abs(ST0);
 584}
 585
 586void helper_fld1_ST0(CPUX86State *env)
 587{
 588    ST0 = floatx80_one;
 589}
 590
 591void helper_fldl2t_ST0(CPUX86State *env)
 592{
 593    switch (env->fpuc & FPU_RC_MASK) {
 594    case FPU_RC_UP:
 595        ST0 = floatx80_l2t_u;
 596        break;
 597    default:
 598        ST0 = floatx80_l2t;
 599        break;
 600    }
 601}
 602
 603void helper_fldl2e_ST0(CPUX86State *env)
 604{
 605    switch (env->fpuc & FPU_RC_MASK) {
 606    case FPU_RC_DOWN:
 607    case FPU_RC_CHOP:
 608        ST0 = floatx80_l2e_d;
 609        break;
 610    default:
 611        ST0 = floatx80_l2e;
 612        break;
 613    }
 614}
 615
 616void helper_fldpi_ST0(CPUX86State *env)
 617{
 618    switch (env->fpuc & FPU_RC_MASK) {
 619    case FPU_RC_DOWN:
 620    case FPU_RC_CHOP:
 621        ST0 = floatx80_pi_d;
 622        break;
 623    default:
 624        ST0 = floatx80_pi;
 625        break;
 626    }
 627}
 628
 629void helper_fldlg2_ST0(CPUX86State *env)
 630{
 631    switch (env->fpuc & FPU_RC_MASK) {
 632    case FPU_RC_DOWN:
 633    case FPU_RC_CHOP:
 634        ST0 = floatx80_lg2_d;
 635        break;
 636    default:
 637        ST0 = floatx80_lg2;
 638        break;
 639    }
 640}
 641
 642void helper_fldln2_ST0(CPUX86State *env)
 643{
 644    switch (env->fpuc & FPU_RC_MASK) {
 645    case FPU_RC_DOWN:
 646    case FPU_RC_CHOP:
 647        ST0 = floatx80_ln2_d;
 648        break;
 649    default:
 650        ST0 = floatx80_ln2;
 651        break;
 652    }
 653}
 654
 655void helper_fldz_ST0(CPUX86State *env)
 656{
 657    ST0 = floatx80_zero;
 658}
 659
 660void helper_fldz_FT0(CPUX86State *env)
 661{
 662    FT0 = floatx80_zero;
 663}
 664
 665uint32_t helper_fnstsw(CPUX86State *env)
 666{
 667    return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 668}
 669
 670uint32_t helper_fnstcw(CPUX86State *env)
 671{
 672    return env->fpuc;
 673}
 674
 675void update_fp_status(CPUX86State *env)
 676{
 677    FloatRoundMode rnd_mode;
 678    FloatX80RoundPrec rnd_prec;
 679
 680    /* set rounding mode */
 681    switch (env->fpuc & FPU_RC_MASK) {
 682    default:
 683    case FPU_RC_NEAR:
 684        rnd_mode = float_round_nearest_even;
 685        break;
 686    case FPU_RC_DOWN:
 687        rnd_mode = float_round_down;
 688        break;
 689    case FPU_RC_UP:
 690        rnd_mode = float_round_up;
 691        break;
 692    case FPU_RC_CHOP:
 693        rnd_mode = float_round_to_zero;
 694        break;
 695    }
 696    set_float_rounding_mode(rnd_mode, &env->fp_status);
 697
 698    switch ((env->fpuc >> 8) & 3) {
 699    case 0:
 700        rnd_prec = floatx80_precision_s;
 701        break;
 702    case 2:
 703        rnd_prec = floatx80_precision_d;
 704        break;
 705    case 3:
 706    default:
 707        rnd_prec = floatx80_precision_x;
 708        break;
 709    }
 710    set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
 711}
 712
 713void helper_fldcw(CPUX86State *env, uint32_t val)
 714{
 715    cpu_set_fpuc(env, val);
 716}
 717
 718void helper_fclex(CPUX86State *env)
 719{
 720    env->fpus &= 0x7f00;
 721}
 722
 723void helper_fwait(CPUX86State *env)
 724{
 725    if (env->fpus & FPUS_SE) {
 726        fpu_raise_exception(env, GETPC());
 727    }
 728}
 729
 730static void do_fninit(CPUX86State *env)
 731{
 732    env->fpus = 0;
 733    env->fpstt = 0;
 734    env->fpcs = 0;
 735    env->fpds = 0;
 736    env->fpip = 0;
 737    env->fpdp = 0;
 738    cpu_set_fpuc(env, 0x37f);
 739    env->fptags[0] = 1;
 740    env->fptags[1] = 1;
 741    env->fptags[2] = 1;
 742    env->fptags[3] = 1;
 743    env->fptags[4] = 1;
 744    env->fptags[5] = 1;
 745    env->fptags[6] = 1;
 746    env->fptags[7] = 1;
 747}
 748
 749void helper_fninit(CPUX86State *env)
 750{
 751    do_fninit(env);
 752}
 753
 754/* BCD ops */
 755
 756void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 757{
 758    floatx80 tmp;
 759    uint64_t val;
 760    unsigned int v;
 761    int i;
 762
 763    val = 0;
 764    for (i = 8; i >= 0; i--) {
 765        v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 766        val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 767    }
 768    tmp = int64_to_floatx80(val, &env->fp_status);
 769    if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 770        tmp = floatx80_chs(tmp);
 771    }
 772    fpush(env);
 773    ST0 = tmp;
 774}
 775
 776void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 777{
 778    uint8_t old_flags = save_exception_flags(env);
 779    int v;
 780    target_ulong mem_ref, mem_end;
 781    int64_t val;
 782    CPU_LDoubleU temp;
 783
 784    temp.d = ST0;
 785
 786    val = floatx80_to_int64(ST0, &env->fp_status);
 787    mem_ref = ptr;
 788    if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 789        set_float_exception_flags(float_flag_invalid, &env->fp_status);
 790        while (mem_ref < ptr + 7) {
 791            cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 792        }
 793        cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 794        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 795        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 796        merge_exception_flags(env, old_flags);
 797        return;
 798    }
 799    mem_end = mem_ref + 9;
 800    if (SIGND(temp)) {
 801        cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 802        val = -val;
 803    } else {
 804        cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 805    }
 806    while (mem_ref < mem_end) {
 807        if (val == 0) {
 808            break;
 809        }
 810        v = val % 100;
 811        val = val / 100;
 812        v = ((v / 10) << 4) | (v % 10);
 813        cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 814    }
 815    while (mem_ref < mem_end) {
 816        cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 817    }
 818    merge_exception_flags(env, old_flags);
 819}
 820
 821/* 128-bit significand of log(2).  */
 822#define ln2_sig_high 0xb17217f7d1cf79abULL
 823#define ln2_sig_low 0xc9e3b39803f2f6afULL
 824
 825/*
 826 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 827 * the interval [-1/64, 1/64].
 828 */
 829#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
 830#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
 831#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
 832#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
 833#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
 834#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
 835#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
 836#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
 837#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 838
 839struct f2xm1_data {
 840    /*
 841     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
 842     * are very close to exact floatx80 values.
 843     */
 844    floatx80 t;
 845    /* The value of 2^t.  */
 846    floatx80 exp2;
 847    /* The value of 2^t - 1.  */
 848    floatx80 exp2m1;
 849};
 850
 851static const struct f2xm1_data f2xm1_table[65] = {
 852    { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 853      make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 854      make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 855    { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 856      make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 857      make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 858    { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 859      make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 860      make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 861    { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 862      make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 863      make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 864    { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 865      make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 866      make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 867    { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 868      make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 869      make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 870    { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 871      make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 872      make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 873    { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 874      make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 875      make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 876    { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 877      make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 878      make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 879    { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 880      make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 881      make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 882    { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 883      make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 884      make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 885    { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 886      make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 887      make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 888    { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 889      make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 890      make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 891    { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 892      make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 893      make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 894    { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 895      make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 896      make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 897    { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 898      make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 899      make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 900    { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 901      make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 902      make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 903    { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 904      make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 905      make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 906    { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 907      make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 908      make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 909    { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 910      make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 911      make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 912    { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 913      make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 914      make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 915    { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 916      make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 917      make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 918    { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 919      make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 920      make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 921    { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 922      make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 923      make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 924    { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 925      make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 926      make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 927    { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 928      make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 929      make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 930    { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 931      make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 932      make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 933    { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 934      make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 935      make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 936    { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 937      make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 938      make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 939    { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 940      make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 941      make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 942    { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 943      make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 944      make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 945    { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 946      make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 947      make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 948    { floatx80_zero_init,
 949      make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 950      floatx80_zero_init },
 951    { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 952      make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 953      make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 954    { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 955      make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 956      make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 957    { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 958      make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 959      make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 960    { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 961      make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 962      make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 963    { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 964      make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 965      make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 966    { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 967      make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 968      make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 969    { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 970      make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 971      make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 972    { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 973      make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
 974      make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
 975    { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
 976      make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
 977      make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
 978    { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
 979      make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
 980      make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
 981    { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
 982      make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
 983      make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
 984    { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
 985      make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
 986      make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
 987    { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
 988      make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
 989      make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
 990    { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
 991      make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
 992      make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
 993    { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
 994      make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
 995      make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
 996    { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
 997      make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
 998      make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
 999    { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1000      make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1001      make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1002    { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1003      make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1004      make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1005    { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1006      make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1007      make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1008    { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1009      make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1010      make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1011    { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1012      make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1013      make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1014    { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1015      make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1016      make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1017    { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1018      make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1019      make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1020    { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1021      make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1022      make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1023    { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1024      make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1025      make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1026    { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1027      make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1028      make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1029    { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1030      make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1031      make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1032    { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1033      make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1034      make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1035    { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1036      make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1037      make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1038    { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1039      make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1040      make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1041    { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1042      make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1043      make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1044    { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1045      make_floatx80_init(0x4000, 0x8000000000000000ULL),
1046      make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1047};
1048
1049void helper_f2xm1(CPUX86State *env)
1050{
1051    uint8_t old_flags = save_exception_flags(env);
1052    uint64_t sig = extractFloatx80Frac(ST0);
1053    int32_t exp = extractFloatx80Exp(ST0);
1054    bool sign = extractFloatx80Sign(ST0);
1055
1056    if (floatx80_invalid_encoding(ST0)) {
1057        float_raise(float_flag_invalid, &env->fp_status);
1058        ST0 = floatx80_default_nan(&env->fp_status);
1059    } else if (floatx80_is_any_nan(ST0)) {
1060        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1061            float_raise(float_flag_invalid, &env->fp_status);
1062            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1063        }
1064    } else if (exp > 0x3fff ||
1065               (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1066        /* Out of range for the instruction, treat as invalid.  */
1067        float_raise(float_flag_invalid, &env->fp_status);
1068        ST0 = floatx80_default_nan(&env->fp_status);
1069    } else if (exp == 0x3fff) {
1070        /* Argument 1 or -1, exact result 1 or -0.5.  */
1071        if (sign) {
1072            ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1073        }
1074    } else if (exp < 0x3fb0) {
1075        if (!floatx80_is_zero(ST0)) {
1076            /*
1077             * Multiplying the argument by an extra-precision version
1078             * of log(2) is sufficiently precise.  Zero arguments are
1079             * returned unchanged.
1080             */
1081            uint64_t sig0, sig1, sig2;
1082            if (exp == 0) {
1083                normalizeFloatx80Subnormal(sig, &exp, &sig);
1084            }
1085            mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1086                            &sig2);
1087            /* This result is inexact.  */
1088            sig1 |= 1;
1089            ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1090                                                sign, exp, sig0, sig1,
1091                                                &env->fp_status);
1092        }
1093    } else {
1094        floatx80 tmp, y, accum;
1095        bool asign, bsign;
1096        int32_t n, aexp, bexp;
1097        uint64_t asig0, asig1, asig2, bsig0, bsig1;
1098        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1099        FloatX80RoundPrec save_prec =
1100            env->fp_status.floatx80_rounding_precision;
1101        env->fp_status.float_rounding_mode = float_round_nearest_even;
1102        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1103
1104        /* Find the nearest multiple of 1/32 to the argument.  */
1105        tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1106        n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1107        y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1108
1109        if (floatx80_is_zero(y)) {
1110            /*
1111             * Use the value of 2^t - 1 from the table, to avoid
1112             * needing to special-case zero as a result of
1113             * multiplication below.
1114             */
1115            ST0 = f2xm1_table[n].t;
1116            set_float_exception_flags(float_flag_inexact, &env->fp_status);
1117            env->fp_status.float_rounding_mode = save_mode;
1118        } else {
1119            /*
1120             * Compute the lower parts of a polynomial expansion for
1121             * (2^y - 1) / y.
1122             */
1123            accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1124            accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1125            accum = floatx80_mul(accum, y, &env->fp_status);
1126            accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1127            accum = floatx80_mul(accum, y, &env->fp_status);
1128            accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1129            accum = floatx80_mul(accum, y, &env->fp_status);
1130            accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1131            accum = floatx80_mul(accum, y, &env->fp_status);
1132            accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1133            accum = floatx80_mul(accum, y, &env->fp_status);
1134            accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1135            accum = floatx80_mul(accum, y, &env->fp_status);
1136            accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1137
1138            /*
1139             * The full polynomial expansion is f2xm1_coeff_0 + accum
1140             * (where accum has much lower magnitude, and so, in
1141             * particular, carry out of the addition is not possible).
1142             * (This expansion is only accurate to about 70 bits, not
1143             * 128 bits.)
1144             */
1145            aexp = extractFloatx80Exp(f2xm1_coeff_0);
1146            asign = extractFloatx80Sign(f2xm1_coeff_0);
1147            shift128RightJamming(extractFloatx80Frac(accum), 0,
1148                                 aexp - extractFloatx80Exp(accum),
1149                                 &asig0, &asig1);
1150            bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1151            bsig1 = 0;
1152            if (asign == extractFloatx80Sign(accum)) {
1153                add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1154            } else {
1155                sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1156            }
1157            /* And thus compute an approximation to 2^y - 1.  */
1158            mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1159                            &asig0, &asig1, &asig2);
1160            aexp += extractFloatx80Exp(y) - 0x3ffe;
1161            asign ^= extractFloatx80Sign(y);
1162            if (n != 32) {
1163                /*
1164                 * Multiply this by the precomputed value of 2^t and
1165                 * add that of 2^t - 1.
1166                 */
1167                mul128By64To192(asig0, asig1,
1168                                extractFloatx80Frac(f2xm1_table[n].exp2),
1169                                &asig0, &asig1, &asig2);
1170                aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1171                bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1172                bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1173                bsig1 = 0;
1174                if (bexp < aexp) {
1175                    shift128RightJamming(bsig0, bsig1, aexp - bexp,
1176                                         &bsig0, &bsig1);
1177                } else if (aexp < bexp) {
1178                    shift128RightJamming(asig0, asig1, bexp - aexp,
1179                                         &asig0, &asig1);
1180                    aexp = bexp;
1181                }
1182                /* The sign of 2^t - 1 is always that of the result.  */
1183                bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1184                if (asign == bsign) {
1185                    /* Avoid possible carry out of the addition.  */
1186                    shift128RightJamming(asig0, asig1, 1,
1187                                         &asig0, &asig1);
1188                    shift128RightJamming(bsig0, bsig1, 1,
1189                                         &bsig0, &bsig1);
1190                    ++aexp;
1191                    add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1192                } else {
1193                    sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1194                    asign = bsign;
1195                }
1196            }
1197            env->fp_status.float_rounding_mode = save_mode;
1198            /* This result is inexact.  */
1199            asig1 |= 1;
1200            ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1201                                                asign, aexp, asig0, asig1,
1202                                                &env->fp_status);
1203        }
1204
1205        env->fp_status.floatx80_rounding_precision = save_prec;
1206    }
1207    merge_exception_flags(env, old_flags);
1208}
1209
1210void helper_fptan(CPUX86State *env)
1211{
1212    double fptemp = floatx80_to_double(env, ST0);
1213
1214    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1215        env->fpus |= 0x400;
1216    } else {
1217        fptemp = tan(fptemp);
1218        ST0 = double_to_floatx80(env, fptemp);
1219        fpush(env);
1220        ST0 = floatx80_one;
1221        env->fpus &= ~0x400; /* C2 <-- 0 */
1222        /* the above code is for |arg| < 2**52 only */
1223    }
1224}
1225
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision, each
 * given as a biased exponent plus the high and low 64 bits of the
 * significand.
 *
 * pi/4, pi/2 and pi differ only by powers of two, so they share one
 * significand and differ in exponent alone; 3pi/4 has its own.
 */
#define pi_4_exp 0x3ffe
#define pi_4_sig_high 0xc90fdaa22168c234ULL
#define pi_4_sig_low 0xc4c6628b80dc1cd1ULL

#define pi_2_exp 0x3fff
#define pi_2_sig_high pi_4_sig_high
#define pi_2_sig_low pi_4_sig_low

#define pi_34_exp 0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL

#define pi_exp 0x4000
#define pi_sig_high pi_4_sig_high
#define pi_sig_low pi_4_sig_low
1239
/*
 * Polynomial coefficients for an approximation to atan(x), with only
 * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 * for some other approximations, no low part is needed for the first
 * coefficient here to achieve a sufficiently accurate result, because
 * the coefficient in this minimax approximation is very close to
 * exactly 1.)
 *
 * fpatan_coeff_k multiplies x^(2k+1): helper_fpatan evaluates
 * coefficients 1..6 as a polynomial in x^2 and then multiplies
 * (fpatan_coeff_0 + that sum) by x.  The values are close to the
 * Taylor series terms 1, -1/3, 1/5, -1/7, ..., with the signs
 * alternating.
 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL) /* 1 */
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* ~ -1/3 */
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* ~ 1/5 */
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* ~ -1/7 */
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* ~ 1/9 */
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* ~ -1/11 */
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* ~ 0.0760 */
1255
1256struct fpatan_data {
1257    /* High and low parts of atan(x).  */
1258    floatx80 atan_high, atan_low;
1259};
1260
1261static const struct fpatan_data fpatan_table[9] = {
1262    { floatx80_zero_init,
1263      floatx80_zero_init },
1264    { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1265      make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1266    { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1267      make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1268    { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1269      make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1270    { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1271      make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1272    { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1273      make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1274    { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1275      make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1276    { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1277      make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1278    { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1279      make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1280};
1281
1282void helper_fpatan(CPUX86State *env)
1283{
1284    uint8_t old_flags = save_exception_flags(env);
1285    uint64_t arg0_sig = extractFloatx80Frac(ST0);
1286    int32_t arg0_exp = extractFloatx80Exp(ST0);
1287    bool arg0_sign = extractFloatx80Sign(ST0);
1288    uint64_t arg1_sig = extractFloatx80Frac(ST1);
1289    int32_t arg1_exp = extractFloatx80Exp(ST1);
1290    bool arg1_sign = extractFloatx80Sign(ST1);
1291
1292    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1293        float_raise(float_flag_invalid, &env->fp_status);
1294        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1295    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1296        float_raise(float_flag_invalid, &env->fp_status);
1297        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1298    } else if (floatx80_invalid_encoding(ST0) ||
1299               floatx80_invalid_encoding(ST1)) {
1300        float_raise(float_flag_invalid, &env->fp_status);
1301        ST1 = floatx80_default_nan(&env->fp_status);
1302    } else if (floatx80_is_any_nan(ST0)) {
1303        ST1 = ST0;
1304    } else if (floatx80_is_any_nan(ST1)) {
1305        /* Pass this NaN through.  */
1306    } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1307        /* Pass this zero through.  */
1308    } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1309                 arg0_exp - arg1_exp >= 80) &&
1310               !arg0_sign) {
1311        /*
1312         * Dividing ST1 by ST0 gives the correct result up to
1313         * rounding, and avoids spurious underflow exceptions that
1314         * might result from passing some small values through the
1315         * polynomial approximation, but if a finite nonzero result of
1316         * division is exact, the result of fpatan is still inexact
1317         * (and underflowing where appropriate).
1318         */
1319        FloatX80RoundPrec save_prec =
1320            env->fp_status.floatx80_rounding_precision;
1321        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1322        ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1323        env->fp_status.floatx80_rounding_precision = save_prec;
1324        if (!floatx80_is_zero(ST1) &&
1325            !(get_float_exception_flags(&env->fp_status) &
1326              float_flag_inexact)) {
1327            /*
1328             * The mathematical result is very slightly closer to zero
1329             * than this exact result.  Round a value with the
1330             * significand adjusted accordingly to get the correct
1331             * exceptions, and possibly an adjusted result depending
1332             * on the rounding mode.
1333             */
1334            uint64_t sig = extractFloatx80Frac(ST1);
1335            int32_t exp = extractFloatx80Exp(ST1);
1336            bool sign = extractFloatx80Sign(ST1);
1337            if (exp == 0) {
1338                normalizeFloatx80Subnormal(sig, &exp, &sig);
1339            }
1340            ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1341                                                sign, exp, sig - 1,
1342                                                -1, &env->fp_status);
1343        }
1344    } else {
1345        /* The result is inexact.  */
1346        bool rsign = arg1_sign;
1347        int32_t rexp;
1348        uint64_t rsig0, rsig1;
1349        if (floatx80_is_zero(ST1)) {
1350            /*
1351             * ST0 is negative.  The result is pi with the sign of
1352             * ST1.
1353             */
1354            rexp = pi_exp;
1355            rsig0 = pi_sig_high;
1356            rsig1 = pi_sig_low;
1357        } else if (floatx80_is_infinity(ST1)) {
1358            if (floatx80_is_infinity(ST0)) {
1359                if (arg0_sign) {
1360                    rexp = pi_34_exp;
1361                    rsig0 = pi_34_sig_high;
1362                    rsig1 = pi_34_sig_low;
1363                } else {
1364                    rexp = pi_4_exp;
1365                    rsig0 = pi_4_sig_high;
1366                    rsig1 = pi_4_sig_low;
1367                }
1368            } else {
1369                rexp = pi_2_exp;
1370                rsig0 = pi_2_sig_high;
1371                rsig1 = pi_2_sig_low;
1372            }
1373        } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1374            rexp = pi_2_exp;
1375            rsig0 = pi_2_sig_high;
1376            rsig1 = pi_2_sig_low;
1377        } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1378            /* ST0 is negative.  */
1379            rexp = pi_exp;
1380            rsig0 = pi_sig_high;
1381            rsig1 = pi_sig_low;
1382        } else {
1383            /*
1384             * ST0 and ST1 are finite, nonzero and with exponents not
1385             * too far apart.
1386             */
1387            int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1388            int32_t azexp, axexp;
1389            bool adj_sub, ysign, zsign;
1390            uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1391            uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1392            uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1393            uint64_t azsig0, azsig1;
1394            uint64_t azsig2, azsig3, axsig0, axsig1;
1395            floatx80 x8;
1396            FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1397            FloatX80RoundPrec save_prec =
1398                env->fp_status.floatx80_rounding_precision;
1399            env->fp_status.float_rounding_mode = float_round_nearest_even;
1400            env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1401
1402            if (arg0_exp == 0) {
1403                normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1404            }
1405            if (arg1_exp == 0) {
1406                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1407            }
1408            if (arg0_exp > arg1_exp ||
1409                (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1410                /* Work with abs(ST1) / abs(ST0).  */
1411                num_exp = arg1_exp;
1412                num_sig = arg1_sig;
1413                den_exp = arg0_exp;
1414                den_sig = arg0_sig;
1415                if (arg0_sign) {
1416                    /* The result is subtracted from pi.  */
1417                    adj_exp = pi_exp;
1418                    adj_sig0 = pi_sig_high;
1419                    adj_sig1 = pi_sig_low;
1420                    adj_sub = true;
1421                } else {
1422                    /* The result is used as-is.  */
1423                    adj_exp = 0;
1424                    adj_sig0 = 0;
1425                    adj_sig1 = 0;
1426                    adj_sub = false;
1427                }
1428            } else {
1429                /* Work with abs(ST0) / abs(ST1).  */
1430                num_exp = arg0_exp;
1431                num_sig = arg0_sig;
1432                den_exp = arg1_exp;
1433                den_sig = arg1_sig;
1434                /* The result is added to or subtracted from pi/2.  */
1435                adj_exp = pi_2_exp;
1436                adj_sig0 = pi_2_sig_high;
1437                adj_sig1 = pi_2_sig_low;
1438                adj_sub = !arg0_sign;
1439            }
1440
1441            /*
1442             * Compute x = num/den, where 0 < x <= 1 and x is not too
1443             * small.
1444             */
1445            xexp = num_exp - den_exp + 0x3ffe;
1446            remsig0 = num_sig;
1447            remsig1 = 0;
1448            if (den_sig <= remsig0) {
1449                shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1450                ++xexp;
1451            }
1452            xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1453            mul64To128(den_sig, xsig0, &msig0, &msig1);
1454            sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1455            while ((int64_t) remsig0 < 0) {
1456                --xsig0;
1457                add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1458            }
1459            xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1460            /*
1461             * No need to correct any estimation error in xsig1; even
1462             * with such error, it is accurate enough.
1463             */
1464
1465            /*
1466             * Split x as x = t + y, where t = n/8 is the nearest
1467             * multiple of 1/8 to x.
1468             */
1469            x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1470                                               false, xexp + 3, xsig0,
1471                                               xsig1, &env->fp_status);
1472            n = floatx80_to_int32(x8, &env->fp_status);
1473            if (n == 0) {
1474                ysign = false;
1475                yexp = xexp;
1476                ysig0 = xsig0;
1477                ysig1 = xsig1;
1478                texp = 0;
1479                tsig = 0;
1480            } else {
1481                int shift = clz32(n) + 32;
1482                texp = 0x403b - shift;
1483                tsig = n;
1484                tsig <<= shift;
1485                if (texp == xexp) {
1486                    sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1487                    if ((int64_t) ysig0 >= 0) {
1488                        ysign = false;
1489                        if (ysig0 == 0) {
1490                            if (ysig1 == 0) {
1491                                yexp = 0;
1492                            } else {
1493                                shift = clz64(ysig1) + 64;
1494                                yexp = xexp - shift;
1495                                shift128Left(ysig0, ysig1, shift,
1496                                             &ysig0, &ysig1);
1497                            }
1498                        } else {
1499                            shift = clz64(ysig0);
1500                            yexp = xexp - shift;
1501                            shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1502                        }
1503                    } else {
1504                        ysign = true;
1505                        sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1506                        if (ysig0 == 0) {
1507                            shift = clz64(ysig1) + 64;
1508                        } else {
1509                            shift = clz64(ysig0);
1510                        }
1511                        yexp = xexp - shift;
1512                        shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1513                    }
1514                } else {
1515                    /*
1516                     * t's exponent must be greater than x's because t
1517                     * is positive and the nearest multiple of 1/8 to
1518                     * x, and if x has a greater exponent, the power
1519                     * of 2 with that exponent is also a multiple of
1520                     * 1/8.
1521                     */
1522                    uint64_t usig0, usig1;
1523                    shift128RightJamming(xsig0, xsig1, texp - xexp,
1524                                         &usig0, &usig1);
1525                    ysign = true;
1526                    sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1527                    if (ysig0 == 0) {
1528                        shift = clz64(ysig1) + 64;
1529                    } else {
1530                        shift = clz64(ysig0);
1531                    }
1532                    yexp = texp - shift;
1533                    shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1534                }
1535            }
1536
1537            /*
1538             * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1539             * arctan(z).
1540             */
1541            zsign = ysign;
1542            if (texp == 0 || yexp == 0) {
1543                zexp = yexp;
1544                zsig0 = ysig0;
1545                zsig1 = ysig1;
1546            } else {
1547                /*
1548                 * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1549                 */
1550                int32_t dexp = texp + xexp - 0x3ffe;
1551                uint64_t dsig0, dsig1, dsig2;
1552                mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1553                /*
1554                 * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1555                 * bit).  Add 1 to produce the denominator 1+tx.
1556                 */
1557                shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1558                                     &dsig0, &dsig1);
1559                dsig0 |= 0x8000000000000000ULL;
1560                zexp = yexp - 1;
1561                remsig0 = ysig0;
1562                remsig1 = ysig1;
1563                remsig2 = 0;
1564                if (dsig0 <= remsig0) {
1565                    shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1566                    ++zexp;
1567                }
1568                zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1569                mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1570                sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1571                       &remsig0, &remsig1, &remsig2);
1572                while ((int64_t) remsig0 < 0) {
1573                    --zsig0;
1574                    add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1575                           &remsig0, &remsig1, &remsig2);
1576                }
1577                zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1578                /* No need to correct any estimation error in zsig1.  */
1579            }
1580
1581            if (zexp == 0) {
1582                azexp = 0;
1583                azsig0 = 0;
1584                azsig1 = 0;
1585            } else {
1586                floatx80 z2, accum;
1587                uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1588                /* Compute z^2.  */
1589                mul128To256(zsig0, zsig1, zsig0, zsig1,
1590                            &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1591                z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1592                                                   zexp + zexp - 0x3ffe,
1593                                                   z2sig0, z2sig1,
1594                                                   &env->fp_status);
1595
1596                /* Compute the lower parts of the polynomial expansion.  */
1597                accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1598                accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1599                accum = floatx80_mul(accum, z2, &env->fp_status);
1600                accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1601                accum = floatx80_mul(accum, z2, &env->fp_status);
1602                accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1603                accum = floatx80_mul(accum, z2, &env->fp_status);
1604                accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1605                accum = floatx80_mul(accum, z2, &env->fp_status);
1606                accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1607                accum = floatx80_mul(accum, z2, &env->fp_status);
1608
1609                /*
1610                 * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1611                 * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1612                 */
1613                aexp = extractFloatx80Exp(fpatan_coeff_0);
1614                shift128RightJamming(extractFloatx80Frac(accum), 0,
1615                                     aexp - extractFloatx80Exp(accum),
1616                                     &asig0, &asig1);
1617                sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1618                       &asig0, &asig1);
1619                /* Multiply by z to compute arctan(z).  */
1620                azexp = aexp + zexp - 0x3ffe;
1621                mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1622                            &azsig2, &azsig3);
1623            }
1624
1625            /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1626            if (texp == 0) {
1627                /* z is positive.  */
1628                axexp = azexp;
1629                axsig0 = azsig0;
1630                axsig1 = azsig1;
1631            } else {
1632                bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1633                int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1634                uint64_t low_sig0 =
1635                    extractFloatx80Frac(fpatan_table[n].atan_low);
1636                uint64_t low_sig1 = 0;
1637                axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1638                axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1639                axsig1 = 0;
1640                shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1641                                     &low_sig0, &low_sig1);
1642                if (low_sign) {
1643                    sub128(axsig0, axsig1, low_sig0, low_sig1,
1644                           &axsig0, &axsig1);
1645                } else {
1646                    add128(axsig0, axsig1, low_sig0, low_sig1,
1647                           &axsig0, &axsig1);
1648                }
1649                if (azexp >= axexp) {
1650                    shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1651                                         &axsig0, &axsig1);
1652                    axexp = azexp + 1;
1653                    shift128RightJamming(azsig0, azsig1, 1,
1654                                         &azsig0, &azsig1);
1655                } else {
1656                    shift128RightJamming(axsig0, axsig1, 1,
1657                                         &axsig0, &axsig1);
1658                    shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1659                                         &azsig0, &azsig1);
1660                    ++axexp;
1661                }
1662                if (zsign) {
1663                    sub128(axsig0, axsig1, azsig0, azsig1,
1664                           &axsig0, &axsig1);
1665                } else {
1666                    add128(axsig0, axsig1, azsig0, azsig1,
1667                           &axsig0, &axsig1);
1668                }
1669            }
1670
1671            if (adj_exp == 0) {
1672                rexp = axexp;
1673                rsig0 = axsig0;
1674                rsig1 = axsig1;
1675            } else {
1676                /*
1677                 * Add or subtract arctan(x) (exponent axexp,
1678                 * significand axsig0 and axsig1, positive, not
1679                 * necessarily normalized) to the number given by
1680                 * adj_exp, adj_sig0 and adj_sig1, according to
1681                 * adj_sub.
1682                 */
1683                if (adj_exp >= axexp) {
1684                    shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1685                                         &axsig0, &axsig1);
1686                    rexp = adj_exp + 1;
1687                    shift128RightJamming(adj_sig0, adj_sig1, 1,
1688                                         &adj_sig0, &adj_sig1);
1689                } else {
1690                    shift128RightJamming(axsig0, axsig1, 1,
1691                                         &axsig0, &axsig1);
1692                    shift128RightJamming(adj_sig0, adj_sig1,
1693                                         axexp - adj_exp + 1,
1694                                         &adj_sig0, &adj_sig1);
1695                    rexp = axexp + 1;
1696                }
1697                if (adj_sub) {
1698                    sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1699                           &rsig0, &rsig1);
1700                } else {
1701                    add128(adj_sig0, adj_sig1, axsig0, axsig1,
1702                           &rsig0, &rsig1);
1703                }
1704            }
1705
1706            env->fp_status.float_rounding_mode = save_mode;
1707            env->fp_status.floatx80_rounding_precision = save_prec;
1708        }
1709        /* This result is inexact.  */
1710        rsig1 |= 1;
1711        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1712                                            rsig0, rsig1, &env->fp_status);
1713    }
1714
1715    fpop(env);
1716    merge_exception_flags(env, old_flags);
1717}
1718
1719void helper_fxtract(CPUX86State *env)
1720{
1721    uint8_t old_flags = save_exception_flags(env);
1722    CPU_LDoubleU temp;
1723
1724    temp.d = ST0;
1725
1726    if (floatx80_is_zero(ST0)) {
1727        /* Easy way to generate -inf and raising division by 0 exception */
1728        ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1729                           &env->fp_status);
1730        fpush(env);
1731        ST0 = temp.d;
1732    } else if (floatx80_invalid_encoding(ST0)) {
1733        float_raise(float_flag_invalid, &env->fp_status);
1734        ST0 = floatx80_default_nan(&env->fp_status);
1735        fpush(env);
1736        ST0 = ST1;
1737    } else if (floatx80_is_any_nan(ST0)) {
1738        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1739            float_raise(float_flag_invalid, &env->fp_status);
1740            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1741        }
1742        fpush(env);
1743        ST0 = ST1;
1744    } else if (floatx80_is_infinity(ST0)) {
1745        fpush(env);
1746        ST0 = ST1;
1747        ST1 = floatx80_infinity;
1748    } else {
1749        int expdif;
1750
1751        if (EXPD(temp) == 0) {
1752            int shift = clz64(temp.l.lower);
1753            temp.l.lower <<= shift;
1754            expdif = 1 - EXPBIAS - shift;
1755            float_raise(float_flag_input_denormal, &env->fp_status);
1756        } else {
1757            expdif = EXPD(temp) - EXPBIAS;
1758        }
1759        /* DP exponent bias */
1760        ST0 = int32_to_floatx80(expdif, &env->fp_status);
1761        fpush(env);
1762        BIASEXPONENT(temp);
1763        ST0 = temp.d;
1764    }
1765    merge_exception_flags(env, old_flags);
1766}
1767
1768static void helper_fprem_common(CPUX86State *env, bool mod)
1769{
1770    uint8_t old_flags = save_exception_flags(env);
1771    uint64_t quotient;
1772    CPU_LDoubleU temp0, temp1;
1773    int exp0, exp1, expdiff;
1774
1775    temp0.d = ST0;
1776    temp1.d = ST1;
1777    exp0 = EXPD(temp0);
1778    exp1 = EXPD(temp1);
1779
1780    env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1781    if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1782        exp0 == 0x7fff || exp1 == 0x7fff ||
1783        floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1784        ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1785    } else {
1786        if (exp0 == 0) {
1787            exp0 = 1 - clz64(temp0.l.lower);
1788        }
1789        if (exp1 == 0) {
1790            exp1 = 1 - clz64(temp1.l.lower);
1791        }
1792        expdiff = exp0 - exp1;
1793        if (expdiff < 64) {
1794            ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1795            env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1796            env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1797            env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1798        } else {
1799            /*
1800             * Partial remainder.  This choice of how many bits to
1801             * process at once is specified in AMD instruction set
1802             * manuals, and empirically is followed by Intel
1803             * processors as well; it ensures that the final remainder
1804             * operation in a loop does produce the correct low three
1805             * bits of the quotient.  AMD manuals specify that the
1806             * flags other than C2 are cleared, and empirically Intel
1807             * processors clear them as well.
1808             */
1809            int n = 32 + (expdiff % 32);
1810            temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1811            ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1812            env->fpus |= 0x400;  /* C2 <-- 1 */
1813        }
1814    }
1815    merge_exception_flags(env, old_flags);
1816}
1817
1818void helper_fprem1(CPUX86State *env)
1819{
1820    helper_fprem_common(env, false);
1821}
1822
1823void helper_fprem(CPUX86State *env)
1824{
1825    helper_fprem_common(env, true);
1826}
1827
/* log2(e) as a 128-bit significand, split into two 64-bit halves.  */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low  0xbe87fed0691d3e89ULL

/*
 * Coefficients of a polynomial approximating log2((1+x)/(1-x)) that
 * uses odd powers of x only.  It is meant for x in the interval
 * [2*sqrt(2)-3, 3-2*sqrt(2)], i.e. for logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].  fyl2x_coeff_0_low is a low-order
 * correction that is added separately from fyl2x_coeff_0.
 */
#define fyl2x_coeff_0     make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1     make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2     make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3     make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4     make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5     make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6     make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7     make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8     make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9     make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1849
1850/*
1851 * Compute an approximation of log2(1+arg), where 1+arg is in the
1852 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1853 * function is called, rounding precision is set to 80 and the
1854 * round-to-nearest mode is in effect.  arg must not be exactly zero,
1855 * and must not be so close to zero that underflow might occur.
1856 */
1857static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1858                                uint64_t *sig0, uint64_t *sig1)
1859{
1860    uint64_t arg0_sig = extractFloatx80Frac(arg);
1861    int32_t arg0_exp = extractFloatx80Exp(arg);
1862    bool arg0_sign = extractFloatx80Sign(arg);
1863    bool asign;
1864    int32_t dexp, texp, aexp;
1865    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1866    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1867    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1868    floatx80 t2, accum;
1869
1870    /*
1871     * Compute an approximation of arg/(2+arg), with extra precision,
1872     * as the argument to a polynomial approximation.  The extra
1873     * precision is only needed for the first term of the
1874     * approximation, with subsequent terms being significantly
1875     * smaller; the approximation only uses odd exponents, and the
1876     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1877     */
1878    if (arg0_sign) {
1879        dexp = 0x3fff;
1880        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1881        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1882    } else {
1883        dexp = 0x4000;
1884        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1885        dsig0 |= 0x8000000000000000ULL;
1886    }
1887    texp = arg0_exp - dexp + 0x3ffe;
1888    rsig0 = arg0_sig;
1889    rsig1 = 0;
1890    rsig2 = 0;
1891    if (dsig0 <= rsig0) {
1892        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1893        ++texp;
1894    }
1895    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1896    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1897    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1898           &rsig0, &rsig1, &rsig2);
1899    while ((int64_t) rsig0 < 0) {
1900        --tsig0;
1901        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1902               &rsig0, &rsig1, &rsig2);
1903    }
1904    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1905    /*
1906     * No need to correct any estimation error in tsig1; even with
1907     * such error, it is accurate enough.  Now compute the square of
1908     * that approximation.
1909     */
1910    mul128To256(tsig0, tsig1, tsig0, tsig1,
1911                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1912    t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1913                                       texp + texp - 0x3ffe,
1914                                       t2sig0, t2sig1, &env->fp_status);
1915
1916    /* Compute the lower parts of the polynomial expansion.  */
1917    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1918    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1919    accum = floatx80_mul(accum, t2, &env->fp_status);
1920    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1921    accum = floatx80_mul(accum, t2, &env->fp_status);
1922    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1923    accum = floatx80_mul(accum, t2, &env->fp_status);
1924    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1925    accum = floatx80_mul(accum, t2, &env->fp_status);
1926    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1927    accum = floatx80_mul(accum, t2, &env->fp_status);
1928    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1929    accum = floatx80_mul(accum, t2, &env->fp_status);
1930    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1931    accum = floatx80_mul(accum, t2, &env->fp_status);
1932    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1933    accum = floatx80_mul(accum, t2, &env->fp_status);
1934    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1935
1936    /*
1937     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1938     * accum has much lower magnitude, and so, in particular, carry
1939     * out of the addition is not possible), multiplied by t.  (This
1940     * expansion is only accurate to about 70 bits, not 128 bits.)
1941     */
1942    aexp = extractFloatx80Exp(fyl2x_coeff_0);
1943    asign = extractFloatx80Sign(fyl2x_coeff_0);
1944    shift128RightJamming(extractFloatx80Frac(accum), 0,
1945                         aexp - extractFloatx80Exp(accum),
1946                         &asig0, &asig1);
1947    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1948    bsig1 = 0;
1949    if (asign == extractFloatx80Sign(accum)) {
1950        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1951    } else {
1952        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1953    }
1954    /* Multiply by t to compute the required result.  */
1955    mul128To256(asig0, asig1, tsig0, tsig1,
1956                &asig0, &asig1, &asig2, &asig3);
1957    aexp += texp - 0x3ffe;
1958    *exp = aexp;
1959    *sig0 = asig0;
1960    *sig1 = asig1;
1961}
1962
1963void helper_fyl2xp1(CPUX86State *env)
1964{
1965    uint8_t old_flags = save_exception_flags(env);
1966    uint64_t arg0_sig = extractFloatx80Frac(ST0);
1967    int32_t arg0_exp = extractFloatx80Exp(ST0);
1968    bool arg0_sign = extractFloatx80Sign(ST0);
1969    uint64_t arg1_sig = extractFloatx80Frac(ST1);
1970    int32_t arg1_exp = extractFloatx80Exp(ST1);
1971    bool arg1_sign = extractFloatx80Sign(ST1);
1972
1973    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1974        float_raise(float_flag_invalid, &env->fp_status);
1975        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1976    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1977        float_raise(float_flag_invalid, &env->fp_status);
1978        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1979    } else if (floatx80_invalid_encoding(ST0) ||
1980               floatx80_invalid_encoding(ST1)) {
1981        float_raise(float_flag_invalid, &env->fp_status);
1982        ST1 = floatx80_default_nan(&env->fp_status);
1983    } else if (floatx80_is_any_nan(ST0)) {
1984        ST1 = ST0;
1985    } else if (floatx80_is_any_nan(ST1)) {
1986        /* Pass this NaN through.  */
1987    } else if (arg0_exp > 0x3ffd ||
1988               (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
1989                                                  0x95f619980c4336f7ULL :
1990                                                  0xd413cccfe7799211ULL))) {
1991        /*
1992         * Out of range for the instruction (ST0 must have absolute
1993         * value less than 1 - sqrt(2)/2 = 0.292..., according to
1994         * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
1995         * to sqrt(2) - 1, which we allow here), treat as invalid.
1996         */
1997        float_raise(float_flag_invalid, &env->fp_status);
1998        ST1 = floatx80_default_nan(&env->fp_status);
1999    } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2000               arg1_exp == 0x7fff) {
2001        /*
2002         * One argument is zero, or multiplying by infinity; correct
2003         * result is exact and can be obtained by multiplying the
2004         * arguments.
2005         */
2006        ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2007    } else if (arg0_exp < 0x3fb0) {
2008        /*
2009         * Multiplying both arguments and an extra-precision version
2010         * of log2(e) is sufficiently precise.
2011         */
2012        uint64_t sig0, sig1, sig2;
2013        int32_t exp;
2014        if (arg0_exp == 0) {
2015            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2016        }
2017        if (arg1_exp == 0) {
2018            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2019        }
2020        mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2021                        &sig0, &sig1, &sig2);
2022        exp = arg0_exp + 1;
2023        mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2024        exp += arg1_exp - 0x3ffe;
2025        /* This result is inexact.  */
2026        sig1 |= 1;
2027        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2028                                            arg0_sign ^ arg1_sign, exp,
2029                                            sig0, sig1, &env->fp_status);
2030    } else {
2031        int32_t aexp;
2032        uint64_t asig0, asig1, asig2;
2033        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2034        FloatX80RoundPrec save_prec =
2035            env->fp_status.floatx80_rounding_precision;
2036        env->fp_status.float_rounding_mode = float_round_nearest_even;
2037        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2038
2039        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2040        /*
2041         * Multiply by the second argument to compute the required
2042         * result.
2043         */
2044        if (arg1_exp == 0) {
2045            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2046        }
2047        mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2048        aexp += arg1_exp - 0x3ffe;
2049        /* This result is inexact.  */
2050        asig1 |= 1;
2051        env->fp_status.float_rounding_mode = save_mode;
2052        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2053                                            arg0_sign ^ arg1_sign, aexp,
2054                                            asig0, asig1, &env->fp_status);
2055        env->fp_status.floatx80_rounding_precision = save_prec;
2056    }
2057    fpop(env);
2058    merge_exception_flags(env, old_flags);
2059}
2060
2061void helper_fyl2x(CPUX86State *env)
2062{
2063    uint8_t old_flags = save_exception_flags(env);
2064    uint64_t arg0_sig = extractFloatx80Frac(ST0);
2065    int32_t arg0_exp = extractFloatx80Exp(ST0);
2066    bool arg0_sign = extractFloatx80Sign(ST0);
2067    uint64_t arg1_sig = extractFloatx80Frac(ST1);
2068    int32_t arg1_exp = extractFloatx80Exp(ST1);
2069    bool arg1_sign = extractFloatx80Sign(ST1);
2070
2071    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2072        float_raise(float_flag_invalid, &env->fp_status);
2073        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2074    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2075        float_raise(float_flag_invalid, &env->fp_status);
2076        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2077    } else if (floatx80_invalid_encoding(ST0) ||
2078               floatx80_invalid_encoding(ST1)) {
2079        float_raise(float_flag_invalid, &env->fp_status);
2080        ST1 = floatx80_default_nan(&env->fp_status);
2081    } else if (floatx80_is_any_nan(ST0)) {
2082        ST1 = ST0;
2083    } else if (floatx80_is_any_nan(ST1)) {
2084        /* Pass this NaN through.  */
2085    } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2086        float_raise(float_flag_invalid, &env->fp_status);
2087        ST1 = floatx80_default_nan(&env->fp_status);
2088    } else if (floatx80_is_infinity(ST1)) {
2089        FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2090                                             &env->fp_status);
2091        switch (cmp) {
2092        case float_relation_less:
2093            ST1 = floatx80_chs(ST1);
2094            break;
2095        case float_relation_greater:
2096            /* Result is infinity of the same sign as ST1.  */
2097            break;
2098        default:
2099            float_raise(float_flag_invalid, &env->fp_status);
2100            ST1 = floatx80_default_nan(&env->fp_status);
2101            break;
2102        }
2103    } else if (floatx80_is_infinity(ST0)) {
2104        if (floatx80_is_zero(ST1)) {
2105            float_raise(float_flag_invalid, &env->fp_status);
2106            ST1 = floatx80_default_nan(&env->fp_status);
2107        } else if (arg1_sign) {
2108            ST1 = floatx80_chs(ST0);
2109        } else {
2110            ST1 = ST0;
2111        }
2112    } else if (floatx80_is_zero(ST0)) {
2113        if (floatx80_is_zero(ST1)) {
2114            float_raise(float_flag_invalid, &env->fp_status);
2115            ST1 = floatx80_default_nan(&env->fp_status);
2116        } else {
2117            /* Result is infinity with opposite sign to ST1.  */
2118            float_raise(float_flag_divbyzero, &env->fp_status);
2119            ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2120                                0x8000000000000000ULL);
2121        }
2122    } else if (floatx80_is_zero(ST1)) {
2123        if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2124            ST1 = floatx80_chs(ST1);
2125        }
2126        /* Otherwise, ST1 is already the correct result.  */
2127    } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2128        if (arg1_sign) {
2129            ST1 = floatx80_chs(floatx80_zero);
2130        } else {
2131            ST1 = floatx80_zero;
2132        }
2133    } else {
2134        int32_t int_exp;
2135        floatx80 arg0_m1;
2136        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2137        FloatX80RoundPrec save_prec =
2138            env->fp_status.floatx80_rounding_precision;
2139        env->fp_status.float_rounding_mode = float_round_nearest_even;
2140        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2141
2142        if (arg0_exp == 0) {
2143            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2144        }
2145        if (arg1_exp == 0) {
2146            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2147        }
2148        int_exp = arg0_exp - 0x3fff;
2149        if (arg0_sig > 0xb504f333f9de6484ULL) {
2150            ++int_exp;
2151        }
2152        arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2153                                               &env->fp_status),
2154                               floatx80_one, &env->fp_status);
2155        if (floatx80_is_zero(arg0_m1)) {
2156            /* Exact power of 2; multiply by ST1.  */
2157            env->fp_status.float_rounding_mode = save_mode;
2158            ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2159                               ST1, &env->fp_status);
2160        } else {
2161            bool asign = extractFloatx80Sign(arg0_m1);
2162            int32_t aexp;
2163            uint64_t asig0, asig1, asig2;
2164            helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2165            if (int_exp != 0) {
2166                bool isign = (int_exp < 0);
2167                int32_t iexp;
2168                uint64_t isig;
2169                int shift;
2170                int_exp = isign ? -int_exp : int_exp;
2171                shift = clz32(int_exp) + 32;
2172                isig = int_exp;
2173                isig <<= shift;
2174                iexp = 0x403e - shift;
2175                shift128RightJamming(asig0, asig1, iexp - aexp,
2176                                     &asig0, &asig1);
2177                if (asign == isign) {
2178                    add128(isig, 0, asig0, asig1, &asig0, &asig1);
2179                } else {
2180                    sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2181                }
2182                aexp = iexp;
2183                asign = isign;
2184            }
2185            /*
2186             * Multiply by the second argument to compute the required
2187             * result.
2188             */
2189            if (arg1_exp == 0) {
2190                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2191            }
2192            mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2193            aexp += arg1_exp - 0x3ffe;
2194            /* This result is inexact.  */
2195            asig1 |= 1;
2196            env->fp_status.float_rounding_mode = save_mode;
2197            ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2198                                                asign ^ arg1_sign, aexp,
2199                                                asig0, asig1, &env->fp_status);
2200        }
2201
2202        env->fp_status.floatx80_rounding_precision = save_prec;
2203    }
2204    fpop(env);
2205    merge_exception_flags(env, old_flags);
2206}
2207
2208void helper_fsqrt(CPUX86State *env)
2209{
2210    uint8_t old_flags = save_exception_flags(env);
2211    if (floatx80_is_neg(ST0)) {
2212        env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2213        env->fpus |= 0x400;
2214    }
2215    ST0 = floatx80_sqrt(ST0, &env->fp_status);
2216    merge_exception_flags(env, old_flags);
2217}
2218
2219void helper_fsincos(CPUX86State *env)
2220{
2221    double fptemp = floatx80_to_double(env, ST0);
2222
2223    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2224        env->fpus |= 0x400;
2225    } else {
2226        ST0 = double_to_floatx80(env, sin(fptemp));
2227        fpush(env);
2228        ST0 = double_to_floatx80(env, cos(fptemp));
2229        env->fpus &= ~0x400;  /* C2 <-- 0 */
2230        /* the above code is for |arg| < 2**63 only */
2231    }
2232}
2233
2234void helper_frndint(CPUX86State *env)
2235{
2236    uint8_t old_flags = save_exception_flags(env);
2237    ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2238    merge_exception_flags(env, old_flags);
2239}
2240
2241void helper_fscale(CPUX86State *env)
2242{
2243    uint8_t old_flags = save_exception_flags(env);
2244    if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2245        float_raise(float_flag_invalid, &env->fp_status);
2246        ST0 = floatx80_default_nan(&env->fp_status);
2247    } else if (floatx80_is_any_nan(ST1)) {
2248        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2249            float_raise(float_flag_invalid, &env->fp_status);
2250        }
2251        ST0 = ST1;
2252        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2253            float_raise(float_flag_invalid, &env->fp_status);
2254            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2255        }
2256    } else if (floatx80_is_infinity(ST1) &&
2257               !floatx80_invalid_encoding(ST0) &&
2258               !floatx80_is_any_nan(ST0)) {
2259        if (floatx80_is_neg(ST1)) {
2260            if (floatx80_is_infinity(ST0)) {
2261                float_raise(float_flag_invalid, &env->fp_status);
2262                ST0 = floatx80_default_nan(&env->fp_status);
2263            } else {
2264                ST0 = (floatx80_is_neg(ST0) ?
2265                       floatx80_chs(floatx80_zero) :
2266                       floatx80_zero);
2267            }
2268        } else {
2269            if (floatx80_is_zero(ST0)) {
2270                float_raise(float_flag_invalid, &env->fp_status);
2271                ST0 = floatx80_default_nan(&env->fp_status);
2272            } else {
2273                ST0 = (floatx80_is_neg(ST0) ?
2274                       floatx80_chs(floatx80_infinity) :
2275                       floatx80_infinity);
2276            }
2277        }
2278    } else {
2279        int n;
2280        FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2281        uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2282        set_float_exception_flags(0, &env->fp_status);
2283        n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2284        set_float_exception_flags(save_flags, &env->fp_status);
2285        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2286        ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2287        env->fp_status.floatx80_rounding_precision = save;
2288    }
2289    merge_exception_flags(env, old_flags);
2290}
2291
2292void helper_fsin(CPUX86State *env)
2293{
2294    double fptemp = floatx80_to_double(env, ST0);
2295
2296    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2297        env->fpus |= 0x400;
2298    } else {
2299        ST0 = double_to_floatx80(env, sin(fptemp));
2300        env->fpus &= ~0x400;  /* C2 <-- 0 */
2301        /* the above code is for |arg| < 2**53 only */
2302    }
2303}
2304
2305void helper_fcos(CPUX86State *env)
2306{
2307    double fptemp = floatx80_to_double(env, ST0);
2308
2309    if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310        env->fpus |= 0x400;
2311    } else {
2312        ST0 = double_to_floatx80(env, cos(fptemp));
2313        env->fpus &= ~0x400;  /* C2 <-- 0 */
2314        /* the above code is for |arg| < 2**63 only */
2315    }
2316}
2317
2318void helper_fxam_ST0(CPUX86State *env)
2319{
2320    CPU_LDoubleU temp;
2321    int expdif;
2322
2323    temp.d = ST0;
2324
2325    env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2326    if (SIGND(temp)) {
2327        env->fpus |= 0x200; /* C1 <-- 1 */
2328    }
2329
2330    if (env->fptags[env->fpstt]) {
2331        env->fpus |= 0x4100; /* Empty */
2332        return;
2333    }
2334
2335    expdif = EXPD(temp);
2336    if (expdif == MAXEXPD) {
2337        if (MANTD(temp) == 0x8000000000000000ULL) {
2338            env->fpus |= 0x500; /* Infinity */
2339        } else if (MANTD(temp) & 0x8000000000000000ULL) {
2340            env->fpus |= 0x100; /* NaN */
2341        }
2342    } else if (expdif == 0) {
2343        if (MANTD(temp) == 0) {
2344            env->fpus |=  0x4000; /* Zero */
2345        } else {
2346            env->fpus |= 0x4400; /* Denormal */
2347        }
2348    } else if (MANTD(temp) & 0x8000000000000000ULL) {
2349        env->fpus |= 0x400;
2350    }
2351}
2352
2353static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2354                      uintptr_t retaddr)
2355{
2356    int fpus, fptag, exp, i;
2357    uint64_t mant;
2358    CPU_LDoubleU tmp;
2359
2360    fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2361    fptag = 0;
2362    for (i = 7; i >= 0; i--) {
2363        fptag <<= 2;
2364        if (env->fptags[i]) {
2365            fptag |= 3;
2366        } else {
2367            tmp.d = env->fpregs[i].d;
2368            exp = EXPD(tmp);
2369            mant = MANTD(tmp);
2370            if (exp == 0 && mant == 0) {
2371                /* zero */
2372                fptag |= 1;
2373            } else if (exp == 0 || exp == MAXEXPD
2374                       || (mant & (1LL << 63)) == 0) {
2375                /* NaNs, infinity, denormal */
2376                fptag |= 2;
2377            }
2378        }
2379    }
2380    if (data32) {
2381        /* 32 bit */
2382        cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2383        cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2384        cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2385        cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2386        cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2387        cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2388        cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2389    } else {
2390        /* 16 bit */
2391        cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2392        cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2393        cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2394        cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2395        cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2396        cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2397        cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2398    }
2399}
2400
2401void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2402{
2403    do_fstenv(env, ptr, data32, GETPC());
2404}
2405
2406static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2407{
2408    env->fpstt = (fpus >> 11) & 7;
2409    env->fpus = fpus & ~0x3800 & ~FPUS_B;
2410    env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2411#if !defined(CONFIG_USER_ONLY)
2412    if (!(env->fpus & FPUS_SE)) {
2413        /*
2414         * Here the processor deasserts FERR#; in response, the chipset deasserts
2415         * IGNNE#.
2416         */
2417        cpu_clear_ignne();
2418    }
2419#endif
2420}
2421
2422static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2423                      uintptr_t retaddr)
2424{
2425    int i, fpus, fptag;
2426
2427    if (data32) {
2428        cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2429        fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2430        fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2431    } else {
2432        cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2433        fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2434        fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2435    }
2436    cpu_set_fpus(env, fpus);
2437    for (i = 0; i < 8; i++) {
2438        env->fptags[i] = ((fptag & 3) == 3);
2439        fptag >>= 2;
2440    }
2441}
2442
2443void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2444{
2445    do_fldenv(env, ptr, data32, GETPC());
2446}
2447
2448static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2449                     uintptr_t retaddr)
2450{
2451    floatx80 tmp;
2452    int i;
2453
2454    do_fstenv(env, ptr, data32, retaddr);
2455
2456    ptr += (14 << data32);
2457    for (i = 0; i < 8; i++) {
2458        tmp = ST(i);
2459        do_fstt(env, tmp, ptr, retaddr);
2460        ptr += 10;
2461    }
2462
2463    do_fninit(env);
2464}
2465
2466void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2467{
2468    do_fsave(env, ptr, data32, GETPC());
2469}
2470
2471static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2472                      uintptr_t retaddr)
2473{
2474    floatx80 tmp;
2475    int i;
2476
2477    do_fldenv(env, ptr, data32, retaddr);
2478    ptr += (14 << data32);
2479
2480    for (i = 0; i < 8; i++) {
2481        tmp = do_fldt(env, ptr, retaddr);
2482        ST(i) = tmp;
2483        ptr += 10;
2484    }
2485}
2486
2487void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2488{
2489    do_frstor(env, ptr, data32, GETPC());
2490}
2491
#if defined(CONFIG_USER_ONLY)
/*
 * Entry points built only for CONFIG_USER_ONLY.  They run the same
 * save/restore code as the TCG helpers but pass 0 as the return address
 * instead of GETPC().
 */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_retaddr = 0;

    do_fsave(env, ptr, data32, no_retaddr);
}

void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_retaddr = 0;

    do_frstor(env, ptr, data32, no_retaddr);
}
#endif
2503
/*
 * XO(field): byte offset of @field within X86XSaveArea.  Used by the
 * save/restore routines below and undefined again after them.
 */
#define XO(X)  offsetof(X86XSaveArea, X)
2505
2506static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2507{
2508    int fpus, fptag, i;
2509    target_ulong addr;
2510
2511    fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2512    fptag = 0;
2513    for (i = 0; i < 8; i++) {
2514        fptag |= (env->fptags[i] << i);
2515    }
2516
2517    cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2518    cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2519    cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2520
2521    /* In 32-bit mode this is eip, sel, dp, sel.
2522       In 64-bit mode this is rip, rdp.
2523       But in either case we don't write actual data, just zeros.  */
2524    cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2525    cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2526
2527    addr = ptr + XO(legacy.fpregs);
2528    for (i = 0; i < 8; i++) {
2529        floatx80 tmp = ST(i);
2530        do_fstt(env, tmp, addr, ra);
2531        addr += 16;
2532    }
2533}
2534
2535static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2536{
2537    update_mxcsr_from_sse_status(env);
2538    cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2539    cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2540}
2541
2542static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2543{
2544    int i, nb_xmm_regs;
2545    target_ulong addr;
2546
2547    if (env->hflags & HF_CS64_MASK) {
2548        nb_xmm_regs = 16;
2549    } else {
2550        nb_xmm_regs = 8;
2551    }
2552
2553    addr = ptr + XO(legacy.xmm_regs);
2554    for (i = 0; i < nb_xmm_regs; i++) {
2555        cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2556        cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2557        addr += 16;
2558    }
2559}
2560
2561static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2562{
2563    target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2564    int i;
2565
2566    for (i = 0; i < 4; i++, addr += 16) {
2567        cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2568        cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2569    }
2570}
2571
2572static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2573{
2574    cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2575                    env->bndcs_regs.cfgu, ra);
2576    cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2577                    env->bndcs_regs.sts, ra);
2578}
2579
/* Write env->pkru as a 64-bit value to the PKRU component at @ptr. */
static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
{
    cpu_stq_data_ra(env, ptr, env->pkru, ra);
}
2584
2585static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2586{
2587    /* The operand must be 16 byte aligned */
2588    if (ptr & 0xf) {
2589        raise_exception_ra(env, EXCP0D_GPF, ra);
2590    }
2591
2592    do_xsave_fpu(env, ptr, ra);
2593
2594    if (env->cr[4] & CR4_OSFXSR_MASK) {
2595        do_xsave_mxcsr(env, ptr, ra);
2596        /* Fast FXSAVE leaves out the XMM registers */
2597        if (!(env->efer & MSR_EFER_FFXSR)
2598            || (env->hflags & HF_CPL_MASK)
2599            || !(env->hflags & HF_LMA_MASK)) {
2600            do_xsave_sse(env, ptr, ra);
2601        }
2602    }
2603}
2604
2605void helper_fxsave(CPUX86State *env, target_ulong ptr)
2606{
2607    do_fxsave(env, ptr, GETPC());
2608}
2609
2610static uint64_t get_xinuse(CPUX86State *env)
2611{
2612    uint64_t inuse = -1;
2613
2614    /* For the most part, we don't track XINUSE.  We could calculate it
2615       here for all components, but it's probably less work to simply
2616       indicate in use.  That said, the state of BNDREGS is important
2617       enough to track in HFLAGS, so we might as well use that here.  */
2618    if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2619       inuse &= ~XSTATE_BNDREGS_MASK;
2620    }
2621    return inuse;
2622}
2623
2624static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2625                     uint64_t inuse, uint64_t opt, uintptr_t ra)
2626{
2627    uint64_t old_bv, new_bv;
2628
2629    /* The OS must have enabled XSAVE.  */
2630    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2631        raise_exception_ra(env, EXCP06_ILLOP, ra);
2632    }
2633
2634    /* The operand must be 64 byte aligned.  */
2635    if (ptr & 63) {
2636        raise_exception_ra(env, EXCP0D_GPF, ra);
2637    }
2638
2639    /* Never save anything not enabled by XCR0.  */
2640    rfbm &= env->xcr0;
2641    opt &= rfbm;
2642
2643    if (opt & XSTATE_FP_MASK) {
2644        do_xsave_fpu(env, ptr, ra);
2645    }
2646    if (rfbm & XSTATE_SSE_MASK) {
2647        /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2648        do_xsave_mxcsr(env, ptr, ra);
2649    }
2650    if (opt & XSTATE_SSE_MASK) {
2651        do_xsave_sse(env, ptr, ra);
2652    }
2653    if (opt & XSTATE_BNDREGS_MASK) {
2654        do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2655    }
2656    if (opt & XSTATE_BNDCSR_MASK) {
2657        do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2658    }
2659    if (opt & XSTATE_PKRU_MASK) {
2660        do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2661    }
2662
2663    /* Update the XSTATE_BV field.  */
2664    old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2665    new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2666    cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2667}
2668
2669void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2670{
2671    do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2672}
2673
2674void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2675{
2676    uint64_t inuse = get_xinuse(env);
2677    do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2678}
2679
2680static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2681{
2682    int i, fpuc, fpus, fptag;
2683    target_ulong addr;
2684
2685    fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2686    fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2687    fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2688    cpu_set_fpuc(env, fpuc);
2689    cpu_set_fpus(env, fpus);
2690    fptag ^= 0xff;
2691    for (i = 0; i < 8; i++) {
2692        env->fptags[i] = ((fptag >> i) & 1);
2693    }
2694
2695    addr = ptr + XO(legacy.fpregs);
2696    for (i = 0; i < 8; i++) {
2697        floatx80 tmp = do_fldt(env, addr, ra);
2698        ST(i) = tmp;
2699        addr += 16;
2700    }
2701}
2702
2703static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2704{
2705    cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2706}
2707
2708static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2709{
2710    int i, nb_xmm_regs;
2711    target_ulong addr;
2712
2713    if (env->hflags & HF_CS64_MASK) {
2714        nb_xmm_regs = 16;
2715    } else {
2716        nb_xmm_regs = 8;
2717    }
2718
2719    addr = ptr + XO(legacy.xmm_regs);
2720    for (i = 0; i < nb_xmm_regs; i++) {
2721        env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2722        env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2723        addr += 16;
2724    }
2725}
2726
2727static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2728{
2729    target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2730    int i;
2731
2732    for (i = 0; i < 4; i++, addr += 16) {
2733        env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2734        env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2735    }
2736}
2737
2738static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2739{
2740    /* FIXME: Extend highest implemented bit of linear address.  */
2741    env->bndcs_regs.cfgu
2742        = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2743    env->bndcs_regs.sts
2744        = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2745}
2746
/*
 * Reload env->pkru from the 64-bit value at @ptr.  Any follow-up needed
 * when the value changes is left to the caller.
 */
static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
{
    env->pkru = cpu_ldq_data_ra(env, ptr, ra);
}
2751
2752static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2753{
2754    /* The operand must be 16 byte aligned */
2755    if (ptr & 0xf) {
2756        raise_exception_ra(env, EXCP0D_GPF, ra);
2757    }
2758
2759    do_xrstor_fpu(env, ptr, ra);
2760
2761    if (env->cr[4] & CR4_OSFXSR_MASK) {
2762        do_xrstor_mxcsr(env, ptr, ra);
2763        /* Fast FXRSTOR leaves out the XMM registers */
2764        if (!(env->efer & MSR_EFER_FFXSR)
2765            || (env->hflags & HF_CPL_MASK)
2766            || !(env->hflags & HF_LMA_MASK)) {
2767            do_xrstor_sse(env, ptr, ra);
2768        }
2769    }
2770}
2771
2772void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2773{
2774    do_fxrstor(env, ptr, GETPC());
2775}
2776
#if defined(CONFIG_USER_ONLY)
/*
 * Entry points built only for CONFIG_USER_ONLY.  They run the same
 * FXSAVE/FXRSTOR code as the TCG helpers but pass 0 as the return
 * address instead of GETPC().
 */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_retaddr = 0;

    do_fxsave(env, ptr, no_retaddr);
}

void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_retaddr = 0;

    do_fxrstor(env, ptr, no_retaddr);
}
#endif
2788
2789void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2790{
2791    uintptr_t ra = GETPC();
2792    uint64_t xstate_bv, xcomp_bv, reserve0;
2793
2794    rfbm &= env->xcr0;
2795
2796    /* The OS must have enabled XSAVE.  */
2797    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2798        raise_exception_ra(env, EXCP06_ILLOP, ra);
2799    }
2800
2801    /* The operand must be 64 byte aligned.  */
2802    if (ptr & 63) {
2803        raise_exception_ra(env, EXCP0D_GPF, ra);
2804    }
2805
2806    xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2807
2808    if ((int64_t)xstate_bv < 0) {
2809        /* FIXME: Compact form.  */
2810        raise_exception_ra(env, EXCP0D_GPF, ra);
2811    }
2812
2813    /* Standard form.  */
2814
2815    /* The XSTATE_BV field must not set bits not present in XCR0.  */
2816    if (xstate_bv & ~env->xcr0) {
2817        raise_exception_ra(env, EXCP0D_GPF, ra);
2818    }
2819
2820    /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2821       revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2822       describes only XCOMP_BV, but the description of the standard form
2823       of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2824       includes the next 64-bit field.  */
2825    xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2826    reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2827    if (xcomp_bv || reserve0) {
2828        raise_exception_ra(env, EXCP0D_GPF, ra);
2829    }
2830
2831    if (rfbm & XSTATE_FP_MASK) {
2832        if (xstate_bv & XSTATE_FP_MASK) {
2833            do_xrstor_fpu(env, ptr, ra);
2834        } else {
2835            do_fninit(env);
2836            memset(env->fpregs, 0, sizeof(env->fpregs));
2837        }
2838    }
2839    if (rfbm & XSTATE_SSE_MASK) {
2840        /* Note that the standard form of XRSTOR loads MXCSR from memory
2841           whether or not the XSTATE_BV bit is set.  */
2842        do_xrstor_mxcsr(env, ptr, ra);
2843        if (xstate_bv & XSTATE_SSE_MASK) {
2844            do_xrstor_sse(env, ptr, ra);
2845        } else {
2846            /* ??? When AVX is implemented, we may have to be more
2847               selective in the clearing.  */
2848            memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2849        }
2850    }
2851    if (rfbm & XSTATE_BNDREGS_MASK) {
2852        if (xstate_bv & XSTATE_BNDREGS_MASK) {
2853            do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2854            env->hflags |= HF_MPX_IU_MASK;
2855        } else {
2856            memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2857            env->hflags &= ~HF_MPX_IU_MASK;
2858        }
2859    }
2860    if (rfbm & XSTATE_BNDCSR_MASK) {
2861        if (xstate_bv & XSTATE_BNDCSR_MASK) {
2862            do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2863        } else {
2864            memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2865        }
2866        cpu_sync_bndcs_hflags(env);
2867    }
2868    if (rfbm & XSTATE_PKRU_MASK) {
2869        uint64_t old_pkru = env->pkru;
2870        if (xstate_bv & XSTATE_PKRU_MASK) {
2871            do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2872        } else {
2873            env->pkru = 0;
2874        }
2875        if (env->pkru != old_pkru) {
2876            CPUState *cs = env_cpu(env);
2877            tlb_flush(cs);
2878        }
2879    }
2880}
2881
/* XO() is only meant for the XSAVE-area accessors above. */
#undef XO
2883
2884uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2885{
2886    /* The OS must have enabled XSAVE.  */
2887    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2888        raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2889    }
2890
2891    switch (ecx) {
2892    case 0:
2893        return env->xcr0;
2894    case 1:
2895        if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2896            return env->xcr0 & get_xinuse(env);
2897        }
2898        break;
2899    }
2900    raise_exception_ra(env, EXCP0D_GPF, GETPC());
2901}
2902
2903void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2904{
2905    uint32_t dummy, ena_lo, ena_hi;
2906    uint64_t ena;
2907
2908    /* The OS must have enabled XSAVE.  */
2909    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2910        raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2911    }
2912
2913    /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2914    if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2915        goto do_gpf;
2916    }
2917
2918    /* Disallow enabling unimplemented features.  */
2919    cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2920    ena = ((uint64_t)ena_hi << 32) | ena_lo;
2921    if (mask & ~ena) {
2922        goto do_gpf;
2923    }
2924
2925    /* Disallow enabling only half of MPX.  */
2926    if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2927        & XSTATE_BNDCSR_MASK) {
2928        goto do_gpf;
2929    }
2930
2931    env->xcr0 = mask;
2932    cpu_sync_bndcs_hflags(env);
2933    return;
2934
2935 do_gpf:
2936    raise_exception_ra(env, EXCP0D_GPF, GETPC());
2937}
2938
2939/* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
2941
/*
 * MXCSR control bits consumed by update_mxcsr_status(): denormals-are-
 * zero, the two-bit rounding-control field with its four encodings, and
 * flush-to-zero.
 */
#define SSE_DAZ             0x0040
#define SSE_RC_MASK         0x6000
#define SSE_RC_NEAR         0x0000
#define SSE_RC_DOWN         0x2000
#define SSE_RC_UP           0x4000
#define SSE_RC_CHOP         0x6000
#define SSE_FZ              0x8000
2949
2950void update_mxcsr_status(CPUX86State *env)
2951{
2952    uint32_t mxcsr = env->mxcsr;
2953    int rnd_type;
2954
2955    /* set rounding mode */
2956    switch (mxcsr & SSE_RC_MASK) {
2957    default:
2958    case SSE_RC_NEAR:
2959        rnd_type = float_round_nearest_even;
2960        break;
2961    case SSE_RC_DOWN:
2962        rnd_type = float_round_down;
2963        break;
2964    case SSE_RC_UP:
2965        rnd_type = float_round_up;
2966        break;
2967    case SSE_RC_CHOP:
2968        rnd_type = float_round_to_zero;
2969        break;
2970    }
2971    set_float_rounding_mode(rnd_type, &env->sse_status);
2972
2973    /* Set exception flags.  */
2974    set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2975                              (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2976                              (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2977                              (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2978                              (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2979                              &env->sse_status);
2980
2981    /* set denormals are zero */
2982    set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2983
2984    /* set flush to zero */
2985    set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2986}
2987
2988void update_mxcsr_from_sse_status(CPUX86State *env)
2989{
2990    uint8_t flags = get_float_exception_flags(&env->sse_status);
2991    /*
2992     * The MXCSR denormal flag has opposite semantics to
2993     * float_flag_input_denormal (the softfloat code sets that flag
2994     * only when flushing input denormals to zero, but SSE sets it
2995     * only when not flushing them to zero), so is not converted
2996     * here.
2997     */
2998    env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
2999                   (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3000                   (flags & float_flag_overflow ? FPUS_OE : 0) |
3001                   (flags & float_flag_underflow ? FPUS_UE : 0) |
3002                   (flags & float_flag_inexact ? FPUS_PE : 0) |
3003                   (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3004                    0));
3005}
3006
/*
 * TCG helper: fold the pending SSE softfloat exception flags into
 * env->mxcsr so that a following read of MXCSR sees them.
 */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3011
/* TCG helper for LDMXCSR: install @val through cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3016
3017void helper_enter_mmx(CPUX86State *env)
3018{
3019    env->fpstt = 0;
3020    *(uint32_t *)(env->fptags) = 0;
3021    *(uint32_t *)(env->fptags + 4) = 0;
3022}
3023
3024void helper_emms(CPUX86State *env)
3025{
3026    /* set to empty state */
3027    *(uint32_t *)(env->fptags) = 0x01010101;
3028    *(uint32_t *)(env->fptags + 4) = 0x01010101;
3029}
3030
3031/* XXX: suppress */
3032void helper_movq(CPUX86State *env, void *d, void *s)
3033{
3034    *(uint64_t *)d = *(uint64_t *)s;
3035}
3036
/*
 * ops_sse.h is a template header: it is included once per SHIFT value
 * to generate two families of helpers from the same source.
 * NOTE(review): presumably SHIFT 0 selects the 64-bit MMX variants and
 * SHIFT 1 the 128-bit SSE variants, and the header undefines SHIFT at
 * its end -- confirm against ops_sse.h.
 */
#define SHIFT 0
#include "ops_sse.h"

#define SHIFT 1
#include "ops_sse.h"
3042