qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86#include <math.h>
  87#include "qemu/bitops.h"
  88#include "fpu/softfloat.h"
  89
  90/* We only need stdlib for abort() */
  91
  92/*----------------------------------------------------------------------------
  93| Primitive arithmetic functions, including multi-word arithmetic, and
  94| division and square root approximations.  (Can be specialized to target if
  95| desired.)
  96*----------------------------------------------------------------------------*/
  97#include "fpu/softfloat-macros.h"
  98
  99/*
 100 * Hardfloat
 101 *
 102 * Fast emulation of guest FP instructions is challenging for two reasons.
 103 * First, FP instruction semantics are similar but not identical, particularly
 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105 * exception flags is not trivial: reading the host's flags register with a
 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107 * and trapping on every FP exception is not fast nor pleasant to work with.
 108 *
 109 * We address these challenges by leveraging the host FPU for a subset of the
 110 * operations. To do this we expand on the idea presented in this paper:
 111 *
 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114 *
 115 * The idea is thus to leverage the host FPU to (1) compute FP operations
 116 * and (2) identify whether FP exceptions occurred while avoiding
 117 * expensive exception flag register accesses.
 118 *
 119 * An important optimization shown in the paper is that given that exception
 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121 * This is particularly useful for the inexact flag, which is very frequently
 122 * raised in floating-point workloads.
 123 *
 124 * We optimize the code further by deferring to soft-fp whenever FP exception
 125 * detection might get hairy. Two examples: (1) when at least one operand is
 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127 * and the result is < the minimum normal.
 128 */
 129#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130    static inline void name(soft_t *a, float_status *s)                 \
 131    {                                                                   \
 132        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                     soft_t ## _is_neg(*a));            \
 135            s->float_exception_flags |= float_flag_input_denormal;      \
 136        }                                                               \
 137    }
 138
 139GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141#undef GEN_INPUT_FLUSH__NOCHECK
 142
 143#define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144    static inline void name(soft_t *a, float_status *s) \
 145    {                                                   \
 146        if (likely(!s->flush_inputs_to_zero)) {         \
 147            return;                                     \
 148        }                                               \
 149        soft_t ## _input_flush__nocheck(a, s);          \
 150    }
 151
 152GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154#undef GEN_INPUT_FLUSH1
 155
 156#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158    {                                                                   \
 159        if (likely(!s->flush_inputs_to_zero)) {                         \
 160            return;                                                     \
 161        }                                                               \
 162        soft_t ## _input_flush__nocheck(a, s);                          \
 163        soft_t ## _input_flush__nocheck(b, s);                          \
 164    }
 165
 166GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168#undef GEN_INPUT_FLUSH2
 169
 170#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172    {                                                                   \
 173        if (likely(!s->flush_inputs_to_zero)) {                         \
 174            return;                                                     \
 175        }                                                               \
 176        soft_t ## _input_flush__nocheck(a, s);                          \
 177        soft_t ## _input_flush__nocheck(b, s);                          \
 178        soft_t ## _input_flush__nocheck(c, s);                          \
 179    }
 180
 181GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183#undef GEN_INPUT_FLUSH3
 184
 185/*
 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187 * hardfloat functions. Each combination of number of inputs and float size
 188 * gets its own value.
 189 */
 190#if defined(__x86_64__)
 191# define QEMU_HARDFLOAT_1F32_USE_FP 0
 192# define QEMU_HARDFLOAT_1F64_USE_FP 1
 193# define QEMU_HARDFLOAT_2F32_USE_FP 0
 194# define QEMU_HARDFLOAT_2F64_USE_FP 1
 195# define QEMU_HARDFLOAT_3F32_USE_FP 0
 196# define QEMU_HARDFLOAT_3F64_USE_FP 1
 197#else
 198# define QEMU_HARDFLOAT_1F32_USE_FP 0
 199# define QEMU_HARDFLOAT_1F64_USE_FP 0
 200# define QEMU_HARDFLOAT_2F32_USE_FP 0
 201# define QEMU_HARDFLOAT_2F64_USE_FP 0
 202# define QEMU_HARDFLOAT_3F32_USE_FP 0
 203# define QEMU_HARDFLOAT_3F64_USE_FP 0
 204#endif
 205
 206/*
 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208 * float{32,64}_is_infinity when !USE_FP.
 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211 */
 212#if defined(__x86_64__) || defined(__aarch64__)
 213# define QEMU_HARDFLOAT_USE_ISINF   1
 214#else
 215# define QEMU_HARDFLOAT_USE_ISINF   0
 216#endif
 217
 218/*
 219 * Some targets clear the FP flags before most FP operations. This prevents
 220 * the use of hardfloat, since hardfloat relies on the inexact flag being
 221 * already set.
 222 */
 223#if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224# if defined(__FAST_MATH__)
 225#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226    IEEE implementation
 227# endif
 228# define QEMU_NO_HARDFLOAT 1
 229# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230#else
 231# define QEMU_NO_HARDFLOAT 0
 232# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233#endif
 234
 235static inline bool can_use_fpu(const float_status *s)
 236{
 237    if (QEMU_NO_HARDFLOAT) {
 238        return false;
 239    }
 240    return likely(s->float_exception_flags & float_flag_inexact &&
 241                  s->float_rounding_mode == float_round_nearest_even);
 242}
 243
 244/*
 245 * Hardfloat generation functions. Each operation can have two flavors:
 246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247 * most condition checks, or native ones (e.g. fpclassify).
 248 *
 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250 * compiler to propagate constants and inline everything into the callers.
 251 *
 252 * We only generate functions for operations with two inputs, since only
 253 * these are common enough to justify consolidating them into common code.
 254 */
 255
/* A float32 viewed either as its softfloat type (s) or as a host float (h). */
typedef union {
    float32 s;
    float h;
} union_float32;

/* A float64 viewed either as its softfloat type (s) or as a host double (h). */
typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates over the two operands of a 2-input operation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/*
 * 2-input operations: the soft_* flavors work on softfloat types and take
 * the float_status; the hard_* flavors work directly on host float/double.
 */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274/* 2-input is-zero-or-normal */
 275static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276{
 277    if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278        /*
 279         * Not using a temp variable for consecutive fpclassify calls ends up
 280         * generating faster code.
 281         */
 282        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284    }
 285    return float32_is_zero_or_normal(a.s) &&
 286           float32_is_zero_or_normal(b.s);
 287}
 288
 289static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290{
 291    if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294    }
 295    return float64_is_zero_or_normal(a.s) &&
 296           float64_is_zero_or_normal(b.s);
 297}
 298
 299/* 3-input is-zero-or-normal */
 300static inline
 301bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302{
 303    if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307    }
 308    return float32_is_zero_or_normal(a.s) &&
 309           float32_is_zero_or_normal(b.s) &&
 310           float32_is_zero_or_normal(c.s);
 311}
 312
 313static inline
 314bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315{
 316    if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320    }
 321    return float64_is_zero_or_normal(a.s) &&
 322           float64_is_zero_or_normal(b.s) &&
 323           float64_is_zero_or_normal(c.s);
 324}
 325
 326static inline bool f32_is_inf(union_float32 a)
 327{
 328    if (QEMU_HARDFLOAT_USE_ISINF) {
 329        return isinf(a.h);
 330    }
 331    return float32_is_infinity(a.s);
 332}
 333
 334static inline bool f64_is_inf(union_float64 a)
 335{
 336    if (QEMU_HARDFLOAT_USE_ISINF) {
 337        return isinf(a.h);
 338    }
 339    return float64_is_infinity(a.s);
 340}
 341
 342/* Note: @fast_test and @post can be NULL */
 343static inline float32
 344float32_gen2(float32 xa, float32 xb, float_status *s,
 345             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 346             f32_check_fn pre, f32_check_fn post,
 347             f32_check_fn fast_test, soft_f32_op2_fn fast_op)
 348{
 349    union_float32 ua, ub, ur;
 350
 351    ua.s = xa;
 352    ub.s = xb;
 353
 354    if (unlikely(!can_use_fpu(s))) {
 355        goto soft;
 356    }
 357
 358    float32_input_flush2(&ua.s, &ub.s, s);
 359    if (unlikely(!pre(ua, ub))) {
 360        goto soft;
 361    }
 362    if (fast_test && fast_test(ua, ub)) {
 363        return fast_op(ua.s, ub.s, s);
 364    }
 365
 366    ur.h = hard(ua.h, ub.h);
 367    if (unlikely(f32_is_inf(ur))) {
 368        s->float_exception_flags |= float_flag_overflow;
 369    } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
 370        if (post == NULL || post(ua, ub)) {
 371            goto soft;
 372        }
 373    }
 374    return ur.s;
 375
 376 soft:
 377    return soft(ua.s, ub.s, s);
 378}
 379
 380static inline float64
 381float64_gen2(float64 xa, float64 xb, float_status *s,
 382             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 383             f64_check_fn pre, f64_check_fn post,
 384             f64_check_fn fast_test, soft_f64_op2_fn fast_op)
 385{
 386    union_float64 ua, ub, ur;
 387
 388    ua.s = xa;
 389    ub.s = xb;
 390
 391    if (unlikely(!can_use_fpu(s))) {
 392        goto soft;
 393    }
 394
 395    float64_input_flush2(&ua.s, &ub.s, s);
 396    if (unlikely(!pre(ua, ub))) {
 397        goto soft;
 398    }
 399    if (fast_test && fast_test(ua, ub)) {
 400        return fast_op(ua.s, ub.s, s);
 401    }
 402
 403    ur.h = hard(ua.h, ub.h);
 404    if (unlikely(f64_is_inf(ur))) {
 405        s->float_exception_flags |= float_flag_overflow;
 406    } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
 407        if (post == NULL || post(ua, ub)) {
 408            goto soft;
 409        }
 410    }
 411    return ur.s;
 412
 413 soft:
 414    return soft(ua.s, ub.s, s);
 415}
 416
 417/*----------------------------------------------------------------------------
 418| Returns the fraction bits of the half-precision floating-point value `a'.
 419*----------------------------------------------------------------------------*/
 420
 421static inline uint32_t extractFloat16Frac(float16 a)
 422{
 423    return float16_val(a) & 0x3ff;
 424}
 425
 426/*----------------------------------------------------------------------------
 427| Returns the exponent bits of the half-precision floating-point value `a'.
 428*----------------------------------------------------------------------------*/
 429
 430static inline int extractFloat16Exp(float16 a)
 431{
 432    return (float16_val(a) >> 10) & 0x1f;
 433}
 434
 435/*----------------------------------------------------------------------------
 436| Returns the fraction bits of the single-precision floating-point value `a'.
 437*----------------------------------------------------------------------------*/
 438
 439static inline uint32_t extractFloat32Frac(float32 a)
 440{
 441    return float32_val(a) & 0x007FFFFF;
 442}
 443
 444/*----------------------------------------------------------------------------
 445| Returns the exponent bits of the single-precision floating-point value `a'.
 446*----------------------------------------------------------------------------*/
 447
 448static inline int extractFloat32Exp(float32 a)
 449{
 450    return (float32_val(a) >> 23) & 0xFF;
 451}
 452
 453/*----------------------------------------------------------------------------
 454| Returns the sign bit of the single-precision floating-point value `a'.
 455*----------------------------------------------------------------------------*/
 456
 457static inline flag extractFloat32Sign(float32 a)
 458{
 459    return float32_val(a) >> 31;
 460}
 461
 462/*----------------------------------------------------------------------------
 463| Returns the fraction bits of the double-precision floating-point value `a'.
 464*----------------------------------------------------------------------------*/
 465
 466static inline uint64_t extractFloat64Frac(float64 a)
 467{
 468    return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 469}
 470
 471/*----------------------------------------------------------------------------
 472| Returns the exponent bits of the double-precision floating-point value `a'.
 473*----------------------------------------------------------------------------*/
 474
 475static inline int extractFloat64Exp(float64 a)
 476{
 477    return (float64_val(a) >> 52) & 0x7FF;
 478}
 479
 480/*----------------------------------------------------------------------------
 481| Returns the sign bit of the double-precision floating-point value `a'.
 482*----------------------------------------------------------------------------*/
 483
 484static inline flag extractFloat64Sign(float64 a)
 485{
 486    return float64_val(a) >> 63;
 487}
 488
/*
 * Classify a floating point number. Everything from float_class_qnan
 * upward is a NaN, so cls >= float_class_qnan matches any NaN.
 */
 493
 494typedef enum __attribute__ ((__packed__)) {
 495    float_class_unclassified,
 496    float_class_zero,
 497    float_class_normal,
 498    float_class_inf,
 499    float_class_qnan,  /* all NaNs from here */
 500    float_class_snan,
 501} FloatClass;
 502
 503/* Simple helpers for checking if, or what kind of, NaN we have */
 504static inline __attribute__((unused)) bool is_nan(FloatClass c)
 505{
 506    return unlikely(c >= float_class_qnan);
 507}
 508
 509static inline __attribute__((unused)) bool is_snan(FloatClass c)
 510{
 511    return c == float_class_snan;
 512}
 513
 514static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 515{
 516    return c == float_class_qnan;
 517}
 518
 519/*
 520 * Structure holding all of the decomposed parts of a float. The
 521 * exponent is unbiased and the fraction is normalized. All
 522 * calculations are done with a 64 bit fraction and then rounded as
 523 * appropriate for the final format.
 524 *
 525 * Thanks to the packed FloatClass a decent compiler should be able to
 526 * fit the whole structure into registers and avoid using the stack
 527 * for parameter passing.
 528 */
 529
typedef struct {
    uint64_t frac;      /* fraction; normalized once canonicalized */
    int32_t  exp;       /* exponent; unbiased once canonicalized */
    FloatClass cls;     /* float_class_unclassified until canonicalized */
    bool sign;          /* sign bit taken from the top of the encoding */
} FloatParts;
 536
 537#define DECOMPOSED_BINARY_POINT    (64 - 2)
 538#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 539#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 540
 541/* Structure holding all of the relevant parameters for a format.
 542 *   exp_size: the size of the exponent field
 543 *   exp_bias: the offset applied to the exponent field
 544 *   exp_max: the maximum normalised exponent
 545 *   frac_size: the size of the fraction field
 546 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 548 *   frac_lsb: least significant bit of fraction
 549 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 550 *   round_mask/roundeven_mask: masks used for rounding
 551 * The following optional modifiers are available:
 552 *   arm_althp: handle ARM Alternative Half Precision
 553 */
typedef struct {
    int exp_size;               /* width of the exponent field, in bits */
    int exp_bias;               /* offset applied to the exponent field */
    int exp_max;                /* all-ones value of the exponent field */
    int frac_size;              /* width of the fraction field, in bits */
    int frac_shift;             /* DECOMPOSED_BINARY_POINT - frac_size */
    uint64_t frac_lsb;          /* 1 << frac_shift */
    uint64_t frac_lsbm1;        /* the bit just below frac_lsb */
    uint64_t round_mask;        /* all bits below frac_lsb */
    uint64_t roundeven_mask;    /* round_mask plus frac_lsb itself */
    bool arm_althp;             /* ARM Alternative Half Precision */
} FloatFmt;
 566
 567/* Expand fields based on the size of exponent and fraction */
 568#define FLOAT_PARAMS(E, F)                                           \
 569    .exp_size       = E,                                             \
 570    .exp_bias       = ((1 << E) - 1) >> 1,                           \
 571    .exp_max        = (1 << E) - 1,                                  \
 572    .frac_size      = F,                                             \
 573    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 574    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 575    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 576    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 577    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 578
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field widths as float16_params, with the arm_althp modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
 595
 596/* Unpack a float to parts, but do not canonicalize.  */
 597static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 598{
 599    const int sign_pos = fmt.frac_size + fmt.exp_size;
 600
 601    return (FloatParts) {
 602        .cls = float_class_unclassified,
 603        .sign = extract64(raw, sign_pos, 1),
 604        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 605        .frac = extract64(raw, 0, fmt.frac_size),
 606    };
 607}
 608
 609static inline FloatParts float16_unpack_raw(float16 f)
 610{
 611    return unpack_raw(float16_params, f);
 612}
 613
 614static inline FloatParts float32_unpack_raw(float32 f)
 615{
 616    return unpack_raw(float32_params, f);
 617}
 618
 619static inline FloatParts float64_unpack_raw(float64 f)
 620{
 621    return unpack_raw(float64_params, f);
 622}
 623
 624/* Pack a float from parts, but do not canonicalize.  */
 625static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 626{
 627    const int sign_pos = fmt.frac_size + fmt.exp_size;
 628    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 629    return deposit64(ret, sign_pos, 1, p.sign);
 630}
 631
 632static inline float16 float16_pack_raw(FloatParts p)
 633{
 634    return make_float16(pack_raw(float16_params, p));
 635}
 636
 637static inline float32 float32_pack_raw(FloatParts p)
 638{
 639    return make_float32(pack_raw(float32_params, p));
 640}
 641
 642static inline float64 float64_pack_raw(FloatParts p)
 643{
 644    return make_float64(pack_raw(float64_params, p));
 645}
 646
 647/*----------------------------------------------------------------------------
 648| Functions and definitions to determine:  (1) whether tininess for underflow
 649| is detected before or after rounding by default, (2) what (if anything)
 650| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 651| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 652| are propagated from function inputs to output.  These details are target-
 653| specific.
 654*----------------------------------------------------------------------------*/
 655#include "softfloat-specialize.h"
 656
 657/* Canonicalize EXP and FRAC, setting CLS.  */
 658static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 659                                  float_status *status)
 660{
 661    if (part.exp == parm->exp_max && !parm->arm_althp) {
 662        if (part.frac == 0) {
 663            part.cls = float_class_inf;
 664        } else {
 665            part.frac <<= parm->frac_shift;
 666            part.cls = (parts_is_snan_frac(part.frac, status)
 667                        ? float_class_snan : float_class_qnan);
 668        }
 669    } else if (part.exp == 0) {
 670        if (likely(part.frac == 0)) {
 671            part.cls = float_class_zero;
 672        } else if (status->flush_inputs_to_zero) {
 673            float_raise(float_flag_input_denormal, status);
 674            part.cls = float_class_zero;
 675            part.frac = 0;
 676        } else {
 677            int shift = clz64(part.frac) - 1;
 678            part.cls = float_class_normal;
 679            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 680            part.frac <<= shift;
 681        }
 682    } else {
 683        part.cls = float_class_normal;
 684        part.exp -= parm->exp_bias;
 685        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 686    }
 687    return part;
 688}
 689
 690/* Round and uncanonicalize a floating-point number by parts. There
 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 692 * fraction; these bits will be removed. The exponent will be biased
 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 694 */
 695
 696static FloatParts round_canonical(FloatParts p, float_status *s,
 697                                  const FloatFmt *parm)
 698{
 699    const uint64_t frac_lsb = parm->frac_lsb;
 700    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 701    const uint64_t round_mask = parm->round_mask;
 702    const uint64_t roundeven_mask = parm->roundeven_mask;
 703    const int exp_max = parm->exp_max;
 704    const int frac_shift = parm->frac_shift;
 705    uint64_t frac, inc;
 706    int exp, flags = 0;
 707    bool overflow_norm;
 708
 709    frac = p.frac;
 710    exp = p.exp;
 711
 712    switch (p.cls) {
 713    case float_class_normal:
 714        switch (s->float_rounding_mode) {
 715        case float_round_nearest_even:
 716            overflow_norm = false;
 717            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 718            break;
 719        case float_round_ties_away:
 720            overflow_norm = false;
 721            inc = frac_lsbm1;
 722            break;
 723        case float_round_to_zero:
 724            overflow_norm = true;
 725            inc = 0;
 726            break;
 727        case float_round_up:
 728            inc = p.sign ? 0 : round_mask;
 729            overflow_norm = p.sign;
 730            break;
 731        case float_round_down:
 732            inc = p.sign ? round_mask : 0;
 733            overflow_norm = !p.sign;
 734            break;
 735        case float_round_to_odd:
 736            overflow_norm = true;
 737            inc = frac & frac_lsb ? 0 : round_mask;
 738            break;
 739        default:
 740            g_assert_not_reached();
 741        }
 742
 743        exp += parm->exp_bias;
 744        if (likely(exp > 0)) {
 745            if (frac & round_mask) {
 746                flags |= float_flag_inexact;
 747                frac += inc;
 748                if (frac & DECOMPOSED_OVERFLOW_BIT) {
 749                    frac >>= 1;
 750                    exp++;
 751                }
 752            }
 753            frac >>= frac_shift;
 754
 755            if (parm->arm_althp) {
 756                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 757                if (unlikely(exp > exp_max)) {
 758                    /* Overflow.  Return the maximum normal.  */
 759                    flags = float_flag_invalid;
 760                    exp = exp_max;
 761                    frac = -1;
 762                }
 763            } else if (unlikely(exp >= exp_max)) {
 764                flags |= float_flag_overflow | float_flag_inexact;
 765                if (overflow_norm) {
 766                    exp = exp_max - 1;
 767                    frac = -1;
 768                } else {
 769                    p.cls = float_class_inf;
 770                    goto do_inf;
 771                }
 772            }
 773        } else if (s->flush_to_zero) {
 774            flags |= float_flag_output_denormal;
 775            p.cls = float_class_zero;
 776            goto do_zero;
 777        } else {
 778            bool is_tiny = (s->float_detect_tininess
 779                            == float_tininess_before_rounding)
 780                        || (exp < 0)
 781                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 782
 783            shift64RightJamming(frac, 1 - exp, &frac);
 784            if (frac & round_mask) {
 785                /* Need to recompute round-to-even.  */
 786                switch (s->float_rounding_mode) {
 787                case float_round_nearest_even:
 788                    inc = ((frac & roundeven_mask) != frac_lsbm1
 789                           ? frac_lsbm1 : 0);
 790                    break;
 791                case float_round_to_odd:
 792                    inc = frac & frac_lsb ? 0 : round_mask;
 793                    break;
 794                }
 795                flags |= float_flag_inexact;
 796                frac += inc;
 797            }
 798
 799            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 800            frac >>= frac_shift;
 801
 802            if (is_tiny && (flags & float_flag_inexact)) {
 803                flags |= float_flag_underflow;
 804            }
 805            if (exp == 0 && frac == 0) {
 806                p.cls = float_class_zero;
 807            }
 808        }
 809        break;
 810
 811    case float_class_zero:
 812    do_zero:
 813        exp = 0;
 814        frac = 0;
 815        break;
 816
 817    case float_class_inf:
 818    do_inf:
 819        assert(!parm->arm_althp);
 820        exp = exp_max;
 821        frac = 0;
 822        break;
 823
 824    case float_class_qnan:
 825    case float_class_snan:
 826        assert(!parm->arm_althp);
 827        exp = exp_max;
 828        frac >>= parm->frac_shift;
 829        break;
 830
 831    default:
 832        g_assert_not_reached();
 833    }
 834
 835    float_raise(flags, s);
 836    p.exp = exp;
 837    p.frac = frac;
 838    return p;
 839}
 840
 841/* Explicit FloatFmt version */
 842static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 843                                            const FloatFmt *params)
 844{
 845    return sf_canonicalize(float16_unpack_raw(f), params, s);
 846}
 847
 848static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 849{
 850    return float16a_unpack_canonical(f, s, &float16_params);
 851}
 852
 853static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 854                                             const FloatFmt *params)
 855{
 856    return float16_pack_raw(round_canonical(p, s, params));
 857}
 858
 859static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 860{
 861    return float16a_round_pack_canonical(p, s, &float16_params);
 862}
 863
 864static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 865{
 866    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 867}
 868
 869static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 870{
 871    return float32_pack_raw(round_canonical(p, s, &float32_params));
 872}
 873
 874static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 875{
 876    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 877}
 878
 879static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 880{
 881    return float64_pack_raw(round_canonical(p, s, &float64_params));
 882}
 883
 884static FloatParts return_nan(FloatParts a, float_status *s)
 885{
 886    switch (a.cls) {
 887    case float_class_snan:
 888        s->float_exception_flags |= float_flag_invalid;
 889        a = parts_silence_nan(a, s);
 890        /* fall through */
 891    case float_class_qnan:
 892        if (s->default_nan_mode) {
 893            return parts_default_nan(s);
 894        }
 895        break;
 896
 897    default:
 898        g_assert_not_reached();
 899    }
 900    return a;
 901}
 902
 903static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 904{
 905    if (is_snan(a.cls) || is_snan(b.cls)) {
 906        s->float_exception_flags |= float_flag_invalid;
 907    }
 908
 909    if (s->default_nan_mode) {
 910        return parts_default_nan(s);
 911    } else {
 912        if (pickNaN(a.cls, b.cls,
 913                    a.frac > b.frac ||
 914                    (a.frac == b.frac && a.sign < b.sign))) {
 915            a = b;
 916        }
 917        if (is_snan(a.cls)) {
 918            return parts_silence_nan(a, s);
 919        }
 920    }
 921    return a;
 922}
 923
 924static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 925                                  bool inf_zero, float_status *s)
 926{
 927    int which;
 928
 929    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 930        s->float_exception_flags |= float_flag_invalid;
 931    }
 932
 933    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 934
 935    if (s->default_nan_mode) {
 936        /* Note that this check is after pickNaNMulAdd so that function
 937         * has an opportunity to set the Invalid flag.
 938         */
 939        which = 3;
 940    }
 941
 942    switch (which) {
 943    case 0:
 944        break;
 945    case 1:
 946        a = b;
 947        break;
 948    case 2:
 949        a = c;
 950        break;
 951    case 3:
 952        return parts_default_nan(s);
 953    default:
 954        g_assert_not_reached();
 955    }
 956
 957    if (is_snan(a.cls)) {
 958        return parts_silence_nan(a, s);
 959    }
 960    return a;
 961}
 962
 963/*
 964 * Returns the result of adding or subtracting the values of the
 965 * floating-point values `a' and `b'. The operation is performed
 966 * according to the IEC/IEEE Standard for Binary Floating-Point
 967 * Arithmetic.
 968 */
 969
 970static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 971                                float_status *s)
 972{
 973    bool a_sign = a.sign;
 974    bool b_sign = b.sign ^ subtract;
 975
 976    if (a_sign != b_sign) {
 977        /* Subtraction */
 978
 979        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 980            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 981                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 982                a.frac = a.frac - b.frac;
 983            } else {
 984                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 985                a.frac = b.frac - a.frac;
 986                a.exp = b.exp;
 987                a_sign ^= 1;
 988            }
 989
 990            if (a.frac == 0) {
 991                a.cls = float_class_zero;
 992                a.sign = s->float_rounding_mode == float_round_down;
 993            } else {
 994                int shift = clz64(a.frac) - 1;
 995                a.frac = a.frac << shift;
 996                a.exp = a.exp - shift;
 997                a.sign = a_sign;
 998            }
 999            return a;
1000        }
1001        if (is_nan(a.cls) || is_nan(b.cls)) {
1002            return pick_nan(a, b, s);
1003        }
1004        if (a.cls == float_class_inf) {
1005            if (b.cls == float_class_inf) {
1006                float_raise(float_flag_invalid, s);
1007                return parts_default_nan(s);
1008            }
1009            return a;
1010        }
1011        if (a.cls == float_class_zero && b.cls == float_class_zero) {
1012            a.sign = s->float_rounding_mode == float_round_down;
1013            return a;
1014        }
1015        if (a.cls == float_class_zero || b.cls == float_class_inf) {
1016            b.sign = a_sign ^ 1;
1017            return b;
1018        }
1019        if (b.cls == float_class_zero) {
1020            return a;
1021        }
1022    } else {
1023        /* Addition */
1024        if (a.cls == float_class_normal && b.cls == float_class_normal) {
1025            if (a.exp > b.exp) {
1026                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1027            } else if (a.exp < b.exp) {
1028                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1029                a.exp = b.exp;
1030            }
1031            a.frac += b.frac;
1032            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1033                shift64RightJamming(a.frac, 1, &a.frac);
1034                a.exp += 1;
1035            }
1036            return a;
1037        }
1038        if (is_nan(a.cls) || is_nan(b.cls)) {
1039            return pick_nan(a, b, s);
1040        }
1041        if (a.cls == float_class_inf || b.cls == float_class_zero) {
1042            return a;
1043        }
1044        if (b.cls == float_class_inf || a.cls == float_class_zero) {
1045            b.sign = b_sign;
1046            return b;
1047        }
1048    }
1049    g_assert_not_reached();
1050}
1051
1052/*
1053 * Returns the result of adding or subtracting the floating-point
1054 * values `a' and `b'. The operation is performed according to the
1055 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1056 */
1057
1058float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1059{
1060    FloatParts pa = float16_unpack_canonical(a, status);
1061    FloatParts pb = float16_unpack_canonical(b, status);
1062    FloatParts pr = addsub_floats(pa, pb, false, status);
1063
1064    return float16_round_pack_canonical(pr, status);
1065}
1066
1067float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1068{
1069    FloatParts pa = float16_unpack_canonical(a, status);
1070    FloatParts pb = float16_unpack_canonical(b, status);
1071    FloatParts pr = addsub_floats(pa, pb, true, status);
1072
1073    return float16_round_pack_canonical(pr, status);
1074}
1075
1076static float32 QEMU_SOFTFLOAT_ATTR
1077soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1078{
1079    FloatParts pa = float32_unpack_canonical(a, status);
1080    FloatParts pb = float32_unpack_canonical(b, status);
1081    FloatParts pr = addsub_floats(pa, pb, subtract, status);
1082
1083    return float32_round_pack_canonical(pr, status);
1084}
1085
1086static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1087{
1088    return soft_f32_addsub(a, b, false, status);
1089}
1090
1091static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1092{
1093    return soft_f32_addsub(a, b, true, status);
1094}
1095
1096static float64 QEMU_SOFTFLOAT_ATTR
1097soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1098{
1099    FloatParts pa = float64_unpack_canonical(a, status);
1100    FloatParts pb = float64_unpack_canonical(b, status);
1101    FloatParts pr = addsub_floats(pa, pb, subtract, status);
1102
1103    return float64_round_pack_canonical(pr, status);
1104}
1105
1106static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1107{
1108    return soft_f64_addsub(a, b, false, status);
1109}
1110
1111static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1112{
1113    return soft_f64_addsub(a, b, true, status);
1114}
1115
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1120
/* Host-FPU single-precision subtraction (a - b). */
static float hard_f32_sub(float a, float b)
{
    const float difference = a - b;

    return difference;
}
1125
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1130
/* Host-FPU double-precision subtraction (a - b). */
static double hard_f64_sub(double a, double b)
{
    const double difference = a - b;

    return difference;
}
1135
1136static bool f32_addsub_post(union_float32 a, union_float32 b)
1137{
1138    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1139        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1140    }
1141    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1142}
1143
1144static bool f64_addsub_post(union_float64 a, union_float64 b)
1145{
1146    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1147        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1148    } else {
1149        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1150    }
1151}
1152
1153static float32 float32_addsub(float32 a, float32 b, float_status *s,
1154                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1155{
1156    return float32_gen2(a, b, s, hard, soft,
1157                        f32_is_zon2, f32_addsub_post, NULL, NULL);
1158}
1159
1160static float64 float64_addsub(float64 a, float64 b, float_status *s,
1161                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1162{
1163    return float64_gen2(a, b, s, hard, soft,
1164                        f64_is_zon2, f64_addsub_post, NULL, NULL);
1165}
1166
1167float32 QEMU_FLATTEN
1168float32_add(float32 a, float32 b, float_status *s)
1169{
1170    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1171}
1172
1173float32 QEMU_FLATTEN
1174float32_sub(float32 a, float32 b, float_status *s)
1175{
1176    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1177}
1178
1179float64 QEMU_FLATTEN
1180float64_add(float64 a, float64 b, float_status *s)
1181{
1182    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1183}
1184
1185float64 QEMU_FLATTEN
1186float64_sub(float64 a, float64 b, float_status *s)
1187{
1188    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1189}
1190
1191/*
1192 * Returns the result of multiplying the floating-point values `a' and
1193 * `b'. The operation is performed according to the IEC/IEEE Standard
1194 * for Binary Floating-Point Arithmetic.
1195 */
1196
1197static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1198{
1199    bool sign = a.sign ^ b.sign;
1200
1201    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1202        uint64_t hi, lo;
1203        int exp = a.exp + b.exp;
1204
1205        mul64To128(a.frac, b.frac, &hi, &lo);
1206        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1207        if (lo & DECOMPOSED_OVERFLOW_BIT) {
1208            shift64RightJamming(lo, 1, &lo);
1209            exp += 1;
1210        }
1211
1212        /* Re-use a */
1213        a.exp = exp;
1214        a.sign = sign;
1215        a.frac = lo;
1216        return a;
1217    }
1218    /* handle all the NaN cases */
1219    if (is_nan(a.cls) || is_nan(b.cls)) {
1220        return pick_nan(a, b, s);
1221    }
1222    /* Inf * Zero == NaN */
1223    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1224        (a.cls == float_class_zero && b.cls == float_class_inf)) {
1225        s->float_exception_flags |= float_flag_invalid;
1226        return parts_default_nan(s);
1227    }
1228    /* Multiply by 0 or Inf */
1229    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1230        a.sign = sign;
1231        return a;
1232    }
1233    if (b.cls == float_class_inf || b.cls == float_class_zero) {
1234        b.sign = sign;
1235        return b;
1236    }
1237    g_assert_not_reached();
1238}
1239
1240float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1241{
1242    FloatParts pa = float16_unpack_canonical(a, status);
1243    FloatParts pb = float16_unpack_canonical(b, status);
1244    FloatParts pr = mul_floats(pa, pb, status);
1245
1246    return float16_round_pack_canonical(pr, status);
1247}
1248
1249static float32 QEMU_SOFTFLOAT_ATTR
1250soft_f32_mul(float32 a, float32 b, float_status *status)
1251{
1252    FloatParts pa = float32_unpack_canonical(a, status);
1253    FloatParts pb = float32_unpack_canonical(b, status);
1254    FloatParts pr = mul_floats(pa, pb, status);
1255
1256    return float32_round_pack_canonical(pr, status);
1257}
1258
1259static float64 QEMU_SOFTFLOAT_ATTR
1260soft_f64_mul(float64 a, float64 b, float_status *status)
1261{
1262    FloatParts pa = float64_unpack_canonical(a, status);
1263    FloatParts pb = float64_unpack_canonical(b, status);
1264    FloatParts pr = mul_floats(pa, pb, status);
1265
1266    return float64_round_pack_canonical(pr, status);
1267}
1268
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1273
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1278
1279static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1280{
1281    return float32_is_zero(a.s) || float32_is_zero(b.s);
1282}
1283
1284static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1285{
1286    return float64_is_zero(a.s) || float64_is_zero(b.s);
1287}
1288
1289static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1290{
1291    bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1292
1293    return float32_set_sign(float32_zero, signbit);
1294}
1295
1296static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1297{
1298    bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1299
1300    return float64_set_sign(float64_zero, signbit);
1301}
1302
1303float32 QEMU_FLATTEN
1304float32_mul(float32 a, float32 b, float_status *s)
1305{
1306    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1307                        f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1308}
1309
1310float64 QEMU_FLATTEN
1311float64_mul(float64 a, float64 b, float_status *s)
1312{
1313    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1314                        f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1315}
1316
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * The float_muladd_halve_result flag additionally decrements the
 * exponent of a normal result by one.  The result is returned as
 * unrounded canonical parts; rounding is left to the caller.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when {a, b} is exactly one infinity and one zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with a non-NaN addend is always invalid. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product; Inf * 0 has already been excluded above. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: invalid only against an opposite-signed infinite product. */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    /* Infinite product with a finite addend. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is the addend, apart from sign and halving. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* Zeros of opposite sign sum to -0 only when rounding down. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /*
                 * The addend's exponent is at least the product's: bring
                 * the product down to c's scale and binary point in one
                 * shift, then add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* Renormalize if the sum carried into the overflow bit. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= addend: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product; the result takes c's sign. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: -0 only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the full 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of lo into a sticky lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1519
1520float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521                                                int flags, float_status *status)
1522{
1523    FloatParts pa = float16_unpack_canonical(a, status);
1524    FloatParts pb = float16_unpack_canonical(b, status);
1525    FloatParts pc = float16_unpack_canonical(c, status);
1526    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1527
1528    return float16_round_pack_canonical(pr, status);
1529}
1530
1531static float32 QEMU_SOFTFLOAT_ATTR
1532soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533                float_status *status)
1534{
1535    FloatParts pa = float32_unpack_canonical(a, status);
1536    FloatParts pb = float32_unpack_canonical(b, status);
1537    FloatParts pc = float32_unpack_canonical(c, status);
1538    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539
1540    return float32_round_pack_canonical(pr, status);
1541}
1542
1543static float64 QEMU_SOFTFLOAT_ATTR
1544soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545                float_status *status)
1546{
1547    FloatParts pa = float64_unpack_canonical(a, status);
1548    FloatParts pb = float64_unpack_canonical(b, status);
1549    FloatParts pc = float64_unpack_canonical(c, status);
1550    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1551
1552    return float64_round_pack_canonical(pr, status);
1553}
1554
/*
 * When true, float32_muladd() and float64_muladd() skip the host fma
 * fast path and always use the software implementation.
 * NOTE(review): the code that sets this flag is outside this section.
 */
static bool force_soft_fma;
1556
1557float32 QEMU_FLATTEN
1558float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1559{
1560    union_float32 ua, ub, uc, ur;
1561
1562    ua.s = xa;
1563    ub.s = xb;
1564    uc.s = xc;
1565
1566    if (unlikely(!can_use_fpu(s))) {
1567        goto soft;
1568    }
1569    if (unlikely(flags & float_muladd_halve_result)) {
1570        goto soft;
1571    }
1572
1573    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1574    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1575        goto soft;
1576    }
1577
1578    if (unlikely(force_soft_fma)) {
1579        goto soft;
1580    }
1581
1582    /*
1583     * When (a || b) == 0, there's no need to check for under/over flow,
1584     * since we know the addend is (normal || 0) and the product is 0.
1585     */
1586    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1587        union_float32 up;
1588        bool prod_sign;
1589
1590        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1591        prod_sign ^= !!(flags & float_muladd_negate_product);
1592        up.s = float32_set_sign(float32_zero, prod_sign);
1593
1594        if (flags & float_muladd_negate_c) {
1595            uc.h = -uc.h;
1596        }
1597        ur.h = up.h + uc.h;
1598    } else {
1599        union_float32 ua_orig = ua;
1600        union_float32 uc_orig = uc;
1601
1602        if (flags & float_muladd_negate_product) {
1603            ua.h = -ua.h;
1604        }
1605        if (flags & float_muladd_negate_c) {
1606            uc.h = -uc.h;
1607        }
1608
1609        ur.h = fmaf(ua.h, ub.h, uc.h);
1610
1611        if (unlikely(f32_is_inf(ur))) {
1612            s->float_exception_flags |= float_flag_overflow;
1613        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1614            ua = ua_orig;
1615            uc = uc_orig;
1616            goto soft;
1617        }
1618    }
1619    if (flags & float_muladd_negate_result) {
1620        return float32_chs(ur.s);
1621    }
1622    return ur.s;
1623
1624 soft:
1625    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1626}
1627
1628float64 QEMU_FLATTEN
1629float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1630{
1631    union_float64 ua, ub, uc, ur;
1632
1633    ua.s = xa;
1634    ub.s = xb;
1635    uc.s = xc;
1636
1637    if (unlikely(!can_use_fpu(s))) {
1638        goto soft;
1639    }
1640    if (unlikely(flags & float_muladd_halve_result)) {
1641        goto soft;
1642    }
1643
1644    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1645    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1646        goto soft;
1647    }
1648
1649    if (unlikely(force_soft_fma)) {
1650        goto soft;
1651    }
1652
1653    /*
1654     * When (a || b) == 0, there's no need to check for under/over flow,
1655     * since we know the addend is (normal || 0) and the product is 0.
1656     */
1657    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1658        union_float64 up;
1659        bool prod_sign;
1660
1661        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1662        prod_sign ^= !!(flags & float_muladd_negate_product);
1663        up.s = float64_set_sign(float64_zero, prod_sign);
1664
1665        if (flags & float_muladd_negate_c) {
1666            uc.h = -uc.h;
1667        }
1668        ur.h = up.h + uc.h;
1669    } else {
1670        union_float64 ua_orig = ua;
1671        union_float64 uc_orig = uc;
1672
1673        if (flags & float_muladd_negate_product) {
1674            ua.h = -ua.h;
1675        }
1676        if (flags & float_muladd_negate_c) {
1677            uc.h = -uc.h;
1678        }
1679
1680        ur.h = fma(ua.h, ub.h, uc.h);
1681
1682        if (unlikely(f64_is_inf(ur))) {
1683            s->float_exception_flags |= float_flag_overflow;
1684        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1685            ua = ua_orig;
1686            uc = uc_orig;
1687            goto soft;
1688        }
1689    }
1690    if (flags & float_muladd_negate_result) {
1691        return float64_chs(ur.s);
1692    }
1693    return ur.s;
1694
1695 soft:
1696    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1697}
1698
1699/*
1700 * Returns the result of dividing the floating-point value `a' by the
1701 * corresponding value `b'. The operation is performed according to
1702 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1703 */
1704
1705static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1706{
1707    bool sign = a.sign ^ b.sign;
1708
1709    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1710        uint64_t n0, n1, q, r;
1711        int exp = a.exp - b.exp;
1712
1713        /*
1714         * We want a 2*N / N-bit division to produce exactly an N-bit
1715         * result, so that we do not lose any precision and so that we
1716         * do not have to renormalize afterward.  If A.frac < B.frac,
1717         * then division would produce an (N-1)-bit result; shift A left
1718         * by one to produce the an N-bit result, and decrement the
1719         * exponent to match.
1720         *
1721         * The udiv_qrnnd algorithm that we're using requires normalization,
1722         * i.e. the msb of the denominator must be set.  Since we know that
1723         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1724         * by one (more), and the remainder must be shifted right by one.
1725         */
1726        if (a.frac < b.frac) {
1727            exp -= 1;
1728            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1729        } else {
1730            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1731        }
1732        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1733
1734        /*
1735         * Set lsb if there is a remainder, to set inexact.
1736         * As mentioned above, to find the actual value of the remainder we
1737         * would need to shift right, but (1) we are only concerned about
1738         * non-zero-ness, and (2) the remainder will always be even because
1739         * both inputs to the division primitive are even.
1740         */
1741        a.frac = q | (r != 0);
1742        a.sign = sign;
1743        a.exp = exp;
1744        return a;
1745    }
1746    /* handle all the NaN cases */
1747    if (is_nan(a.cls) || is_nan(b.cls)) {
1748        return pick_nan(a, b, s);
1749    }
1750    /* 0/0 or Inf/Inf */
1751    if (a.cls == b.cls
1752        &&
1753        (a.cls == float_class_inf || a.cls == float_class_zero)) {
1754        s->float_exception_flags |= float_flag_invalid;
1755        return parts_default_nan(s);
1756    }
1757    /* Inf / x or 0 / x */
1758    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1759        a.sign = sign;
1760        return a;
1761    }
1762    /* Div 0 => Inf */
1763    if (b.cls == float_class_zero) {
1764        s->float_exception_flags |= float_flag_divbyzero;
1765        a.cls = float_class_inf;
1766        a.sign = sign;
1767        return a;
1768    }
1769    /* Div by Inf */
1770    if (b.cls == float_class_inf) {
1771        a.cls = float_class_zero;
1772        a.sign = sign;
1773        return a;
1774    }
1775    g_assert_not_reached();
1776}
1777
1778float16 float16_div(float16 a, float16 b, float_status *status)
1779{
1780    FloatParts pa = float16_unpack_canonical(a, status);
1781    FloatParts pb = float16_unpack_canonical(b, status);
1782    FloatParts pr = div_floats(pa, pb, status);
1783
1784    return float16_round_pack_canonical(pr, status);
1785}
1786
1787static float32 QEMU_SOFTFLOAT_ATTR
1788soft_f32_div(float32 a, float32 b, float_status *status)
1789{
1790    FloatParts pa = float32_unpack_canonical(a, status);
1791    FloatParts pb = float32_unpack_canonical(b, status);
1792    FloatParts pr = div_floats(pa, pb, status);
1793
1794    return float32_round_pack_canonical(pr, status);
1795}
1796
1797static float64 QEMU_SOFTFLOAT_ATTR
1798soft_f64_div(float64 a, float64 b, float_status *status)
1799{
1800    FloatParts pa = float64_unpack_canonical(a, status);
1801    FloatParts pb = float64_unpack_canonical(b, status);
1802    FloatParts pr = div_floats(pa, pb, status);
1803
1804    return float64_round_pack_canonical(pr, status);
1805}
1806
/* Single-precision division performed with the host's native `/'. */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1811
/* Double-precision division performed with the host's native `/'. */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1816
1817static bool f32_div_pre(union_float32 a, union_float32 b)
1818{
1819    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1821               fpclassify(b.h) == FP_NORMAL;
1822    }
1823    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1824}
1825
1826static bool f64_div_pre(union_float64 a, union_float64 b)
1827{
1828    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1829        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1830               fpclassify(b.h) == FP_NORMAL;
1831    }
1832    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1833}
1834
1835static bool f32_div_post(union_float32 a, union_float32 b)
1836{
1837    if (QEMU_HARDFLOAT_2F32_USE_FP) {
1838        return fpclassify(a.h) != FP_ZERO;
1839    }
1840    return !float32_is_zero(a.s);
1841}
1842
1843static bool f64_div_post(union_float64 a, union_float64 b)
1844{
1845    if (QEMU_HARDFLOAT_2F64_USE_FP) {
1846        return fpclassify(a.h) != FP_ZERO;
1847    }
1848    return !float64_is_zero(a.s);
1849}
1850
1851float32 QEMU_FLATTEN
1852float32_div(float32 a, float32 b, float_status *s)
1853{
1854    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1855                        f32_div_pre, f32_div_post, NULL, NULL);
1856}
1857
1858float64 QEMU_FLATTEN
1859float64_div(float64 a, float64 b, float_status *s)
1860{
1861    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1862                        f64_div_pre, f64_div_post, NULL, NULL);
1863}
1864
1865/*
1866 * Float to Float conversions
1867 *
1868 * Returns the result of converting one float format to another. The
1869 * conversion is performed according to the IEC/IEEE Standard for
1870 * Binary Floating-Point Arithmetic.
1871 *
1872 * The float_to_float helper only needs to take care of raising
1873 * invalid exceptions and handling the conversion on NaNs.
1874 */
1875
1876static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1877                                 float_status *s)
1878{
1879    if (dstf->arm_althp) {
1880        switch (a.cls) {
1881        case float_class_qnan:
1882        case float_class_snan:
1883            /* There is no NaN in the destination format.  Raise Invalid
1884             * and return a zero with the sign of the input NaN.
1885             */
1886            s->float_exception_flags |= float_flag_invalid;
1887            a.cls = float_class_zero;
1888            a.frac = 0;
1889            a.exp = 0;
1890            break;
1891
1892        case float_class_inf:
1893            /* There is no Inf in the destination format.  Raise Invalid
1894             * and return the maximum normal with the correct sign.
1895             */
1896            s->float_exception_flags |= float_flag_invalid;
1897            a.cls = float_class_normal;
1898            a.exp = dstf->exp_max;
1899            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1900            break;
1901
1902        default:
1903            break;
1904        }
1905    } else if (is_nan(a.cls)) {
1906        if (is_snan(a.cls)) {
1907            s->float_exception_flags |= float_flag_invalid;
1908            a = parts_silence_nan(a, s);
1909        }
1910        if (s->default_nan_mode) {
1911            return parts_default_nan(s);
1912        }
1913    }
1914    return a;
1915}
1916
1917float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1918{
1919    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1920    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1921    FloatParts pr = float_to_float(p, &float32_params, s);
1922    return float32_round_pack_canonical(pr, s);
1923}
1924
1925float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1926{
1927    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1928    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1929    FloatParts pr = float_to_float(p, &float64_params, s);
1930    return float64_round_pack_canonical(pr, s);
1931}
1932
1933float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1934{
1935    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1936    FloatParts p = float32_unpack_canonical(a, s);
1937    FloatParts pr = float_to_float(p, fmt16, s);
1938    return float16a_round_pack_canonical(pr, s, fmt16);
1939}
1940
1941float64 float32_to_float64(float32 a, float_status *s)
1942{
1943    FloatParts p = float32_unpack_canonical(a, s);
1944    FloatParts pr = float_to_float(p, &float64_params, s);
1945    return float64_round_pack_canonical(pr, s);
1946}
1947
1948float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1949{
1950    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1951    FloatParts p = float64_unpack_canonical(a, s);
1952    FloatParts pr = float_to_float(p, fmt16, s);
1953    return float16a_round_pack_canonical(pr, s, fmt16);
1954}
1955
1956float32 float64_to_float32(float64 a, float_status *s)
1957{
1958    FloatParts p = float64_unpack_canonical(a, s);
1959    FloatParts pr = float_to_float(p, &float32_params, s);
1960    return float32_round_pack_canonical(pr, s);
1961}
1962
1963/*
1964 * Rounds the floating-point value `a' to an integer, and returns the
1965 * result as a floating-point value. The operation is performed
1966 * according to the IEC/IEEE Standard for Binary Floating-Point
1967 * Arithmetic.
1968 */
1969
1970static FloatParts round_to_int(FloatParts a, int rmode,
1971                               int scale, float_status *s)
1972{
1973    switch (a.cls) {
1974    case float_class_qnan:
1975    case float_class_snan:
1976        return return_nan(a, s);
1977
1978    case float_class_zero:
1979    case float_class_inf:
1980        /* already "integral" */
1981        break;
1982
1983    case float_class_normal:
1984        scale = MIN(MAX(scale, -0x10000), 0x10000);
1985        a.exp += scale;
1986
1987        if (a.exp >= DECOMPOSED_BINARY_POINT) {
1988            /* already integral */
1989            break;
1990        }
1991        if (a.exp < 0) {
1992            bool one;
1993            /* all fractional */
1994            s->float_exception_flags |= float_flag_inexact;
1995            switch (rmode) {
1996            case float_round_nearest_even:
1997                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1998                break;
1999            case float_round_ties_away:
2000                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2001                break;
2002            case float_round_to_zero:
2003                one = false;
2004                break;
2005            case float_round_up:
2006                one = !a.sign;
2007                break;
2008            case float_round_down:
2009                one = a.sign;
2010                break;
2011            case float_round_to_odd:
2012                one = true;
2013                break;
2014            default:
2015                g_assert_not_reached();
2016            }
2017
2018            if (one) {
2019                a.frac = DECOMPOSED_IMPLICIT_BIT;
2020                a.exp = 0;
2021            } else {
2022                a.cls = float_class_zero;
2023            }
2024        } else {
2025            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2026            uint64_t frac_lsbm1 = frac_lsb >> 1;
2027            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2028            uint64_t rnd_mask = rnd_even_mask >> 1;
2029            uint64_t inc;
2030
2031            switch (rmode) {
2032            case float_round_nearest_even:
2033                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2034                break;
2035            case float_round_ties_away:
2036                inc = frac_lsbm1;
2037                break;
2038            case float_round_to_zero:
2039                inc = 0;
2040                break;
2041            case float_round_up:
2042                inc = a.sign ? 0 : rnd_mask;
2043                break;
2044            case float_round_down:
2045                inc = a.sign ? rnd_mask : 0;
2046                break;
2047            case float_round_to_odd:
2048                inc = a.frac & frac_lsb ? 0 : rnd_mask;
2049                break;
2050            default:
2051                g_assert_not_reached();
2052            }
2053
2054            if (a.frac & rnd_mask) {
2055                s->float_exception_flags |= float_flag_inexact;
2056                a.frac += inc;
2057                a.frac &= ~rnd_mask;
2058                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2059                    a.frac >>= 1;
2060                    a.exp++;
2061                }
2062            }
2063        }
2064        break;
2065    default:
2066        g_assert_not_reached();
2067    }
2068    return a;
2069}
2070
2071float16 float16_round_to_int(float16 a, float_status *s)
2072{
2073    FloatParts pa = float16_unpack_canonical(a, s);
2074    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2075    return float16_round_pack_canonical(pr, s);
2076}
2077
2078float32 float32_round_to_int(float32 a, float_status *s)
2079{
2080    FloatParts pa = float32_unpack_canonical(a, s);
2081    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2082    return float32_round_pack_canonical(pr, s);
2083}
2084
2085float64 float64_round_to_int(float64 a, float_status *s)
2086{
2087    FloatParts pa = float64_unpack_canonical(a, s);
2088    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2089    return float64_round_pack_canonical(pr, s);
2090}
2091
2092/*
2093 * Returns the result of converting the floating-point value `a' to
2094 * the two's complement integer format. The conversion is performed
2095 * according to the IEC/IEEE Standard for Binary Floating-Point
2096 * Arithmetic---which means in particular that the conversion is
2097 * rounded according to the current rounding mode. If `a' is a NaN,
2098 * the largest positive integer is returned. Otherwise, if the
2099 * conversion overflows, the largest integer with the same sign as `a'
2100 * is returned.
2101*/
2102
2103static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2104                                     int64_t min, int64_t max,
2105                                     float_status *s)
2106{
2107    uint64_t r;
2108    int orig_flags = get_float_exception_flags(s);
2109    FloatParts p = round_to_int(in, rmode, scale, s);
2110
2111    switch (p.cls) {
2112    case float_class_snan:
2113    case float_class_qnan:
2114        s->float_exception_flags = orig_flags | float_flag_invalid;
2115        return max;
2116    case float_class_inf:
2117        s->float_exception_flags = orig_flags | float_flag_invalid;
2118        return p.sign ? min : max;
2119    case float_class_zero:
2120        return 0;
2121    case float_class_normal:
2122        if (p.exp < DECOMPOSED_BINARY_POINT) {
2123            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2124        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2125            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2126        } else {
2127            r = UINT64_MAX;
2128        }
2129        if (p.sign) {
2130            if (r <= -(uint64_t) min) {
2131                return -r;
2132            } else {
2133                s->float_exception_flags = orig_flags | float_flag_invalid;
2134                return min;
2135            }
2136        } else {
2137            if (r <= max) {
2138                return r;
2139            } else {
2140                s->float_exception_flags = orig_flags | float_flag_invalid;
2141                return max;
2142            }
2143        }
2144    default:
2145        g_assert_not_reached();
2146    }
2147}
2148
2149int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2150                                float_status *s)
2151{
2152    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2153                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2154}
2155
2156int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2157                                float_status *s)
2158{
2159    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2160                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2161}
2162
2163int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2164                                float_status *s)
2165{
2166    return round_to_int_and_pack(float16_unpack_canonical(a, s),
2167                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2168}
2169
2170int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2171                                float_status *s)
2172{
2173    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2174                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2175}
2176
2177int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2178                                float_status *s)
2179{
2180    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2181                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2182}
2183
2184int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2185                                float_status *s)
2186{
2187    return round_to_int_and_pack(float32_unpack_canonical(a, s),
2188                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2189}
2190
2191int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2192                                float_status *s)
2193{
2194    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2195                                 rmode, scale, INT16_MIN, INT16_MAX, s);
2196}
2197
2198int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2199                                float_status *s)
2200{
2201    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2202                                 rmode, scale, INT32_MIN, INT32_MAX, s);
2203}
2204
2205int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2206                                float_status *s)
2207{
2208    return round_to_int_and_pack(float64_unpack_canonical(a, s),
2209                                 rmode, scale, INT64_MIN, INT64_MAX, s);
2210}
2211
2212int16_t float16_to_int16(float16 a, float_status *s)
2213{
2214    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2215}
2216
2217int32_t float16_to_int32(float16 a, float_status *s)
2218{
2219    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2220}
2221
2222int64_t float16_to_int64(float16 a, float_status *s)
2223{
2224    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2225}
2226
2227int16_t float32_to_int16(float32 a, float_status *s)
2228{
2229    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2230}
2231
2232int32_t float32_to_int32(float32 a, float_status *s)
2233{
2234    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2235}
2236
2237int64_t float32_to_int64(float32 a, float_status *s)
2238{
2239    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2240}
2241
2242int16_t float64_to_int16(float64 a, float_status *s)
2243{
2244    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2245}
2246
2247int32_t float64_to_int32(float64 a, float_status *s)
2248{
2249    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2250}
2251
2252int64_t float64_to_int64(float64 a, float_status *s)
2253{
2254    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2255}
2256
2257int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2258{
2259    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2260}
2261
2262int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2263{
2264    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2265}
2266
2267int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2268{
2269    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2270}
2271
2272int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2273{
2274    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2275}
2276
2277int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2278{
2279    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2280}
2281
2282int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2283{
2284    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2285}
2286
2287int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2288{
2289    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2290}
2291
2292int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2293{
2294    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2295}
2296
2297int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2298{
2299    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2300}
2301
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
 */
2314
2315static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2316                                       uint64_t max, float_status *s)
2317{
2318    int orig_flags = get_float_exception_flags(s);
2319    FloatParts p = round_to_int(in, rmode, scale, s);
2320    uint64_t r;
2321
2322    switch (p.cls) {
2323    case float_class_snan:
2324    case float_class_qnan:
2325        s->float_exception_flags = orig_flags | float_flag_invalid;
2326        return max;
2327    case float_class_inf:
2328        s->float_exception_flags = orig_flags | float_flag_invalid;
2329        return p.sign ? 0 : max;
2330    case float_class_zero:
2331        return 0;
2332    case float_class_normal:
2333        if (p.sign) {
2334            s->float_exception_flags = orig_flags | float_flag_invalid;
2335            return 0;
2336        }
2337
2338        if (p.exp < DECOMPOSED_BINARY_POINT) {
2339            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2340        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2341            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2342        } else {
2343            s->float_exception_flags = orig_flags | float_flag_invalid;
2344            return max;
2345        }
2346
2347        /* For uint64 this will never trip, but if p.exp is too large
2348         * to shift a decomposed fraction we shall have exited via the
2349         * 3rd leg above.
2350         */
2351        if (r > max) {
2352            s->float_exception_flags = orig_flags | float_flag_invalid;
2353            return max;
2354        }
2355        return r;
2356    default:
2357        g_assert_not_reached();
2358    }
2359}
2360
2361uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2362                                  float_status *s)
2363{
2364    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2365                                  rmode, scale, UINT16_MAX, s);
2366}
2367
2368uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2369                                  float_status *s)
2370{
2371    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2372                                  rmode, scale, UINT32_MAX, s);
2373}
2374
2375uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2376                                  float_status *s)
2377{
2378    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2379                                  rmode, scale, UINT64_MAX, s);
2380}
2381
2382uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2383                                  float_status *s)
2384{
2385    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2386                                  rmode, scale, UINT16_MAX, s);
2387}
2388
2389uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2390                                  float_status *s)
2391{
2392    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2393                                  rmode, scale, UINT32_MAX, s);
2394}
2395
2396uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2397                                  float_status *s)
2398{
2399    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2400                                  rmode, scale, UINT64_MAX, s);
2401}
2402
2403uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2404                                  float_status *s)
2405{
2406    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2407                                  rmode, scale, UINT16_MAX, s);
2408}
2409
2410uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2411                                  float_status *s)
2412{
2413    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2414                                  rmode, scale, UINT32_MAX, s);
2415}
2416
2417uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2418                                  float_status *s)
2419{
2420    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2421                                  rmode, scale, UINT64_MAX, s);
2422}
2423
2424uint16_t float16_to_uint16(float16 a, float_status *s)
2425{
2426    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2427}
2428
2429uint32_t float16_to_uint32(float16 a, float_status *s)
2430{
2431    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2432}
2433
2434uint64_t float16_to_uint64(float16 a, float_status *s)
2435{
2436    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2437}
2438
2439uint16_t float32_to_uint16(float32 a, float_status *s)
2440{
2441    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2442}
2443
2444uint32_t float32_to_uint32(float32 a, float_status *s)
2445{
2446    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2447}
2448
2449uint64_t float32_to_uint64(float32 a, float_status *s)
2450{
2451    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2452}
2453
2454uint16_t float64_to_uint16(float64 a, float_status *s)
2455{
2456    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2457}
2458
2459uint32_t float64_to_uint32(float64 a, float_status *s)
2460{
2461    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2462}
2463
2464uint64_t float64_to_uint64(float64 a, float_status *s)
2465{
2466    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2467}
2468
2469uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2470{
2471    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2472}
2473
2474uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2475{
2476    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2477}
2478
2479uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2480{
2481    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2482}
2483
2484uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2485{
2486    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2487}
2488
2489uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2490{
2491    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2492}
2493
2494uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2495{
2496    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2497}
2498
2499uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2500{
2501    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2502}
2503
2504uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2505{
2506    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2507}
2508
2509uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2510{
2511    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2512}
2513
2514/*
2515 * Integer to float conversions
2516 *
2517 * Returns the result of converting the two's complement integer `a'
2518 * to the floating-point format. The conversion is performed according
2519 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2520 */
2521
2522static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2523{
2524    FloatParts r = { .sign = false };
2525
2526    if (a == 0) {
2527        r.cls = float_class_zero;
2528    } else {
2529        uint64_t f = a;
2530        int shift;
2531
2532        r.cls = float_class_normal;
2533        if (a < 0) {
2534            f = -f;
2535            r.sign = true;
2536        }
2537        shift = clz64(f) - 1;
2538        scale = MIN(MAX(scale, -0x10000), 0x10000);
2539
2540        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2541        r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2542    }
2543
2544    return r;
2545}
2546
2547float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2548{
2549    FloatParts pa = int_to_float(a, scale, status);
2550    return float16_round_pack_canonical(pa, status);
2551}
2552
2553float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2554{
2555    return int64_to_float16_scalbn(a, scale, status);
2556}
2557
2558float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2559{
2560    return int64_to_float16_scalbn(a, scale, status);
2561}
2562
2563float16 int64_to_float16(int64_t a, float_status *status)
2564{
2565    return int64_to_float16_scalbn(a, 0, status);
2566}
2567
2568float16 int32_to_float16(int32_t a, float_status *status)
2569{
2570    return int64_to_float16_scalbn(a, 0, status);
2571}
2572
2573float16 int16_to_float16(int16_t a, float_status *status)
2574{
2575    return int64_to_float16_scalbn(a, 0, status);
2576}
2577
2578float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2579{
2580    FloatParts pa = int_to_float(a, scale, status);
2581    return float32_round_pack_canonical(pa, status);
2582}
2583
2584float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2585{
2586    return int64_to_float32_scalbn(a, scale, status);
2587}
2588
2589float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2590{
2591    return int64_to_float32_scalbn(a, scale, status);
2592}
2593
2594float32 int64_to_float32(int64_t a, float_status *status)
2595{
2596    return int64_to_float32_scalbn(a, 0, status);
2597}
2598
2599float32 int32_to_float32(int32_t a, float_status *status)
2600{
2601    return int64_to_float32_scalbn(a, 0, status);
2602}
2603
2604float32 int16_to_float32(int16_t a, float_status *status)
2605{
2606    return int64_to_float32_scalbn(a, 0, status);
2607}
2608
2609float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2610{
2611    FloatParts pa = int_to_float(a, scale, status);
2612    return float64_round_pack_canonical(pa, status);
2613}
2614
2615float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2616{
2617    return int64_to_float64_scalbn(a, scale, status);
2618}
2619
2620float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2621{
2622    return int64_to_float64_scalbn(a, scale, status);
2623}
2624
2625float64 int64_to_float64(int64_t a, float_status *status)
2626{
2627    return int64_to_float64_scalbn(a, 0, status);
2628}
2629
2630float64 int32_to_float64(int32_t a, float_status *status)
2631{
2632    return int64_to_float64_scalbn(a, 0, status);
2633}
2634
2635float64 int16_to_float64(int16_t a, float_status *status)
2636{
2637    return int64_to_float64_scalbn(a, 0, status);
2638}
2639
2640
2641/*
2642 * Unsigned Integer to float conversions
2643 *
2644 * Returns the result of converting the unsigned integer `a' to the
2645 * floating-point format. The conversion is performed according to the
2646 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2647 */
2648
2649static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2650{
2651    FloatParts r = { .sign = false };
2652
2653    if (a == 0) {
2654        r.cls = float_class_zero;
2655    } else {
2656        scale = MIN(MAX(scale, -0x10000), 0x10000);
2657        r.cls = float_class_normal;
2658        if ((int64_t)a < 0) {
2659            r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2660            shift64RightJamming(a, 1, &a);
2661            r.frac = a;
2662        } else {
2663            int shift = clz64(a) - 1;
2664            r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2665            r.frac = a << shift;
2666        }
2667    }
2668
2669    return r;
2670}
2671
2672float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2673{
2674    FloatParts pa = uint_to_float(a, scale, status);
2675    return float16_round_pack_canonical(pa, status);
2676}
2677
2678float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2679{
2680    return uint64_to_float16_scalbn(a, scale, status);
2681}
2682
2683float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2684{
2685    return uint64_to_float16_scalbn(a, scale, status);
2686}
2687
2688float16 uint64_to_float16(uint64_t a, float_status *status)
2689{
2690    return uint64_to_float16_scalbn(a, 0, status);
2691}
2692
2693float16 uint32_to_float16(uint32_t a, float_status *status)
2694{
2695    return uint64_to_float16_scalbn(a, 0, status);
2696}
2697
2698float16 uint16_to_float16(uint16_t a, float_status *status)
2699{
2700    return uint64_to_float16_scalbn(a, 0, status);
2701}
2702
2703float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2704{
2705    FloatParts pa = uint_to_float(a, scale, status);
2706    return float32_round_pack_canonical(pa, status);
2707}
2708
2709float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2710{
2711    return uint64_to_float32_scalbn(a, scale, status);
2712}
2713
2714float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2715{
2716    return uint64_to_float32_scalbn(a, scale, status);
2717}
2718
2719float32 uint64_to_float32(uint64_t a, float_status *status)
2720{
2721    return uint64_to_float32_scalbn(a, 0, status);
2722}
2723
2724float32 uint32_to_float32(uint32_t a, float_status *status)
2725{
2726    return uint64_to_float32_scalbn(a, 0, status);
2727}
2728
2729float32 uint16_to_float32(uint16_t a, float_status *status)
2730{
2731    return uint64_to_float32_scalbn(a, 0, status);
2732}
2733
2734float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2735{
2736    FloatParts pa = uint_to_float(a, scale, status);
2737    return float64_round_pack_canonical(pa, status);
2738}
2739
2740float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2741{
2742    return uint64_to_float64_scalbn(a, scale, status);
2743}
2744
2745float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2746{
2747    return uint64_to_float64_scalbn(a, scale, status);
2748}
2749
2750float64 uint64_to_float64(uint64_t a, float_status *status)
2751{
2752    return uint64_to_float64_scalbn(a, 0, status);
2753}
2754
2755float64 uint32_to_float64(uint32_t a, float_status *status)
2756{
2757    return uint64_to_float64_scalbn(a, 0, status);
2758}
2759
2760float64 uint16_to_float64(uint16_t a, float_status *status)
2761{
2762    return uint64_to_float64_scalbn(a, 0, status);
2763}
2764
2765/* Float Min/Max */
2766/* min() and max() functions. These can't be implemented as
2767 * 'compare and pick one input' because that would mishandle
2768 * NaNs and +0 vs -0.
2769 *
2770 * minnum() and maxnum() functions. These are similar to the min()
2771 * and max() functions but if one of the arguments is a QNaN and
2772 * the other is numerical then the numerical argument is returned.
2773 * SNaNs will get quietened before being returned.
2774 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2775 * and maxNum() operations. min() and max() are the typical min/max
2776 * semantics provided by many CPUs which predate that specification.
2777 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2780 */
2781static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2782                                bool ieee, bool ismag, float_status *s)
2783{
2784    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2785        if (ieee) {
2786            /* Takes two floating-point values `a' and `b', one of
2787             * which is a NaN, and returns the appropriate NaN
2788             * result. If either `a' or `b' is a signaling NaN,
2789             * the invalid exception is raised.
2790             */
2791            if (is_snan(a.cls) || is_snan(b.cls)) {
2792                return pick_nan(a, b, s);
2793            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2794                return b;
2795            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2796                return a;
2797            }
2798        }
2799        return pick_nan(a, b, s);
2800    } else {
2801        int a_exp, b_exp;
2802
2803        switch (a.cls) {
2804        case float_class_normal:
2805            a_exp = a.exp;
2806            break;
2807        case float_class_inf:
2808            a_exp = INT_MAX;
2809            break;
2810        case float_class_zero:
2811            a_exp = INT_MIN;
2812            break;
2813        default:
2814            g_assert_not_reached();
2815            break;
2816        }
2817        switch (b.cls) {
2818        case float_class_normal:
2819            b_exp = b.exp;
2820            break;
2821        case float_class_inf:
2822            b_exp = INT_MAX;
2823            break;
2824        case float_class_zero:
2825            b_exp = INT_MIN;
2826            break;
2827        default:
2828            g_assert_not_reached();
2829            break;
2830        }
2831
2832        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2833            bool a_less = a_exp < b_exp;
2834            if (a_exp == b_exp) {
2835                a_less = a.frac < b.frac;
2836            }
2837            return a_less ^ ismin ? b : a;
2838        }
2839
2840        if (a.sign == b.sign) {
2841            bool a_less = a_exp < b_exp;
2842            if (a_exp == b_exp) {
2843                a_less = a.frac < b.frac;
2844            }
2845            return a.sign ^ a_less ^ ismin ? b : a;
2846        } else {
2847            return a.sign ^ ismin ? b : a;
2848        }
2849    }
2850}
2851
2852#define MINMAX(sz, name, ismin, isiee, ismag)                           \
2853float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2854                                     float_status *s)                   \
2855{                                                                       \
2856    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2857    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2858    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2859                                                                        \
2860    return float ## sz ## _round_pack_canonical(pr, s);                 \
2861}
2862
2863MINMAX(16, min, true, false, false)
2864MINMAX(16, minnum, true, true, false)
2865MINMAX(16, minnummag, true, true, true)
2866MINMAX(16, max, false, false, false)
2867MINMAX(16, maxnum, false, true, false)
2868MINMAX(16, maxnummag, false, true, true)
2869
2870MINMAX(32, min, true, false, false)
2871MINMAX(32, minnum, true, true, false)
2872MINMAX(32, minnummag, true, true, true)
2873MINMAX(32, max, false, false, false)
2874MINMAX(32, maxnum, false, true, false)
2875MINMAX(32, maxnummag, false, true, true)
2876
2877MINMAX(64, min, true, false, false)
2878MINMAX(64, minnum, true, true, false)
2879MINMAX(64, minnummag, true, true, true)
2880MINMAX(64, max, false, false, false)
2881MINMAX(64, maxnum, false, true, false)
2882MINMAX(64, maxnummag, false, true, true)
2883
2884#undef MINMAX
2885
2886/* Floating point compare */
2887static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2888                          float_status *s)
2889{
2890    if (is_nan(a.cls) || is_nan(b.cls)) {
2891        if (!is_quiet ||
2892            a.cls == float_class_snan ||
2893            b.cls == float_class_snan) {
2894            s->float_exception_flags |= float_flag_invalid;
2895        }
2896        return float_relation_unordered;
2897    }
2898
2899    if (a.cls == float_class_zero) {
2900        if (b.cls == float_class_zero) {
2901            return float_relation_equal;
2902        }
2903        return b.sign ? float_relation_greater : float_relation_less;
2904    } else if (b.cls == float_class_zero) {
2905        return a.sign ? float_relation_less : float_relation_greater;
2906    }
2907
2908    /* The only really important thing about infinity is its sign. If
2909     * both are infinities the sign marks the smallest of the two.
2910     */
2911    if (a.cls == float_class_inf) {
2912        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2913            return float_relation_equal;
2914        }
2915        return a.sign ? float_relation_less : float_relation_greater;
2916    } else if (b.cls == float_class_inf) {
2917        return b.sign ? float_relation_greater : float_relation_less;
2918    }
2919
2920    if (a.sign != b.sign) {
2921        return a.sign ? float_relation_less : float_relation_greater;
2922    }
2923
2924    if (a.exp == b.exp) {
2925        if (a.frac == b.frac) {
2926            return float_relation_equal;
2927        }
2928        if (a.sign) {
2929            return a.frac > b.frac ?
2930                float_relation_less : float_relation_greater;
2931        } else {
2932            return a.frac > b.frac ?
2933                float_relation_greater : float_relation_less;
2934        }
2935    } else {
2936        if (a.sign) {
2937            return a.exp > b.exp ? float_relation_less : float_relation_greater;
2938        } else {
2939            return a.exp > b.exp ? float_relation_greater : float_relation_less;
2940        }
2941    }
2942}
2943
2944#define COMPARE(name, attr, sz)                                         \
2945static int attr                                                         \
2946name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2947{                                                                       \
2948    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2949    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2950    return compare_floats(pa, pb, is_quiet, s);                         \
2951}
2952
2953COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2954COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2955COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2956
2957#undef COMPARE
2958
2959int float16_compare(float16 a, float16 b, float_status *s)
2960{
2961    return soft_f16_compare(a, b, false, s);
2962}
2963
2964int float16_compare_quiet(float16 a, float16 b, float_status *s)
2965{
2966    return soft_f16_compare(a, b, true, s);
2967}
2968
2969static int QEMU_FLATTEN
2970f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2971{
2972    union_float32 ua, ub;
2973
2974    ua.s = xa;
2975    ub.s = xb;
2976
2977    if (QEMU_NO_HARDFLOAT) {
2978        goto soft;
2979    }
2980
2981    float32_input_flush2(&ua.s, &ub.s, s);
2982    if (isgreaterequal(ua.h, ub.h)) {
2983        if (isgreater(ua.h, ub.h)) {
2984            return float_relation_greater;
2985        }
2986        return float_relation_equal;
2987    }
2988    if (likely(isless(ua.h, ub.h))) {
2989        return float_relation_less;
2990    }
2991    /* The only condition remaining is unordered.
2992     * Fall through to set flags.
2993     */
2994 soft:
2995    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2996}
2997
2998int float32_compare(float32 a, float32 b, float_status *s)
2999{
3000    return f32_compare(a, b, false, s);
3001}
3002
3003int float32_compare_quiet(float32 a, float32 b, float_status *s)
3004{
3005    return f32_compare(a, b, true, s);
3006}
3007
3008static int QEMU_FLATTEN
3009f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3010{
3011    union_float64 ua, ub;
3012
3013    ua.s = xa;
3014    ub.s = xb;
3015
3016    if (QEMU_NO_HARDFLOAT) {
3017        goto soft;
3018    }
3019
3020    float64_input_flush2(&ua.s, &ub.s, s);
3021    if (isgreaterequal(ua.h, ub.h)) {
3022        if (isgreater(ua.h, ub.h)) {
3023            return float_relation_greater;
3024        }
3025        return float_relation_equal;
3026    }
3027    if (likely(isless(ua.h, ub.h))) {
3028        return float_relation_less;
3029    }
3030    /* The only condition remaining is unordered.
3031     * Fall through to set flags.
3032     */
3033 soft:
3034    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3035}
3036
3037int float64_compare(float64 a, float64 b, float_status *s)
3038{
3039    return f64_compare(a, b, false, s);
3040}
3041
3042int float64_compare_quiet(float64 a, float64 b, float_status *s)
3043{
3044    return f64_compare(a, b, true, s);
3045}
3046
3047/* Multiply A by 2 raised to the power N.  */
3048static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3049{
3050    if (unlikely(is_nan(a.cls))) {
3051        return return_nan(a, s);
3052    }
3053    if (a.cls == float_class_normal) {
3054        /* The largest float type (even though not supported by FloatParts)
3055         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3056         * still allows rounding to infinity, without allowing overflow
3057         * within the int32_t that backs FloatParts.exp.
3058         */
3059        n = MIN(MAX(n, -0x10000), 0x10000);
3060        a.exp += n;
3061    }
3062    return a;
3063}
3064
3065float16 float16_scalbn(float16 a, int n, float_status *status)
3066{
3067    FloatParts pa = float16_unpack_canonical(a, status);
3068    FloatParts pr = scalbn_decomposed(pa, n, status);
3069    return float16_round_pack_canonical(pr, status);
3070}
3071
3072float32 float32_scalbn(float32 a, int n, float_status *status)
3073{
3074    FloatParts pa = float32_unpack_canonical(a, status);
3075    FloatParts pr = scalbn_decomposed(pa, n, status);
3076    return float32_round_pack_canonical(pr, status);
3077}
3078
3079float64 float64_scalbn(float64 a, int n, float_status *status)
3080{
3081    FloatParts pa = float64_unpack_canonical(a, status);
3082    FloatParts pr = scalbn_decomposed(pa, n, status);
3083    return float64_round_pack_canonical(pr, status);
3084}
3085
3086/*
3087 * Square Root
3088 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3091 * square root by iterating down from the implicit bit to enough extra
3092 * bits to ensure we get a correctly rounded result.
3093 *
3094 * This does mean however the calculation is slower than before,
3095 * especially for 64 bit floats.
3096 */
3097
3098static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3099{
3100    uint64_t a_frac, r_frac, s_frac;
3101    int bit, last_bit;
3102
3103    if (is_nan(a.cls)) {
3104        return return_nan(a, s);
3105    }
3106    if (a.cls == float_class_zero) {
3107        return a;  /* sqrt(+-0) = +-0 */
3108    }
3109    if (a.sign) {
3110        s->float_exception_flags |= float_flag_invalid;
3111        return parts_default_nan(s);
3112    }
3113    if (a.cls == float_class_inf) {
3114        return a;  /* sqrt(+inf) = +inf */
3115    }
3116
3117    assert(a.cls == float_class_normal);
3118
3119    /* We need two overflow bits at the top. Adding room for that is a
3120     * right shift. If the exponent is odd, we can discard the low bit
3121     * by multiplying the fraction by 2; that's a left shift. Combine
3122     * those and we shift right if the exponent is even.
3123     */
3124    a_frac = a.frac;
3125    if (!(a.exp & 1)) {
3126        a_frac >>= 1;
3127    }
3128    a.exp >>= 1;
3129
3130    /* Bit-by-bit computation of sqrt.  */
3131    r_frac = 0;
3132    s_frac = 0;
3133
3134    /* Iterate from implicit bit down to the 3 extra bits to compute a
3135     * properly rounded result. Remember we've inserted one more bit
3136     * at the top, so these positions are one less.
3137     */
3138    bit = DECOMPOSED_BINARY_POINT - 1;
3139    last_bit = MAX(p->frac_shift - 4, 0);
3140    do {
3141        uint64_t q = 1ULL << bit;
3142        uint64_t t_frac = s_frac + q;
3143        if (t_frac <= a_frac) {
3144            s_frac = t_frac + q;
3145            a_frac -= t_frac;
3146            r_frac += q;
3147        }
3148        a_frac <<= 1;
3149    } while (--bit >= last_bit);
3150
3151    /* Undo the right shift done above. If there is any remaining
3152     * fraction, the result is inexact. Set the sticky bit.
3153     */
3154    a.frac = (r_frac << 1) + (a_frac != 0);
3155
3156    return a;
3157}
3158
3159float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3160{
3161    FloatParts pa = float16_unpack_canonical(a, status);
3162    FloatParts pr = sqrt_float(pa, status, &float16_params);
3163    return float16_round_pack_canonical(pr, status);
3164}
3165
3166static float32 QEMU_SOFTFLOAT_ATTR
3167soft_f32_sqrt(float32 a, float_status *status)
3168{
3169    FloatParts pa = float32_unpack_canonical(a, status);
3170    FloatParts pr = sqrt_float(pa, status, &float32_params);
3171    return float32_round_pack_canonical(pr, status);
3172}
3173
3174static float64 QEMU_SOFTFLOAT_ATTR
3175soft_f64_sqrt(float64 a, float_status *status)
3176{
3177    FloatParts pa = float64_unpack_canonical(a, status);
3178    FloatParts pr = sqrt_float(pa, status, &float64_params);
3179    return float64_round_pack_canonical(pr, status);
3180}
3181
3182float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3183{
3184    union_float32 ua, ur;
3185
3186    ua.s = xa;
3187    if (unlikely(!can_use_fpu(s))) {
3188        goto soft;
3189    }
3190
3191    float32_input_flush1(&ua.s, s);
3192    if (QEMU_HARDFLOAT_1F32_USE_FP) {
3193        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3194                       fpclassify(ua.h) == FP_ZERO) ||
3195                     signbit(ua.h))) {
3196            goto soft;
3197        }
3198    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3199                        float32_is_neg(ua.s))) {
3200        goto soft;
3201    }
3202    ur.h = sqrtf(ua.h);
3203    return ur.s;
3204
3205 soft:
3206    return soft_f32_sqrt(ua.s, s);
3207}
3208
3209float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3210{
3211    union_float64 ua, ur;
3212
3213    ua.s = xa;
3214    if (unlikely(!can_use_fpu(s))) {
3215        goto soft;
3216    }
3217
3218    float64_input_flush1(&ua.s, s);
3219    if (QEMU_HARDFLOAT_1F64_USE_FP) {
3220        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3221                       fpclassify(ua.h) == FP_ZERO) ||
3222                     signbit(ua.h))) {
3223            goto soft;
3224        }
3225    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3226                        float64_is_neg(ua.s))) {
3227        goto soft;
3228    }
3229    ur.h = sqrt(ua.h);
3230    return ur.s;
3231
3232 soft:
3233    return soft_f64_sqrt(ua.s, s);
3234}
3235
3236/*----------------------------------------------------------------------------
3237| The pattern for a default generated NaN.
3238*----------------------------------------------------------------------------*/
3239
3240float16 float16_default_nan(float_status *status)
3241{
3242    FloatParts p = parts_default_nan(status);
3243    p.frac >>= float16_params.frac_shift;
3244    return float16_pack_raw(p);
3245}
3246
3247float32 float32_default_nan(float_status *status)
3248{
3249    FloatParts p = parts_default_nan(status);
3250    p.frac >>= float32_params.frac_shift;
3251    return float32_pack_raw(p);
3252}
3253
3254float64 float64_default_nan(float_status *status)
3255{
3256    FloatParts p = parts_default_nan(status);
3257    p.frac >>= float64_params.frac_shift;
3258    return float64_pack_raw(p);
3259}
3260
3261float128 float128_default_nan(float_status *status)
3262{
3263    FloatParts p = parts_default_nan(status);
3264    float128 r;
3265
3266    /* Extrapolate from the choices made by parts_default_nan to fill
3267     * in the quad-floating format.  If the low bit is set, assume we
3268     * want to set all non-snan bits.
3269     */
3270    r.low = -(p.frac & 1);
3271    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3272    r.high |= LIT64(0x7FFF000000000000);
3273    r.high |= (uint64_t)p.sign << 63;
3274
3275    return r;
3276}
3277
3278/*----------------------------------------------------------------------------
3279| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3280*----------------------------------------------------------------------------*/
3281
3282float16 float16_silence_nan(float16 a, float_status *status)
3283{
3284    FloatParts p = float16_unpack_raw(a);
3285    p.frac <<= float16_params.frac_shift;
3286    p = parts_silence_nan(p, status);
3287    p.frac >>= float16_params.frac_shift;
3288    return float16_pack_raw(p);
3289}
3290
3291float32 float32_silence_nan(float32 a, float_status *status)
3292{
3293    FloatParts p = float32_unpack_raw(a);
3294    p.frac <<= float32_params.frac_shift;
3295    p = parts_silence_nan(p, status);
3296    p.frac >>= float32_params.frac_shift;
3297    return float32_pack_raw(p);
3298}
3299
3300float64 float64_silence_nan(float64 a, float_status *status)
3301{
3302    FloatParts p = float64_unpack_raw(a);
3303    p.frac <<= float64_params.frac_shift;
3304    p = parts_silence_nan(p, status);
3305    p.frac >>= float64_params.frac_shift;
3306    return float64_pack_raw(p);
3307}
3308
3309/*----------------------------------------------------------------------------
3310| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3311| and 7, and returns the properly rounded 32-bit integer corresponding to the
3312| input.  If `zSign' is 1, the input is negated before being converted to an
3313| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3314| is simply rounded to an integer, with the inexact exception raised if the
3315| input cannot be represented exactly as an integer.  However, if the fixed-
3316| point input is too large, the invalid exception is raised and the largest
3317| positive or negative integer is returned.
3318*----------------------------------------------------------------------------*/
3319
3320static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3321{
3322    int8_t roundingMode;
3323    flag roundNearestEven;
3324    int8_t roundIncrement, roundBits;
3325    int32_t z;
3326
3327    roundingMode = status->float_rounding_mode;
3328    roundNearestEven = ( roundingMode == float_round_nearest_even );
3329    switch (roundingMode) {
3330    case float_round_nearest_even:
3331    case float_round_ties_away:
3332        roundIncrement = 0x40;
3333        break;
3334    case float_round_to_zero:
3335        roundIncrement = 0;
3336        break;
3337    case float_round_up:
3338        roundIncrement = zSign ? 0 : 0x7f;
3339        break;
3340    case float_round_down:
3341        roundIncrement = zSign ? 0x7f : 0;
3342        break;
3343    case float_round_to_odd:
3344        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3345        break;
3346    default:
3347        abort();
3348    }
3349    roundBits = absZ & 0x7F;
3350    absZ = ( absZ + roundIncrement )>>7;
3351    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3352    z = absZ;
3353    if ( zSign ) z = - z;
3354    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3355        float_raise(float_flag_invalid, status);
3356        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3357    }
3358    if (roundBits) {
3359        status->float_exception_flags |= float_flag_inexact;
3360    }
3361    return z;
3362
3363}
3364
3365/*----------------------------------------------------------------------------
3366| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3367| `absZ1', with binary point between bits 63 and 64 (between the input words),
3368| and returns the properly rounded 64-bit integer corresponding to the input.
3369| If `zSign' is 1, the input is negated before being converted to an integer.
3370| Ordinarily, the fixed-point input is simply rounded to an integer, with
3371| the inexact exception raised if the input cannot be represented exactly as
3372| an integer.  However, if the fixed-point input is too large, the invalid
3373| exception is raised and the largest positive or negative integer is
3374| returned.
3375*----------------------------------------------------------------------------*/
3376
3377static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3378                               float_status *status)
3379{
3380    int8_t roundingMode;
3381    flag roundNearestEven, increment;
3382    int64_t z;
3383
3384    roundingMode = status->float_rounding_mode;
3385    roundNearestEven = ( roundingMode == float_round_nearest_even );
3386    switch (roundingMode) {
3387    case float_round_nearest_even:
3388    case float_round_ties_away:
3389        increment = ((int64_t) absZ1 < 0);
3390        break;
3391    case float_round_to_zero:
3392        increment = 0;
3393        break;
3394    case float_round_up:
3395        increment = !zSign && absZ1;
3396        break;
3397    case float_round_down:
3398        increment = zSign && absZ1;
3399        break;
3400    case float_round_to_odd:
3401        increment = !(absZ0 & 1) && absZ1;
3402        break;
3403    default:
3404        abort();
3405    }
3406    if ( increment ) {
3407        ++absZ0;
3408        if ( absZ0 == 0 ) goto overflow;
3409        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3410    }
3411    z = absZ0;
3412    if ( zSign ) z = - z;
3413    if ( z && ( ( z < 0 ) ^ zSign ) ) {
3414 overflow:
3415        float_raise(float_flag_invalid, status);
3416        return
3417              zSign ? (int64_t) LIT64( 0x8000000000000000 )
3418            : LIT64( 0x7FFFFFFFFFFFFFFF );
3419    }
3420    if (absZ1) {
3421        status->float_exception_flags |= float_flag_inexact;
3422    }
3423    return z;
3424
3425}
3426
3427/*----------------------------------------------------------------------------
3428| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3429| `absZ1', with binary point between bits 63 and 64 (between the input words),
3430| and returns the properly rounded 64-bit unsigned integer corresponding to the
3431| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3432| with the inexact exception raised if the input cannot be represented exactly
3433| as an integer.  However, if the fixed-point input is too large, the invalid
3434| exception is raised and the largest unsigned integer is returned.
3435*----------------------------------------------------------------------------*/
3436
3437static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3438                                uint64_t absZ1, float_status *status)
3439{
3440    int8_t roundingMode;
3441    flag roundNearestEven, increment;
3442
3443    roundingMode = status->float_rounding_mode;
3444    roundNearestEven = (roundingMode == float_round_nearest_even);
3445    switch (roundingMode) {
3446    case float_round_nearest_even:
3447    case float_round_ties_away:
3448        increment = ((int64_t)absZ1 < 0);
3449        break;
3450    case float_round_to_zero:
3451        increment = 0;
3452        break;
3453    case float_round_up:
3454        increment = !zSign && absZ1;
3455        break;
3456    case float_round_down:
3457        increment = zSign && absZ1;
3458        break;
3459    case float_round_to_odd:
3460        increment = !(absZ0 & 1) && absZ1;
3461        break;
3462    default:
3463        abort();
3464    }
3465    if (increment) {
3466        ++absZ0;
3467        if (absZ0 == 0) {
3468            float_raise(float_flag_invalid, status);
3469            return LIT64(0xFFFFFFFFFFFFFFFF);
3470        }
3471        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3472    }
3473
3474    if (zSign && absZ0) {
3475        float_raise(float_flag_invalid, status);
3476        return 0;
3477    }
3478
3479    if (absZ1) {
3480        status->float_exception_flags |= float_flag_inexact;
3481    }
3482    return absZ0;
3483}
3484
3485/*----------------------------------------------------------------------------
3486| If `a' is denormal and we are in flush-to-zero mode then set the
3487| input-denormal exception and return zero. Otherwise just return the value.
3488*----------------------------------------------------------------------------*/
3489float32 float32_squash_input_denormal(float32 a, float_status *status)
3490{
3491    if (status->flush_inputs_to_zero) {
3492        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3493            float_raise(float_flag_input_denormal, status);
3494            return make_float32(float32_val(a) & 0x80000000);
3495        }
3496    }
3497    return a;
3498}
3499
3500/*----------------------------------------------------------------------------
3501| Normalizes the subnormal single-precision floating-point value represented
3502| by the denormalized significand `aSig'.  The normalized exponent and
3503| significand are stored at the locations pointed to by `zExpPtr' and
3504| `zSigPtr', respectively.
3505*----------------------------------------------------------------------------*/
3506
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves 8 leading zero bits in the significand. */
    const int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3517
3518/*----------------------------------------------------------------------------
3519| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3520| and significand `zSig', and returns the proper single-precision floating-
3521| point value corresponding to the abstract input.  Ordinarily, the abstract
3522| value is simply rounded and packed into the single-precision format, with
3523| the inexact exception raised if the abstract input cannot be represented
3524| exactly.  However, if the abstract value is too large, the overflow and
3525| inexact exceptions are raised and an infinity or maximal finite value is
3526| returned.  If the abstract value is too small, the input value is rounded to
3527| a subnormal number, and the underflow and inexact exceptions are raised if
3528| the abstract input cannot be represented exactly as a subnormal single-
3529| precision floating-point number.
3530|     The input significand `zSig' has its binary point between bits 30
3531| and 29, which is 7 bits to the left of the usual location.  This shifted
3532| significand must be normalized or smaller.  If `zSig' is not normalized,
3533| `zExp' must be 0; in that case, the result returned is a subnormal number,
3534| and it must not require rounding.  In the usual case that `zSig' is
3535| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3536| The handling of underflow and overflow follows the IEC/IEEE Standard for
3537| Binary Floating-Point Arithmetic.
3538*----------------------------------------------------------------------------*/
3539
3540static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3541                                   float_status *status)
3542{
3543    int8_t roundingMode;
3544    flag roundNearestEven;
3545    int8_t roundIncrement, roundBits;
3546    flag isTiny;
3547
3548    roundingMode = status->float_rounding_mode;
3549    roundNearestEven = ( roundingMode == float_round_nearest_even );
3550    switch (roundingMode) {
3551    case float_round_nearest_even:
3552    case float_round_ties_away:
3553        roundIncrement = 0x40;
3554        break;
3555    case float_round_to_zero:
3556        roundIncrement = 0;
3557        break;
3558    case float_round_up:
3559        roundIncrement = zSign ? 0 : 0x7f;
3560        break;
3561    case float_round_down:
3562        roundIncrement = zSign ? 0x7f : 0;
3563        break;
3564    case float_round_to_odd:
3565        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3566        break;
3567    default:
3568        abort();
3569        break;
3570    }
3571    roundBits = zSig & 0x7F;
3572    if ( 0xFD <= (uint16_t) zExp ) {
3573        if (    ( 0xFD < zExp )
3574             || (    ( zExp == 0xFD )
3575                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3576           ) {
3577            bool overflow_to_inf = roundingMode != float_round_to_odd &&
3578                                   roundIncrement != 0;
3579            float_raise(float_flag_overflow | float_flag_inexact, status);
3580            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3581        }
3582        if ( zExp < 0 ) {
3583            if (status->flush_to_zero) {
3584                float_raise(float_flag_output_denormal, status);
3585                return packFloat32(zSign, 0, 0);
3586            }
3587            isTiny =
3588                (status->float_detect_tininess
3589                 == float_tininess_before_rounding)
3590                || ( zExp < -1 )
3591                || ( zSig + roundIncrement < 0x80000000 );
3592            shift32RightJamming( zSig, - zExp, &zSig );
3593            zExp = 0;
3594            roundBits = zSig & 0x7F;
3595            if (isTiny && roundBits) {
3596                float_raise(float_flag_underflow, status);
3597            }
3598            if (roundingMode == float_round_to_odd) {
3599                /*
3600                 * For round-to-odd case, the roundIncrement depends on
3601                 * zSig which just changed.
3602                 */
3603                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3604            }
3605        }
3606    }
3607    if (roundBits) {
3608        status->float_exception_flags |= float_flag_inexact;
3609    }
3610    zSig = ( zSig + roundIncrement )>>7;
3611    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3612    if ( zSig == 0 ) zExp = 0;
3613    return packFloat32( zSign, zExp, zSig );
3614
3615}
3616
3617/*----------------------------------------------------------------------------
3618| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3619| and significand `zSig', and returns the proper single-precision floating-
3620| point value corresponding to the abstract input.  This routine is just like
3621| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3622| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3623| floating-point exponent.
3624*----------------------------------------------------------------------------*/
3625
3626static float32
3627 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3628                              float_status *status)
3629{
3630    int8_t shiftCount;
3631
3632    shiftCount = clz32(zSig) - 1;
3633    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3634                               status);
3635
3636}
3637
3638/*----------------------------------------------------------------------------
3639| If `a' is denormal and we are in flush-to-zero mode then set the
3640| input-denormal exception and return zero. Otherwise just return the value.
3641*----------------------------------------------------------------------------*/
3642float64 float64_squash_input_denormal(float64 a, float_status *status)
3643{
3644    if (status->flush_inputs_to_zero) {
3645        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3646            float_raise(float_flag_input_denormal, status);
3647            return make_float64(float64_val(a) & (1ULL << 63));
3648        }
3649    }
3650    return a;
3651}
3652
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left by
| `clz64(aSig) - 11' bits and written to `*zSigPtr'; the matching exponent,
| 1 minus that shift, is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3670
3671/*----------------------------------------------------------------------------
3672| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3673| double-precision floating-point value, returning the result.  After being
3674| shifted into the proper positions, the three fields are simply added
3675| together to form the result.  This means that any integer portion of `zSig'
3676| will be added into the exponent.  Since a properly normalized significand
3677| will have an integer portion equal to 1, the `zExp' input should be 1 less
3678| than the desired result exponent whenever `zSig' is a complete, normalized
3679| significand.
3680*----------------------------------------------------------------------------*/
3681
3682static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3683{
3684
3685    return make_float64(
3686        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3687
3688}
3689
3690/*----------------------------------------------------------------------------
3691| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3692| and significand `zSig', and returns the proper double-precision floating-
3693| point value corresponding to the abstract input.  Ordinarily, the abstract
3694| value is simply rounded and packed into the double-precision format, with
3695| the inexact exception raised if the abstract input cannot be represented
3696| exactly.  However, if the abstract value is too large, the overflow and
3697| inexact exceptions are raised and an infinity or maximal finite value is
3698| returned.  If the abstract value is too small, the input value is rounded to
3699| a subnormal number, and the underflow and inexact exceptions are raised if
3700| the abstract input cannot be represented exactly as a subnormal double-
3701| precision floating-point number.
3702|     The input significand `zSig' has its binary point between bits 62
3703| and 61, which is 10 bits to the left of the usual location.  This shifted
3704| significand must be normalized or smaller.  If `zSig' is not normalized,
3705| `zExp' must be 0; in that case, the result returned is a subnormal number,
3706| and it must not require rounding.  In the usual case that `zSig' is
3707| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3708| The handling of underflow and overflow follows the IEC/IEEE Standard for
3709| Binary Floating-Point Arithmetic.
3710*----------------------------------------------------------------------------*/
3711
3712static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3713                                   float_status *status)
3714{
3715    int8_t roundingMode;
3716    flag roundNearestEven;
3717    int roundIncrement, roundBits;
3718    flag isTiny;
3719
3720    roundingMode = status->float_rounding_mode;
3721    roundNearestEven = ( roundingMode == float_round_nearest_even );
3722    switch (roundingMode) {
3723    case float_round_nearest_even:
3724    case float_round_ties_away:
3725        roundIncrement = 0x200;
3726        break;
3727    case float_round_to_zero:
3728        roundIncrement = 0;
3729        break;
3730    case float_round_up:
3731        roundIncrement = zSign ? 0 : 0x3ff;
3732        break;
3733    case float_round_down:
3734        roundIncrement = zSign ? 0x3ff : 0;
3735        break;
3736    case float_round_to_odd:
3737        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3738        break;
3739    default:
3740        abort();
3741    }
3742    roundBits = zSig & 0x3FF;
3743    if ( 0x7FD <= (uint16_t) zExp ) {
3744        if (    ( 0x7FD < zExp )
3745             || (    ( zExp == 0x7FD )
3746                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3747           ) {
3748            bool overflow_to_inf = roundingMode != float_round_to_odd &&
3749                                   roundIncrement != 0;
3750            float_raise(float_flag_overflow | float_flag_inexact, status);
3751            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3752        }
3753        if ( zExp < 0 ) {
3754            if (status->flush_to_zero) {
3755                float_raise(float_flag_output_denormal, status);
3756                return packFloat64(zSign, 0, 0);
3757            }
3758            isTiny =
3759                   (status->float_detect_tininess
3760                    == float_tininess_before_rounding)
3761                || ( zExp < -1 )
3762                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3763            shift64RightJamming( zSig, - zExp, &zSig );
3764            zExp = 0;
3765            roundBits = zSig & 0x3FF;
3766            if (isTiny && roundBits) {
3767                float_raise(float_flag_underflow, status);
3768            }
3769            if (roundingMode == float_round_to_odd) {
3770                /*
3771                 * For round-to-odd case, the roundIncrement depends on
3772                 * zSig which just changed.
3773                 */
3774                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3775            }
3776        }
3777    }
3778    if (roundBits) {
3779        status->float_exception_flags |= float_flag_inexact;
3780    }
3781    zSig = ( zSig + roundIncrement )>>10;
3782    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3783    if ( zSig == 0 ) zExp = 0;
3784    return packFloat64( zSign, zExp, zSig );
3785
3786}
3787
3788/*----------------------------------------------------------------------------
3789| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3790| and significand `zSig', and returns the proper double-precision floating-
3791| point value corresponding to the abstract input.  This routine is just like
3792| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3793| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3794| floating-point exponent.
3795*----------------------------------------------------------------------------*/
3796
3797static float64
3798 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3799                              float_status *status)
3800{
3801    int8_t shiftCount;
3802
3803    shiftCount = clz64(zSig) - 1;
3804    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3805                               status);
3806
3807}
3808
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left by `clz64(aSig)'
| bits and written to `*zSigPtr'; the matching exponent, 1 minus that shift,
| is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3825
3826/*----------------------------------------------------------------------------
3827| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3828| and extended significand formed by the concatenation of `zSig0' and `zSig1',
3829| and returns the proper extended double-precision floating-point value
3830| corresponding to the abstract input.  Ordinarily, the abstract value is
3831| rounded and packed into the extended double-precision format, with the
3832| inexact exception raised if the abstract input cannot be represented
3833| exactly.  However, if the abstract value is too large, the overflow and
3834| inexact exceptions are raised and an infinity or maximal finite value is
3835| returned.  If the abstract value is too small, the input value is rounded to
3836| a subnormal number, and the underflow and inexact exceptions are raised if
3837| the abstract input cannot be represented exactly as a subnormal extended
3838| double-precision floating-point number.
3839|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3840| number of bits as single or double precision, respectively.  Otherwise, the
3841| result is rounded to the full precision of the extended double-precision
3842| format.
3843|     The input significand must be normalized or smaller.  If the input
3844| significand is not normalized, `zExp' must be 0; in that case, the result
3845| returned is a subnormal number, and it must not require rounding.  The
3846| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3847| Floating-Point Arithmetic.
3848*----------------------------------------------------------------------------*/
3849
3850floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3851                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3852                              float_status *status)
3853{
3854    int8_t roundingMode;
3855    flag roundNearestEven, increment, isTiny;
3856    int64_t roundIncrement, roundMask, roundBits;
3857
3858    roundingMode = status->float_rounding_mode;
3859    roundNearestEven = ( roundingMode == float_round_nearest_even );
3860    if ( roundingPrecision == 80 ) goto precision80;
3861    if ( roundingPrecision == 64 ) {
3862        roundIncrement = LIT64( 0x0000000000000400 );
3863        roundMask = LIT64( 0x00000000000007FF );
3864    }
3865    else if ( roundingPrecision == 32 ) {
3866        roundIncrement = LIT64( 0x0000008000000000 );
3867        roundMask = LIT64( 0x000000FFFFFFFFFF );
3868    }
3869    else {
3870        goto precision80;
3871    }
3872    zSig0 |= ( zSig1 != 0 );
3873    switch (roundingMode) {
3874    case float_round_nearest_even:
3875    case float_round_ties_away:
3876        break;
3877    case float_round_to_zero:
3878        roundIncrement = 0;
3879        break;
3880    case float_round_up:
3881        roundIncrement = zSign ? 0 : roundMask;
3882        break;
3883    case float_round_down:
3884        roundIncrement = zSign ? roundMask : 0;
3885        break;
3886    default:
3887        abort();
3888    }
3889    roundBits = zSig0 & roundMask;
3890    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3891        if (    ( 0x7FFE < zExp )
3892             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3893           ) {
3894            goto overflow;
3895        }
3896        if ( zExp <= 0 ) {
3897            if (status->flush_to_zero) {
3898                float_raise(float_flag_output_denormal, status);
3899                return packFloatx80(zSign, 0, 0);
3900            }
3901            isTiny =
3902                   (status->float_detect_tininess
3903                    == float_tininess_before_rounding)
3904                || ( zExp < 0 )
3905                || ( zSig0 <= zSig0 + roundIncrement );
3906            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3907            zExp = 0;
3908            roundBits = zSig0 & roundMask;
3909            if (isTiny && roundBits) {
3910                float_raise(float_flag_underflow, status);
3911            }
3912            if (roundBits) {
3913                status->float_exception_flags |= float_flag_inexact;
3914            }
3915            zSig0 += roundIncrement;
3916            if ( (int64_t) zSig0 < 0 ) zExp = 1;
3917            roundIncrement = roundMask + 1;
3918            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3919                roundMask |= roundIncrement;
3920            }
3921            zSig0 &= ~ roundMask;
3922            return packFloatx80( zSign, zExp, zSig0 );
3923        }
3924    }
3925    if (roundBits) {
3926        status->float_exception_flags |= float_flag_inexact;
3927    }
3928    zSig0 += roundIncrement;
3929    if ( zSig0 < roundIncrement ) {
3930        ++zExp;
3931        zSig0 = LIT64( 0x8000000000000000 );
3932    }
3933    roundIncrement = roundMask + 1;
3934    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3935        roundMask |= roundIncrement;
3936    }
3937    zSig0 &= ~ roundMask;
3938    if ( zSig0 == 0 ) zExp = 0;
3939    return packFloatx80( zSign, zExp, zSig0 );
3940 precision80:
3941    switch (roundingMode) {
3942    case float_round_nearest_even:
3943    case float_round_ties_away:
3944        increment = ((int64_t)zSig1 < 0);
3945        break;
3946    case float_round_to_zero:
3947        increment = 0;
3948        break;
3949    case float_round_up:
3950        increment = !zSign && zSig1;
3951        break;
3952    case float_round_down:
3953        increment = zSign && zSig1;
3954        break;
3955    default:
3956        abort();
3957    }
3958    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3959        if (    ( 0x7FFE < zExp )
3960             || (    ( zExp == 0x7FFE )
3961                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3962                  && increment
3963                )
3964           ) {
3965            roundMask = 0;
3966 overflow:
3967            float_raise(float_flag_overflow | float_flag_inexact, status);
3968            if (    ( roundingMode == float_round_to_zero )
3969                 || ( zSign && ( roundingMode == float_round_up ) )
3970                 || ( ! zSign && ( roundingMode == float_round_down ) )
3971               ) {
3972                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3973            }
3974            return packFloatx80(zSign,
3975                                floatx80_infinity_high,
3976                                floatx80_infinity_low);
3977        }
3978        if ( zExp <= 0 ) {
3979            isTiny =
3980                   (status->float_detect_tininess
3981                    == float_tininess_before_rounding)
3982                || ( zExp < 0 )
3983                || ! increment
3984                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3985            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3986            zExp = 0;
3987            if (isTiny && zSig1) {
3988                float_raise(float_flag_underflow, status);
3989            }
3990            if (zSig1) {
3991                status->float_exception_flags |= float_flag_inexact;
3992            }
3993            switch (roundingMode) {
3994            case float_round_nearest_even:
3995            case float_round_ties_away:
3996                increment = ((int64_t)zSig1 < 0);
3997                break;
3998            case float_round_to_zero:
3999                increment = 0;
4000                break;
4001            case float_round_up:
4002                increment = !zSign && zSig1;
4003                break;
4004            case float_round_down:
4005                increment = zSign && zSig1;
4006                break;
4007            default:
4008                abort();
4009            }
4010            if ( increment ) {
4011                ++zSig0;
4012                zSig0 &=
4013                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4014                if ( (int64_t) zSig0 < 0 ) zExp = 1;
4015            }
4016            return packFloatx80( zSign, zExp, zSig0 );
4017        }
4018    }
4019    if (zSig1) {
4020        status->float_exception_flags |= float_flag_inexact;
4021    }
4022    if ( increment ) {
4023        ++zSig0;
4024        if ( zSig0 == 0 ) {
4025            ++zExp;
4026            zSig0 = LIT64( 0x8000000000000000 );
4027        }
4028        else {
4029            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4030        }
4031    }
4032    else {
4033        if ( zSig0 == 0 ) zExp = 0;
4034    }
4035    return packFloatx80( zSign, zExp, zSig0 );
4036
4037}
4038
4039/*----------------------------------------------------------------------------
4040| Takes an abstract floating-point value having sign `zSign', exponent
4041| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4042| and returns the proper extended double-precision floating-point value
4043| corresponding to the abstract input.  This routine is just like
4044| `roundAndPackFloatx80' except that the input significand does not have to be
4045| normalized.
4046*----------------------------------------------------------------------------*/
4047
4048floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4049                                       flag zSign, int32_t zExp,
4050                                       uint64_t zSig0, uint64_t zSig1,
4051                                       float_status *status)
4052{
4053    int8_t shiftCount;
4054
4055    if ( zSig0 == 0 ) {
4056        zSig0 = zSig1;
4057        zSig1 = 0;
4058        zExp -= 64;
4059    }
4060    shiftCount = clz64(zSig0);
4061    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4062    zExp -= shiftCount;
4063    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4064                                zSig0, zSig1, status);
4065
4066}
4067
4068/*----------------------------------------------------------------------------
4069| Returns the least-significant 64 fraction bits of the quadruple-precision
4070| floating-point value `a'.
4071*----------------------------------------------------------------------------*/
4072
4073static inline uint64_t extractFloat128Frac1( float128 a )
4074{
4075
4076    return a.low;
4077
4078}
4079
4080/*----------------------------------------------------------------------------
4081| Returns the most-significant 48 fraction bits of the quadruple-precision
4082| floating-point value `a'.
4083*----------------------------------------------------------------------------*/
4084
4085static inline uint64_t extractFloat128Frac0( float128 a )
4086{
4087
4088    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4089
4090}
4091
4092/*----------------------------------------------------------------------------
4093| Returns the exponent bits of the quadruple-precision floating-point value
4094| `a'.
4095*----------------------------------------------------------------------------*/
4096
4097static inline int32_t extractFloat128Exp( float128 a )
4098{
4099
4100    return ( a.high>>48 ) & 0x7FFF;
4101
4102}
4103
4104/*----------------------------------------------------------------------------
4105| Returns the sign bit of the quadruple-precision floating-point value `a'.
4106*----------------------------------------------------------------------------*/
4107
4108static inline flag extractFloat128Sign( float128 a )
4109{
4110
4111    return a.high>>63;
4112
4113}
4114
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value whose
| denormalized significand is the concatenation of `aSig0' and `aSig1'.  The
| normalized exponent is written to `*zExpPtr'.  The most significant 49 bits
| of the normalized significand are written to `*zSig0Ptr', and the least
| significant 64 bits to `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t leftShift;

    if ( aSig0 != 0 ) {
        /* The leading bit is in the high word: one 128-bit left shift. */
        leftShift = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, leftShift, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - leftShift;
        return;
    }

    /* Only the low word is populated. */
    leftShift = clz64(aSig1) - 15;
    if ( 0 <= leftShift ) {
        /* All of `aSig1' fits into the high result word. */
        *zSig0Ptr = aSig1<<leftShift;
        *zSig1Ptr = 0;
    }
    else {
        /* `aSig1' straddles both result words. */
        *zSig0Ptr = aSig1>>( - leftShift );
        *zSig1Ptr = aSig1<<( leftShift & 63 );
    }
    *zExpPtr = - leftShift - 63;
}
4155
4156/*----------------------------------------------------------------------------
4157| Packs the sign `zSign', the exponent `zExp', and the significand formed
4158| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4159| floating-point value, returning the result.  After being shifted into the
4160| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4161| added together to form the most significant 32 bits of the result.  This
4162| means that any integer portion of `zSig0' will be added into the exponent.
4163| Since a properly normalized significand will have an integer portion equal
4164| to 1, the `zExp' input should be 1 less than the desired result exponent
4165| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4166| significand.
4167*----------------------------------------------------------------------------*/
4168
4169static inline float128
4170 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4171{
4172    float128 z;
4173
4174    z.low = zSig1;
4175    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4176    return z;
4177
4178}
4179
4180/*----------------------------------------------------------------------------
4181| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4182| and extended significand formed by the concatenation of `zSig0', `zSig1',
4183| and `zSig2', and returns the proper quadruple-precision floating-point value
4184| corresponding to the abstract input.  Ordinarily, the abstract value is
4185| simply rounded and packed into the quadruple-precision format, with the
4186| inexact exception raised if the abstract input cannot be represented
4187| exactly.  However, if the abstract value is too large, the overflow and
4188| inexact exceptions are raised and an infinity or maximal finite value is
4189| returned.  If the abstract value is too small, the input value is rounded to
4190| a subnormal number, and the underflow and inexact exceptions are raised if
4191| the abstract input cannot be represented exactly as a subnormal quadruple-
4192| precision floating-point number.
4193|     The input significand must be normalized or smaller.  If the input
4194| significand is not normalized, `zExp' must be 0; in that case, the result
4195| returned is a subnormal number, and it must not require rounding.  In the
4196| usual case that the input significand is normalized, `zExp' must be 1 less
4197| than the ``true'' floating-point exponent.  The handling of underflow and
4198| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199*----------------------------------------------------------------------------*/
4200
4201static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4202                                     uint64_t zSig0, uint64_t zSig1,
4203                                     uint64_t zSig2, float_status *status)
4204{
4205    int8_t roundingMode;
4206    flag roundNearestEven, increment, isTiny;
4207
4208    roundingMode = status->float_rounding_mode;
4209    roundNearestEven = ( roundingMode == float_round_nearest_even );
4210    switch (roundingMode) {
4211    case float_round_nearest_even:
4212    case float_round_ties_away:
4213        increment = ((int64_t)zSig2 < 0);
4214        break;
4215    case float_round_to_zero:
4216        increment = 0;
4217        break;
4218    case float_round_up:
4219        increment = !zSign && zSig2;
4220        break;
4221    case float_round_down:
4222        increment = zSign && zSig2;
4223        break;
4224    case float_round_to_odd:
4225        increment = !(zSig1 & 0x1) && zSig2;
4226        break;
4227    default:
4228        abort();
4229    }
4230    if ( 0x7FFD <= (uint32_t) zExp ) {
4231        if (    ( 0x7FFD < zExp )
4232             || (    ( zExp == 0x7FFD )
4233                  && eq128(
4234                         LIT64( 0x0001FFFFFFFFFFFF ),
4235                         LIT64( 0xFFFFFFFFFFFFFFFF ),
4236                         zSig0,
4237                         zSig1
4238                     )
4239                  && increment
4240                )
4241           ) {
4242            float_raise(float_flag_overflow | float_flag_inexact, status);
4243            if (    ( roundingMode == float_round_to_zero )
4244                 || ( zSign && ( roundingMode == float_round_up ) )
4245                 || ( ! zSign && ( roundingMode == float_round_down ) )
4246                 || (roundingMode == float_round_to_odd)
4247               ) {
4248                return
4249                    packFloat128(
4250                        zSign,
4251                        0x7FFE,
4252                        LIT64( 0x0000FFFFFFFFFFFF ),
4253                        LIT64( 0xFFFFFFFFFFFFFFFF )
4254                    );
4255            }
4256            return packFloat128( zSign, 0x7FFF, 0, 0 );
4257        }
4258        if ( zExp < 0 ) {
4259            if (status->flush_to_zero) {
4260                float_raise(float_flag_output_denormal, status);
4261                return packFloat128(zSign, 0, 0, 0);
4262            }
4263            isTiny =
4264                   (status->float_detect_tininess
4265                    == float_tininess_before_rounding)
4266                || ( zExp < -1 )
4267                || ! increment
4268                || lt128(
4269                       zSig0,
4270                       zSig1,
4271                       LIT64( 0x0001FFFFFFFFFFFF ),
4272                       LIT64( 0xFFFFFFFFFFFFFFFF )
4273                   );
4274            shift128ExtraRightJamming(
4275                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4276            zExp = 0;
4277            if (isTiny && zSig2) {
4278                float_raise(float_flag_underflow, status);
4279            }
4280            switch (roundingMode) {
4281            case float_round_nearest_even:
4282            case float_round_ties_away:
4283                increment = ((int64_t)zSig2 < 0);
4284                break;
4285            case float_round_to_zero:
4286                increment = 0;
4287                break;
4288            case float_round_up:
4289                increment = !zSign && zSig2;
4290                break;
4291            case float_round_down:
4292                increment = zSign && zSig2;
4293                break;
4294            case float_round_to_odd:
4295                increment = !(zSig1 & 0x1) && zSig2;
4296                break;
4297            default:
4298                abort();
4299            }
4300        }
4301    }
4302    if (zSig2) {
4303        status->float_exception_flags |= float_flag_inexact;
4304    }
4305    if ( increment ) {
4306        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4307        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4308    }
4309    else {
4310        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4311    }
4312    return packFloat128( zSign, zExp, zSig0, zSig1 );
4313
4314}
4315
4316/*----------------------------------------------------------------------------
4317| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4318| and significand formed by the concatenation of `zSig0' and `zSig1', and
4319| returns the proper quadruple-precision floating-point value corresponding
4320| to the abstract input.  This routine is just like `roundAndPackFloat128'
4321| except that the input significand has fewer bits and does not have to be
4322| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4323| point exponent.
4324*----------------------------------------------------------------------------*/
4325
4326static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4327                                              uint64_t zSig0, uint64_t zSig1,
4328                                              float_status *status)
4329{
4330    int8_t shiftCount;
4331    uint64_t zSig2;
4332
4333    if ( zSig0 == 0 ) {
4334        zSig0 = zSig1;
4335        zSig1 = 0;
4336        zExp -= 64;
4337    }
4338    shiftCount = clz64(zSig0) - 15;
4339    if ( 0 <= shiftCount ) {
4340        zSig2 = 0;
4341        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4342    }
4343    else {
4344        shift128ExtraRightJamming(
4345            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4346    }
4347    zExp -= shiftCount;
4348    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4349
4350}
4351
4352
4353/*----------------------------------------------------------------------------
4354| Returns the result of converting the 32-bit two's complement integer `a'
4355| to the extended double-precision floating-point format.  The conversion
4356| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4357| Arithmetic.
4358*----------------------------------------------------------------------------*/
4359
4360floatx80 int32_to_floatx80(int32_t a, float_status *status)
4361{
4362    flag zSign;
4363    uint32_t absA;
4364    int8_t shiftCount;
4365    uint64_t zSig;
4366
4367    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4368    zSign = ( a < 0 );
4369    absA = zSign ? - a : a;
4370    shiftCount = clz32(absA) + 32;
4371    zSig = absA;
4372    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4373
4374}
4375
4376/*----------------------------------------------------------------------------
4377| Returns the result of converting the 32-bit two's complement integer `a' to
4378| the quadruple-precision floating-point format.  The conversion is performed
4379| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380*----------------------------------------------------------------------------*/
4381
4382float128 int32_to_float128(int32_t a, float_status *status)
4383{
4384    flag zSign;
4385    uint32_t absA;
4386    int8_t shiftCount;
4387    uint64_t zSig0;
4388
4389    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4390    zSign = ( a < 0 );
4391    absA = zSign ? - a : a;
4392    shiftCount = clz32(absA) + 17;
4393    zSig0 = absA;
4394    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4395
4396}
4397
4398/*----------------------------------------------------------------------------
4399| Returns the result of converting the 64-bit two's complement integer `a'
4400| to the extended double-precision floating-point format.  The conversion
4401| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4402| Arithmetic.
4403*----------------------------------------------------------------------------*/
4404
4405floatx80 int64_to_floatx80(int64_t a, float_status *status)
4406{
4407    flag zSign;
4408    uint64_t absA;
4409    int8_t shiftCount;
4410
4411    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4412    zSign = ( a < 0 );
4413    absA = zSign ? - a : a;
4414    shiftCount = clz64(absA);
4415    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4416
4417}
4418
4419/*----------------------------------------------------------------------------
4420| Returns the result of converting the 64-bit two's complement integer `a' to
4421| the quadruple-precision floating-point format.  The conversion is performed
4422| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4423*----------------------------------------------------------------------------*/
4424
4425float128 int64_to_float128(int64_t a, float_status *status)
4426{
4427    flag zSign;
4428    uint64_t absA;
4429    int8_t shiftCount;
4430    int32_t zExp;
4431    uint64_t zSig0, zSig1;
4432
4433    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4434    zSign = ( a < 0 );
4435    absA = zSign ? - a : a;
4436    shiftCount = clz64(absA) + 49;
4437    zExp = 0x406E - shiftCount;
4438    if ( 64 <= shiftCount ) {
4439        zSig1 = 0;
4440        zSig0 = absA;
4441        shiftCount -= 64;
4442    }
4443    else {
4444        zSig1 = absA;
4445        zSig0 = 0;
4446    }
4447    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4448    return packFloat128( zSign, zExp, zSig0, zSig1 );
4449
4450}
4451
4452/*----------------------------------------------------------------------------
4453| Returns the result of converting the 64-bit unsigned integer `a'
4454| to the quadruple-precision floating-point format.  The conversion is performed
4455| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4456*----------------------------------------------------------------------------*/
4457
4458float128 uint64_to_float128(uint64_t a, float_status *status)
4459{
4460    if (a == 0) {
4461        return float128_zero;
4462    }
4463    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4464}
4465
4466/*----------------------------------------------------------------------------
4467| Returns the result of converting the single-precision floating-point value
4468| `a' to the extended double-precision floating-point format.  The conversion
4469| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4470| Arithmetic.
4471*----------------------------------------------------------------------------*/
4472
4473floatx80 float32_to_floatx80(float32 a, float_status *status)
4474{
4475    flag aSign;
4476    int aExp;
4477    uint32_t aSig;
4478
4479    a = float32_squash_input_denormal(a, status);
4480    aSig = extractFloat32Frac( a );
4481    aExp = extractFloat32Exp( a );
4482    aSign = extractFloat32Sign( a );
4483    if ( aExp == 0xFF ) {
4484        if (aSig) {
4485            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4486        }
4487        return packFloatx80(aSign,
4488                            floatx80_infinity_high,
4489                            floatx80_infinity_low);
4490    }
4491    if ( aExp == 0 ) {
4492        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4493        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4494    }
4495    aSig |= 0x00800000;
4496    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4497
4498}
4499
4500/*----------------------------------------------------------------------------
4501| Returns the result of converting the single-precision floating-point value
4502| `a' to the double-precision floating-point format.  The conversion is
4503| performed according to the IEC/IEEE Standard for Binary Floating-Point
4504| Arithmetic.
4505*----------------------------------------------------------------------------*/
4506
4507float128 float32_to_float128(float32 a, float_status *status)
4508{
4509    flag aSign;
4510    int aExp;
4511    uint32_t aSig;
4512
4513    a = float32_squash_input_denormal(a, status);
4514    aSig = extractFloat32Frac( a );
4515    aExp = extractFloat32Exp( a );
4516    aSign = extractFloat32Sign( a );
4517    if ( aExp == 0xFF ) {
4518        if (aSig) {
4519            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4520        }
4521        return packFloat128( aSign, 0x7FFF, 0, 0 );
4522    }
4523    if ( aExp == 0 ) {
4524        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4525        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4526        --aExp;
4527    }
4528    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4529
4530}
4531
4532/*----------------------------------------------------------------------------
4533| Returns the remainder of the single-precision floating-point value `a'
4534| with respect to the corresponding value `b'.  The operation is performed
4535| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4536*----------------------------------------------------------------------------*/
4537
4538float32 float32_rem(float32 a, float32 b, float_status *status)
4539{
4540    flag aSign, zSign;
4541    int aExp, bExp, expDiff;
4542    uint32_t aSig, bSig;
4543    uint32_t q;
4544    uint64_t aSig64, bSig64, q64;
4545    uint32_t alternateASig;
4546    int32_t sigMean;
4547    a = float32_squash_input_denormal(a, status);
4548    b = float32_squash_input_denormal(b, status);
4549
4550    aSig = extractFloat32Frac( a );
4551    aExp = extractFloat32Exp( a );
4552    aSign = extractFloat32Sign( a );
4553    bSig = extractFloat32Frac( b );
4554    bExp = extractFloat32Exp( b );
4555    if ( aExp == 0xFF ) {
4556        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4557            return propagateFloat32NaN(a, b, status);
4558        }
4559        float_raise(float_flag_invalid, status);
4560        return float32_default_nan(status);
4561    }
4562    if ( bExp == 0xFF ) {
4563        if (bSig) {
4564            return propagateFloat32NaN(a, b, status);
4565        }
4566        return a;
4567    }
4568    if ( bExp == 0 ) {
4569        if ( bSig == 0 ) {
4570            float_raise(float_flag_invalid, status);
4571            return float32_default_nan(status);
4572        }
4573        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4574    }
4575    if ( aExp == 0 ) {
4576        if ( aSig == 0 ) return a;
4577        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4578    }
4579    expDiff = aExp - bExp;
4580    aSig |= 0x00800000;
4581    bSig |= 0x00800000;
4582    if ( expDiff < 32 ) {
4583        aSig <<= 8;
4584        bSig <<= 8;
4585        if ( expDiff < 0 ) {
4586            if ( expDiff < -1 ) return a;
4587            aSig >>= 1;
4588        }
4589        q = ( bSig <= aSig );
4590        if ( q ) aSig -= bSig;
4591        if ( 0 < expDiff ) {
4592            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4593            q >>= 32 - expDiff;
4594            bSig >>= 2;
4595            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4596        }
4597        else {
4598            aSig >>= 2;
4599            bSig >>= 2;
4600        }
4601    }
4602    else {
4603        if ( bSig <= aSig ) aSig -= bSig;
4604        aSig64 = ( (uint64_t) aSig )<<40;
4605        bSig64 = ( (uint64_t) bSig )<<40;
4606        expDiff -= 64;
4607        while ( 0 < expDiff ) {
4608            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4609            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4610            aSig64 = - ( ( bSig * q64 )<<38 );
4611            expDiff -= 62;
4612        }
4613        expDiff += 64;
4614        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4615        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4616        q = q64>>( 64 - expDiff );
4617        bSig <<= 6;
4618        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4619    }
4620    do {
4621        alternateASig = aSig;
4622        ++q;
4623        aSig -= bSig;
4624    } while ( 0 <= (int32_t) aSig );
4625    sigMean = aSig + alternateASig;
4626    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4627        aSig = alternateASig;
4628    }
4629    zSign = ( (int32_t) aSig < 0 );
4630    if ( zSign ) aSig = - aSig;
4631    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4632}
4633
4634
4635
4636/*----------------------------------------------------------------------------
4637| Returns the binary exponential of the single-precision floating-point value
4638| `a'. The operation is performed according to the IEC/IEEE Standard for
4639| Binary Floating-Point Arithmetic.
4640|
4641| Uses the following identities:
4642|
4643| 1. -------------------------------------------------------------------------
4644|      x    x*ln(2)
4645|     2  = e
4646|
4647| 2. -------------------------------------------------------------------------
4648|                      2     3     4     5           n
4649|      x        x     x     x     x     x           x
4650|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4651|               1!    2!    3!    4!    5!          n!
4652*----------------------------------------------------------------------------*/
4653
4654static const float64 float32_exp2_coefficients[15] =
4655{
4656    const_float64( 0x3ff0000000000000ll ), /*  1 */
4657    const_float64( 0x3fe0000000000000ll ), /*  2 */
4658    const_float64( 0x3fc5555555555555ll ), /*  3 */
4659    const_float64( 0x3fa5555555555555ll ), /*  4 */
4660    const_float64( 0x3f81111111111111ll ), /*  5 */
4661    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4662    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4663    const_float64( 0x3efa01a01a01a01all ), /*  8 */
4664    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4665    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4666    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4667    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4668    const_float64( 0x3de6124613a86d09ll ), /* 13 */
4669    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4670    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4671};
4672
4673float32 float32_exp2(float32 a, float_status *status)
4674{
4675    flag aSign;
4676    int aExp;
4677    uint32_t aSig;
4678    float64 r, x, xn;
4679    int i;
4680    a = float32_squash_input_denormal(a, status);
4681
4682    aSig = extractFloat32Frac( a );
4683    aExp = extractFloat32Exp( a );
4684    aSign = extractFloat32Sign( a );
4685
4686    if ( aExp == 0xFF) {
4687        if (aSig) {
4688            return propagateFloat32NaN(a, float32_zero, status);
4689        }
4690        return (aSign) ? float32_zero : a;
4691    }
4692    if (aExp == 0) {
4693        if (aSig == 0) return float32_one;
4694    }
4695
4696    float_raise(float_flag_inexact, status);
4697
4698    /* ******************************* */
4699    /* using float64 for approximation */
4700    /* ******************************* */
4701    x = float32_to_float64(a, status);
4702    x = float64_mul(x, float64_ln2, status);
4703
4704    xn = x;
4705    r = float64_one;
4706    for (i = 0 ; i < 15 ; i++) {
4707        float64 f;
4708
4709        f = float64_mul(xn, float32_exp2_coefficients[i], status);
4710        r = float64_add(r, f, status);
4711
4712        xn = float64_mul(xn, x, status);
4713    }
4714
4715    return float64_to_float32(r, status);
4716}
4717
4718/*----------------------------------------------------------------------------
4719| Returns the binary log of the single-precision floating-point value `a'.
4720| The operation is performed according to the IEC/IEEE Standard for Binary
4721| Floating-Point Arithmetic.
4722*----------------------------------------------------------------------------*/
4723float32 float32_log2(float32 a, float_status *status)
4724{
4725    flag aSign, zSign;
4726    int aExp;
4727    uint32_t aSig, zSig, i;
4728
4729    a = float32_squash_input_denormal(a, status);
4730    aSig = extractFloat32Frac( a );
4731    aExp = extractFloat32Exp( a );
4732    aSign = extractFloat32Sign( a );
4733
4734    if ( aExp == 0 ) {
4735        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4736        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4737    }
4738    if ( aSign ) {
4739        float_raise(float_flag_invalid, status);
4740        return float32_default_nan(status);
4741    }
4742    if ( aExp == 0xFF ) {
4743        if (aSig) {
4744            return propagateFloat32NaN(a, float32_zero, status);
4745        }
4746        return a;
4747    }
4748
4749    aExp -= 0x7F;
4750    aSig |= 0x00800000;
4751    zSign = aExp < 0;
4752    zSig = aExp << 23;
4753
4754    for (i = 1 << 22; i > 0; i >>= 1) {
4755        aSig = ( (uint64_t)aSig * aSig ) >> 23;
4756        if ( aSig & 0x01000000 ) {
4757            aSig >>= 1;
4758            zSig |= i;
4759        }
4760    }
4761
4762    if ( zSign )
4763        zSig = -zSig;
4764
4765    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4766}
4767
4768/*----------------------------------------------------------------------------
4769| Returns 1 if the single-precision floating-point value `a' is equal to
4770| the corresponding value `b', and 0 otherwise.  The invalid exception is
4771| raised if either operand is a NaN.  Otherwise, the comparison is performed
4772| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4773*----------------------------------------------------------------------------*/
4774
4775int float32_eq(float32 a, float32 b, float_status *status)
4776{
4777    uint32_t av, bv;
4778    a = float32_squash_input_denormal(a, status);
4779    b = float32_squash_input_denormal(b, status);
4780
4781    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4782         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4783       ) {
4784        float_raise(float_flag_invalid, status);
4785        return 0;
4786    }
4787    av = float32_val(a);
4788    bv = float32_val(b);
4789    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4790}
4791
4792/*----------------------------------------------------------------------------
4793| Returns 1 if the single-precision floating-point value `a' is less than
4794| or equal to the corresponding value `b', and 0 otherwise.  The invalid
4795| exception is raised if either operand is a NaN.  The comparison is performed
4796| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797*----------------------------------------------------------------------------*/
4798
4799int float32_le(float32 a, float32 b, float_status *status)
4800{
4801    flag aSign, bSign;
4802    uint32_t av, bv;
4803    a = float32_squash_input_denormal(a, status);
4804    b = float32_squash_input_denormal(b, status);
4805
4806    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4807         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4808       ) {
4809        float_raise(float_flag_invalid, status);
4810        return 0;
4811    }
4812    aSign = extractFloat32Sign( a );
4813    bSign = extractFloat32Sign( b );
4814    av = float32_val(a);
4815    bv = float32_val(b);
4816    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4817    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4818
4819}
4820
4821/*----------------------------------------------------------------------------
4822| Returns 1 if the single-precision floating-point value `a' is less than
4823| the corresponding value `b', and 0 otherwise.  The invalid exception is
4824| raised if either operand is a NaN.  The comparison is performed according
4825| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826*----------------------------------------------------------------------------*/
4827
4828int float32_lt(float32 a, float32 b, float_status *status)
4829{
4830    flag aSign, bSign;
4831    uint32_t av, bv;
4832    a = float32_squash_input_denormal(a, status);
4833    b = float32_squash_input_denormal(b, status);
4834
4835    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4836         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4837       ) {
4838        float_raise(float_flag_invalid, status);
4839        return 0;
4840    }
4841    aSign = extractFloat32Sign( a );
4842    bSign = extractFloat32Sign( b );
4843    av = float32_val(a);
4844    bv = float32_val(b);
4845    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4846    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4847
4848}
4849
4850/*----------------------------------------------------------------------------
4851| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4852| be compared, and 0 otherwise.  The invalid exception is raised if either
4853| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4854| Standard for Binary Floating-Point Arithmetic.
4855*----------------------------------------------------------------------------*/
4856
4857int float32_unordered(float32 a, float32 b, float_status *status)
4858{
4859    a = float32_squash_input_denormal(a, status);
4860    b = float32_squash_input_denormal(b, status);
4861
4862    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4863         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4864       ) {
4865        float_raise(float_flag_invalid, status);
4866        return 1;
4867    }
4868    return 0;
4869}
4870
4871/*----------------------------------------------------------------------------
4872| Returns 1 if the single-precision floating-point value `a' is equal to
4873| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4874| exception.  The comparison is performed according to the IEC/IEEE Standard
4875| for Binary Floating-Point Arithmetic.
4876*----------------------------------------------------------------------------*/
4877
4878int float32_eq_quiet(float32 a, float32 b, float_status *status)
4879{
4880    a = float32_squash_input_denormal(a, status);
4881    b = float32_squash_input_denormal(b, status);
4882
4883    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4884         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4885       ) {
4886        if (float32_is_signaling_nan(a, status)
4887         || float32_is_signaling_nan(b, status)) {
4888            float_raise(float_flag_invalid, status);
4889        }
4890        return 0;
4891    }
4892    return ( float32_val(a) == float32_val(b) ) ||
4893            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4894}
4895
4896/*----------------------------------------------------------------------------
4897| Returns 1 if the single-precision floating-point value `a' is less than or
4898| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4899| cause an exception.  Otherwise, the comparison is performed according to the
4900| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901*----------------------------------------------------------------------------*/
4902
4903int float32_le_quiet(float32 a, float32 b, float_status *status)
4904{
4905    flag aSign, bSign;
4906    uint32_t av, bv;
4907    a = float32_squash_input_denormal(a, status);
4908    b = float32_squash_input_denormal(b, status);
4909
4910    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4911         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4912       ) {
4913        if (float32_is_signaling_nan(a, status)
4914         || float32_is_signaling_nan(b, status)) {
4915            float_raise(float_flag_invalid, status);
4916        }
4917        return 0;
4918    }
4919    aSign = extractFloat32Sign( a );
4920    bSign = extractFloat32Sign( b );
4921    av = float32_val(a);
4922    bv = float32_val(b);
4923    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4924    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4925
4926}
4927
4928/*----------------------------------------------------------------------------
4929| Returns 1 if the single-precision floating-point value `a' is less than
4930| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4931| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4932| Standard for Binary Floating-Point Arithmetic.
4933*----------------------------------------------------------------------------*/
4934
4935int float32_lt_quiet(float32 a, float32 b, float_status *status)
4936{
4937    flag aSign, bSign;
4938    uint32_t av, bv;
4939    a = float32_squash_input_denormal(a, status);
4940    b = float32_squash_input_denormal(b, status);
4941
4942    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4943         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4944       ) {
4945        if (float32_is_signaling_nan(a, status)
4946         || float32_is_signaling_nan(b, status)) {
4947            float_raise(float_flag_invalid, status);
4948        }
4949        return 0;
4950    }
4951    aSign = extractFloat32Sign( a );
4952    bSign = extractFloat32Sign( b );
4953    av = float32_val(a);
4954    bv = float32_val(b);
4955    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4956    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4957
4958}
4959
4960/*----------------------------------------------------------------------------
4961| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4962| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4963| comparison is performed according to the IEC/IEEE Standard for Binary
4964| Floating-Point Arithmetic.
4965*----------------------------------------------------------------------------*/
4966
4967int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4968{
4969    a = float32_squash_input_denormal(a, status);
4970    b = float32_squash_input_denormal(b, status);
4971
4972    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4973         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4974       ) {
4975        if (float32_is_signaling_nan(a, status)
4976         || float32_is_signaling_nan(b, status)) {
4977            float_raise(float_flag_invalid, status);
4978        }
4979        return 1;
4980    }
4981    return 0;
4982}
4983
4984/*----------------------------------------------------------------------------
4985| If `a' is denormal and we are in flush-to-zero mode then set the
4986| input-denormal exception and return zero. Otherwise just return the value.
4987*----------------------------------------------------------------------------*/
4988float16 float16_squash_input_denormal(float16 a, float_status *status)
4989{
4990    if (status->flush_inputs_to_zero) {
4991        if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4992            float_raise(float_flag_input_denormal, status);
4993            return make_float16(float16_val(a) & 0x8000);
4994        }
4995    }
4996    return a;
4997}
4998
4999/*----------------------------------------------------------------------------
5000| Returns the result of converting the double-precision floating-point value
5001| `a' to the extended double-precision floating-point format.  The conversion
5002| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5003| Arithmetic.
5004*----------------------------------------------------------------------------*/
5005
5006floatx80 float64_to_floatx80(float64 a, float_status *status)
5007{
5008    flag aSign;
5009    int aExp;
5010    uint64_t aSig;
5011
5012    a = float64_squash_input_denormal(a, status);
5013    aSig = extractFloat64Frac( a );
5014    aExp = extractFloat64Exp( a );
5015    aSign = extractFloat64Sign( a );
5016    if ( aExp == 0x7FF ) {
5017        if (aSig) {
5018            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5019        }
5020        return packFloatx80(aSign,
5021                            floatx80_infinity_high,
5022                            floatx80_infinity_low);
5023    }
5024    if ( aExp == 0 ) {
5025        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5026        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5027    }
5028    return
5029        packFloatx80(
5030            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
5031
5032}
5033
5034/*----------------------------------------------------------------------------
5035| Returns the result of converting the double-precision floating-point value
5036| `a' to the quadruple-precision floating-point format.  The conversion is
5037| performed according to the IEC/IEEE Standard for Binary Floating-Point
5038| Arithmetic.
5039*----------------------------------------------------------------------------*/
5040
5041float128 float64_to_float128(float64 a, float_status *status)
5042{
5043    flag aSign;
5044    int aExp;
5045    uint64_t aSig, zSig0, zSig1;
5046
5047    a = float64_squash_input_denormal(a, status);
5048    aSig = extractFloat64Frac( a );
5049    aExp = extractFloat64Exp( a );
5050    aSign = extractFloat64Sign( a );
5051    if ( aExp == 0x7FF ) {
5052        if (aSig) {
5053            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5054        }
5055        return packFloat128( aSign, 0x7FFF, 0, 0 );
5056    }
5057    if ( aExp == 0 ) {
5058        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5059        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5060        --aExp;
5061    }
5062    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5063    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5064
5065}
5066
5067
5068/*----------------------------------------------------------------------------
5069| Returns the remainder of the double-precision floating-point value `a'
5070| with respect to the corresponding value `b'.  The operation is performed
5071| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5072*----------------------------------------------------------------------------*/
5073
5074float64 float64_rem(float64 a, float64 b, float_status *status)
5075{
5076    flag aSign, zSign;
5077    int aExp, bExp, expDiff;
5078    uint64_t aSig, bSig;
5079    uint64_t q, alternateASig;
5080    int64_t sigMean;
5081
5082    a = float64_squash_input_denormal(a, status);
5083    b = float64_squash_input_denormal(b, status);
5084    aSig = extractFloat64Frac( a );
5085    aExp = extractFloat64Exp( a );
5086    aSign = extractFloat64Sign( a );
5087    bSig = extractFloat64Frac( b );
5088    bExp = extractFloat64Exp( b );
5089    if ( aExp == 0x7FF ) {
5090        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5091            return propagateFloat64NaN(a, b, status);
5092        }
5093        float_raise(float_flag_invalid, status);
5094        return float64_default_nan(status);
5095    }
5096    if ( bExp == 0x7FF ) {
5097        if (bSig) {
5098            return propagateFloat64NaN(a, b, status);
5099        }
5100        return a;
5101    }
5102    if ( bExp == 0 ) {
5103        if ( bSig == 0 ) {
5104            float_raise(float_flag_invalid, status);
5105            return float64_default_nan(status);
5106        }
5107        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5108    }
5109    if ( aExp == 0 ) {
5110        if ( aSig == 0 ) return a;
5111        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5112    }
5113    expDiff = aExp - bExp;
5114    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5115    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5116    if ( expDiff < 0 ) {
5117        if ( expDiff < -1 ) return a;
5118        aSig >>= 1;
5119    }
5120    q = ( bSig <= aSig );
5121    if ( q ) aSig -= bSig;
5122    expDiff -= 64;
5123    while ( 0 < expDiff ) {
5124        q = estimateDiv128To64( aSig, 0, bSig );
5125        q = ( 2 < q ) ? q - 2 : 0;
5126        aSig = - ( ( bSig>>2 ) * q );
5127        expDiff -= 62;
5128    }
5129    expDiff += 64;
5130    if ( 0 < expDiff ) {
5131        q = estimateDiv128To64( aSig, 0, bSig );
5132        q = ( 2 < q ) ? q - 2 : 0;
5133        q >>= 64 - expDiff;
5134        bSig >>= 2;
5135        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5136    }
5137    else {
5138        aSig >>= 2;
5139        bSig >>= 2;
5140    }
5141    do {
5142        alternateASig = aSig;
5143        ++q;
5144        aSig -= bSig;
5145    } while ( 0 <= (int64_t) aSig );
5146    sigMean = aSig + alternateASig;
5147    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5148        aSig = alternateASig;
5149    }
5150    zSign = ( (int64_t) aSig < 0 );
5151    if ( zSign ) aSig = - aSig;
5152    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5153
5154}
5155
5156/*----------------------------------------------------------------------------
5157| Returns the binary log of the double-precision floating-point value `a'.
5158| The operation is performed according to the IEC/IEEE Standard for Binary
5159| Floating-Point Arithmetic.
5160*----------------------------------------------------------------------------*/
5161float64 float64_log2(float64 a, float_status *status)
5162{
5163    flag aSign, zSign;
5164    int aExp;
5165    uint64_t aSig, aSig0, aSig1, zSig, i;
5166    a = float64_squash_input_denormal(a, status);
5167
5168    aSig = extractFloat64Frac( a );
5169    aExp = extractFloat64Exp( a );
5170    aSign = extractFloat64Sign( a );
5171
5172    if ( aExp == 0 ) {
5173        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5174        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5175    }
5176    if ( aSign ) {
5177        float_raise(float_flag_invalid, status);
5178        return float64_default_nan(status);
5179    }
5180    if ( aExp == 0x7FF ) {
5181        if (aSig) {
5182            return propagateFloat64NaN(a, float64_zero, status);
5183        }
5184        return a;
5185    }
5186
5187    aExp -= 0x3FF;
5188    aSig |= LIT64( 0x0010000000000000 );
5189    zSign = aExp < 0;
5190    zSig = (uint64_t)aExp << 52;
5191    for (i = 1LL << 51; i > 0; i >>= 1) {
5192        mul64To128( aSig, aSig, &aSig0, &aSig1 );
5193        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5194        if ( aSig & LIT64( 0x0020000000000000 ) ) {
5195            aSig >>= 1;
5196            zSig |= i;
5197        }
5198    }
5199
5200    if ( zSign )
5201        zSig = -zSig;
5202    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5203}
5204
5205/*----------------------------------------------------------------------------
5206| Returns 1 if the double-precision floating-point value `a' is equal to the
5207| corresponding value `b', and 0 otherwise.  The invalid exception is raised
5208| if either operand is a NaN.  Otherwise, the comparison is performed
5209| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5210*----------------------------------------------------------------------------*/
5211
5212int float64_eq(float64 a, float64 b, float_status *status)
5213{
5214    uint64_t av, bv;
5215    a = float64_squash_input_denormal(a, status);
5216    b = float64_squash_input_denormal(b, status);
5217
5218    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5219         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5220       ) {
5221        float_raise(float_flag_invalid, status);
5222        return 0;
5223    }
5224    av = float64_val(a);
5225    bv = float64_val(b);
5226    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5227
5228}
5229
5230/*----------------------------------------------------------------------------
5231| Returns 1 if the double-precision floating-point value `a' is less than or
5232| equal to the corresponding value `b', and 0 otherwise.  The invalid
5233| exception is raised if either operand is a NaN.  The comparison is performed
5234| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5235*----------------------------------------------------------------------------*/
5236
5237int float64_le(float64 a, float64 b, float_status *status)
5238{
5239    flag aSign, bSign;
5240    uint64_t av, bv;
5241    a = float64_squash_input_denormal(a, status);
5242    b = float64_squash_input_denormal(b, status);
5243
5244    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5245         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5246       ) {
5247        float_raise(float_flag_invalid, status);
5248        return 0;
5249    }
5250    aSign = extractFloat64Sign( a );
5251    bSign = extractFloat64Sign( b );
5252    av = float64_val(a);
5253    bv = float64_val(b);
5254    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5255    return ( av == bv ) || ( aSign ^ ( av < bv ) );
5256
5257}
5258
5259/*----------------------------------------------------------------------------
5260| Returns 1 if the double-precision floating-point value `a' is less than
5261| the corresponding value `b', and 0 otherwise.  The invalid exception is
5262| raised if either operand is a NaN.  The comparison is performed according
5263| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5264*----------------------------------------------------------------------------*/
5265
5266int float64_lt(float64 a, float64 b, float_status *status)
5267{
5268    flag aSign, bSign;
5269    uint64_t av, bv;
5270
5271    a = float64_squash_input_denormal(a, status);
5272    b = float64_squash_input_denormal(b, status);
5273    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5274         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5275       ) {
5276        float_raise(float_flag_invalid, status);
5277        return 0;
5278    }
5279    aSign = extractFloat64Sign( a );
5280    bSign = extractFloat64Sign( b );
5281    av = float64_val(a);
5282    bv = float64_val(b);
5283    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5284    return ( av != bv ) && ( aSign ^ ( av < bv ) );
5285
5286}
5287
5288/*----------------------------------------------------------------------------
5289| Returns 1 if the double-precision floating-point values `a' and `b' cannot
5290| be compared, and 0 otherwise.  The invalid exception is raised if either
5291| operand is a NaN.  The comparison is performed according to the IEC/IEEE
5292| Standard for Binary Floating-Point Arithmetic.
5293*----------------------------------------------------------------------------*/
5294
5295int float64_unordered(float64 a, float64 b, float_status *status)
5296{
5297    a = float64_squash_input_denormal(a, status);
5298    b = float64_squash_input_denormal(b, status);
5299
5300    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5301         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5302       ) {
5303        float_raise(float_flag_invalid, status);
5304        return 1;
5305    }
5306    return 0;
5307}
5308
5309/*----------------------------------------------------------------------------
5310| Returns 1 if the double-precision floating-point value `a' is equal to the
5311| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5312| exception.The comparison is performed according to the IEC/IEEE Standard
5313| for Binary Floating-Point Arithmetic.
5314*----------------------------------------------------------------------------*/
5315
5316int float64_eq_quiet(float64 a, float64 b, float_status *status)
5317{
5318    uint64_t av, bv;
5319    a = float64_squash_input_denormal(a, status);
5320    b = float64_squash_input_denormal(b, status);
5321
5322    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5323         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5324       ) {
5325        if (float64_is_signaling_nan(a, status)
5326         || float64_is_signaling_nan(b, status)) {
5327            float_raise(float_flag_invalid, status);
5328        }
5329        return 0;
5330    }
5331    av = float64_val(a);
5332    bv = float64_val(b);
5333    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5334
5335}
5336
5337/*----------------------------------------------------------------------------
5338| Returns 1 if the double-precision floating-point value `a' is less than or
5339| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5340| cause an exception.  Otherwise, the comparison is performed according to the
5341| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342*----------------------------------------------------------------------------*/
5343
5344int float64_le_quiet(float64 a, float64 b, float_status *status)
5345{
5346    flag aSign, bSign;
5347    uint64_t av, bv;
5348    a = float64_squash_input_denormal(a, status);
5349    b = float64_squash_input_denormal(b, status);
5350
5351    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5352         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5353       ) {
5354        if (float64_is_signaling_nan(a, status)
5355         || float64_is_signaling_nan(b, status)) {
5356            float_raise(float_flag_invalid, status);
5357        }
5358        return 0;
5359    }
5360    aSign = extractFloat64Sign( a );
5361    bSign = extractFloat64Sign( b );
5362    av = float64_val(a);
5363    bv = float64_val(b);
5364    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5365    return ( av == bv ) || ( aSign ^ ( av < bv ) );
5366
5367}
5368
5369/*----------------------------------------------------------------------------
5370| Returns 1 if the double-precision floating-point value `a' is less than
5371| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5372| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5373| Standard for Binary Floating-Point Arithmetic.
5374*----------------------------------------------------------------------------*/
5375
5376int float64_lt_quiet(float64 a, float64 b, float_status *status)
5377{
5378    flag aSign, bSign;
5379    uint64_t av, bv;
5380    a = float64_squash_input_denormal(a, status);
5381    b = float64_squash_input_denormal(b, status);
5382
5383    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5384         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5385       ) {
5386        if (float64_is_signaling_nan(a, status)
5387         || float64_is_signaling_nan(b, status)) {
5388            float_raise(float_flag_invalid, status);
5389        }
5390        return 0;
5391    }
5392    aSign = extractFloat64Sign( a );
5393    bSign = extractFloat64Sign( b );
5394    av = float64_val(a);
5395    bv = float64_val(b);
5396    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5397    return ( av != bv ) && ( aSign ^ ( av < bv ) );
5398
5399}
5400
5401/*----------------------------------------------------------------------------
5402| Returns 1 if the double-precision floating-point values `a' and `b' cannot
5403| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5404| comparison is performed according to the IEC/IEEE Standard for Binary
5405| Floating-Point Arithmetic.
5406*----------------------------------------------------------------------------*/
5407
5408int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5409{
5410    a = float64_squash_input_denormal(a, status);
5411    b = float64_squash_input_denormal(b, status);
5412
5413    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5414         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5415       ) {
5416        if (float64_is_signaling_nan(a, status)
5417         || float64_is_signaling_nan(b, status)) {
5418            float_raise(float_flag_invalid, status);
5419        }
5420        return 1;
5421    }
5422    return 0;
5423}
5424
5425/*----------------------------------------------------------------------------
5426| Returns the result of converting the extended double-precision floating-
5427| point value `a' to the 32-bit two's complement integer format.  The
5428| conversion is performed according to the IEC/IEEE Standard for Binary
5429| Floating-Point Arithmetic---which means in particular that the conversion
5430| is rounded according to the current rounding mode.  If `a' is a NaN, the
5431| largest positive integer is returned.  Otherwise, if the conversion
5432| overflows, the largest integer with the same sign as `a' is returned.
5433*----------------------------------------------------------------------------*/
5434
5435int32_t floatx80_to_int32(floatx80 a, float_status *status)
5436{
5437    flag aSign;
5438    int32_t aExp, shiftCount;
5439    uint64_t aSig;
5440
5441    if (floatx80_invalid_encoding(a)) {
5442        float_raise(float_flag_invalid, status);
5443        return 1 << 31;
5444    }
5445    aSig = extractFloatx80Frac( a );
5446    aExp = extractFloatx80Exp( a );
5447    aSign = extractFloatx80Sign( a );
5448    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5449    shiftCount = 0x4037 - aExp;
5450    if ( shiftCount <= 0 ) shiftCount = 1;
5451    shift64RightJamming( aSig, shiftCount, &aSig );
5452    return roundAndPackInt32(aSign, aSig, status);
5453
5454}
5455
5456/*----------------------------------------------------------------------------
5457| Returns the result of converting the extended double-precision floating-
5458| point value `a' to the 32-bit two's complement integer format.  The
5459| conversion is performed according to the IEC/IEEE Standard for Binary
5460| Floating-Point Arithmetic, except that the conversion is always rounded
5461| toward zero.  If `a' is a NaN, the largest positive integer is returned.
5462| Otherwise, if the conversion overflows, the largest integer with the same
5463| sign as `a' is returned.
5464*----------------------------------------------------------------------------*/
5465
5466int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5467{
5468    flag aSign;
5469    int32_t aExp, shiftCount;
5470    uint64_t aSig, savedASig;
5471    int32_t z;
5472
5473    if (floatx80_invalid_encoding(a)) {
5474        float_raise(float_flag_invalid, status);
5475        return 1 << 31;
5476    }
5477    aSig = extractFloatx80Frac( a );
5478    aExp = extractFloatx80Exp( a );
5479    aSign = extractFloatx80Sign( a );
5480    if ( 0x401E < aExp ) {
5481        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5482        goto invalid;
5483    }
5484    else if ( aExp < 0x3FFF ) {
5485        if (aExp || aSig) {
5486            status->float_exception_flags |= float_flag_inexact;
5487        }
5488        return 0;
5489    }
5490    shiftCount = 0x403E - aExp;
5491    savedASig = aSig;
5492    aSig >>= shiftCount;
5493    z = aSig;
5494    if ( aSign ) z = - z;
5495    if ( ( z < 0 ) ^ aSign ) {
5496 invalid:
5497        float_raise(float_flag_invalid, status);
5498        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5499    }
5500    if ( ( aSig<<shiftCount ) != savedASig ) {
5501        status->float_exception_flags |= float_flag_inexact;
5502    }
5503    return z;
5504
5505}
5506
5507/*----------------------------------------------------------------------------
5508| Returns the result of converting the extended double-precision floating-
5509| point value `a' to the 64-bit two's complement integer format.  The
5510| conversion is performed according to the IEC/IEEE Standard for Binary
5511| Floating-Point Arithmetic---which means in particular that the conversion
5512| is rounded according to the current rounding mode.  If `a' is a NaN,
5513| the largest positive integer is returned.  Otherwise, if the conversion
5514| overflows, the largest integer with the same sign as `a' is returned.
5515*----------------------------------------------------------------------------*/
5516
5517int64_t floatx80_to_int64(floatx80 a, float_status *status)
5518{
5519    flag aSign;
5520    int32_t aExp, shiftCount;
5521    uint64_t aSig, aSigExtra;
5522
5523    if (floatx80_invalid_encoding(a)) {
5524        float_raise(float_flag_invalid, status);
5525        return 1ULL << 63;
5526    }
5527    aSig = extractFloatx80Frac( a );
5528    aExp = extractFloatx80Exp( a );
5529    aSign = extractFloatx80Sign( a );
5530    shiftCount = 0x403E - aExp;
5531    if ( shiftCount <= 0 ) {
5532        if ( shiftCount ) {
5533            float_raise(float_flag_invalid, status);
5534            if (!aSign || floatx80_is_any_nan(a)) {
5535                return LIT64( 0x7FFFFFFFFFFFFFFF );
5536            }
5537            return (int64_t) LIT64( 0x8000000000000000 );
5538        }
5539        aSigExtra = 0;
5540    }
5541    else {
5542        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5543    }
5544    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5545
5546}
5547
5548/*----------------------------------------------------------------------------
5549| Returns the result of converting the extended double-precision floating-
5550| point value `a' to the 64-bit two's complement integer format.  The
5551| conversion is performed according to the IEC/IEEE Standard for Binary
5552| Floating-Point Arithmetic, except that the conversion is always rounded
5553| toward zero.  If `a' is a NaN, the largest positive integer is returned.
5554| Otherwise, if the conversion overflows, the largest integer with the same
5555| sign as `a' is returned.
5556*----------------------------------------------------------------------------*/
5557
5558int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5559{
5560    flag aSign;
5561    int32_t aExp, shiftCount;
5562    uint64_t aSig;
5563    int64_t z;
5564
5565    if (floatx80_invalid_encoding(a)) {
5566        float_raise(float_flag_invalid, status);
5567        return 1ULL << 63;
5568    }
5569    aSig = extractFloatx80Frac( a );
5570    aExp = extractFloatx80Exp( a );
5571    aSign = extractFloatx80Sign( a );
5572    shiftCount = aExp - 0x403E;
5573    if ( 0 <= shiftCount ) {
5574        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5575        if ( ( a.high != 0xC03E ) || aSig ) {
5576            float_raise(float_flag_invalid, status);
5577            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5578                return LIT64( 0x7FFFFFFFFFFFFFFF );
5579            }
5580        }
5581        return (int64_t) LIT64( 0x8000000000000000 );
5582    }
5583    else if ( aExp < 0x3FFF ) {
5584        if (aExp | aSig) {
5585            status->float_exception_flags |= float_flag_inexact;
5586        }
5587        return 0;
5588    }
5589    z = aSig>>( - shiftCount );
5590    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5591        status->float_exception_flags |= float_flag_inexact;
5592    }
5593    if ( aSign ) z = - z;
5594    return z;
5595
5596}
5597
5598/*----------------------------------------------------------------------------
5599| Returns the result of converting the extended double-precision floating-
5600| point value `a' to the single-precision floating-point format.  The
5601| conversion is performed according to the IEC/IEEE Standard for Binary
5602| Floating-Point Arithmetic.
5603*----------------------------------------------------------------------------*/
5604
5605float32 floatx80_to_float32(floatx80 a, float_status *status)
5606{
5607    flag aSign;
5608    int32_t aExp;
5609    uint64_t aSig;
5610
5611    if (floatx80_invalid_encoding(a)) {
5612        float_raise(float_flag_invalid, status);
5613        return float32_default_nan(status);
5614    }
5615    aSig = extractFloatx80Frac( a );
5616    aExp = extractFloatx80Exp( a );
5617    aSign = extractFloatx80Sign( a );
5618    if ( aExp == 0x7FFF ) {
5619        if ( (uint64_t) ( aSig<<1 ) ) {
5620            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5621        }
5622        return packFloat32( aSign, 0xFF, 0 );
5623    }
5624    shift64RightJamming( aSig, 33, &aSig );
5625    if ( aExp || aSig ) aExp -= 0x3F81;
5626    return roundAndPackFloat32(aSign, aExp, aSig, status);
5627
5628}
5629
5630/*----------------------------------------------------------------------------
5631| Returns the result of converting the extended double-precision floating-
5632| point value `a' to the double-precision floating-point format.  The
5633| conversion is performed according to the IEC/IEEE Standard for Binary
5634| Floating-Point Arithmetic.
5635*----------------------------------------------------------------------------*/
5636
5637float64 floatx80_to_float64(floatx80 a, float_status *status)
5638{
5639    flag aSign;
5640    int32_t aExp;
5641    uint64_t aSig, zSig;
5642
5643    if (floatx80_invalid_encoding(a)) {
5644        float_raise(float_flag_invalid, status);
5645        return float64_default_nan(status);
5646    }
5647    aSig = extractFloatx80Frac( a );
5648    aExp = extractFloatx80Exp( a );
5649    aSign = extractFloatx80Sign( a );
5650    if ( aExp == 0x7FFF ) {
5651        if ( (uint64_t) ( aSig<<1 ) ) {
5652            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5653        }
5654        return packFloat64( aSign, 0x7FF, 0 );
5655    }
5656    shift64RightJamming( aSig, 1, &zSig );
5657    if ( aExp || aSig ) aExp -= 0x3C01;
5658    return roundAndPackFloat64(aSign, aExp, zSig, status);
5659
5660}
5661
5662/*----------------------------------------------------------------------------
5663| Returns the result of converting the extended double-precision floating-
5664| point value `a' to the quadruple-precision floating-point format.  The
5665| conversion is performed according to the IEC/IEEE Standard for Binary
5666| Floating-Point Arithmetic.
5667*----------------------------------------------------------------------------*/
5668
5669float128 floatx80_to_float128(floatx80 a, float_status *status)
5670{
5671    flag aSign;
5672    int aExp;
5673    uint64_t aSig, zSig0, zSig1;
5674
5675    if (floatx80_invalid_encoding(a)) {
5676        float_raise(float_flag_invalid, status);
5677        return float128_default_nan(status);
5678    }
5679    aSig = extractFloatx80Frac( a );
5680    aExp = extractFloatx80Exp( a );
5681    aSign = extractFloatx80Sign( a );
5682    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5683        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5684    }
5685    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5686    return packFloat128( aSign, aExp, zSig0, zSig1 );
5687
5688}
5689
5690/*----------------------------------------------------------------------------
5691| Rounds the extended double-precision floating-point value `a'
5692| to the precision provided by floatx80_rounding_precision and returns the
5693| result as an extended double-precision floating-point value.
5694| The operation is performed according to the IEC/IEEE Standard for Binary
5695| Floating-Point Arithmetic.
5696*----------------------------------------------------------------------------*/
5697
5698floatx80 floatx80_round(floatx80 a, float_status *status)
5699{
5700    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5701                                extractFloatx80Sign(a),
5702                                extractFloatx80Exp(a),
5703                                extractFloatx80Frac(a), 0, status);
5704}
5705
5706/*----------------------------------------------------------------------------
5707| Rounds the extended double-precision floating-point value `a' to an integer,
5708| and returns the result as an extended quadruple-precision floating-point
5709| value.  The operation is performed according to the IEC/IEEE Standard for
5710| Binary Floating-Point Arithmetic.
5711*----------------------------------------------------------------------------*/
5712
5713floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5714{
5715    flag aSign;
5716    int32_t aExp;
5717    uint64_t lastBitMask, roundBitsMask;
5718    floatx80 z;
5719
5720    if (floatx80_invalid_encoding(a)) {
5721        float_raise(float_flag_invalid, status);
5722        return floatx80_default_nan(status);
5723    }
5724    aExp = extractFloatx80Exp( a );
5725    if ( 0x403E <= aExp ) {
5726        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5727            return propagateFloatx80NaN(a, a, status);
5728        }
5729        return a;
5730    }
5731    if ( aExp < 0x3FFF ) {
5732        if (    ( aExp == 0 )
5733             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5734            return a;
5735        }
5736        status->float_exception_flags |= float_flag_inexact;
5737        aSign = extractFloatx80Sign( a );
5738        switch (status->float_rounding_mode) {
5739         case float_round_nearest_even:
5740            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5741               ) {
5742                return
5743                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5744            }
5745            break;
5746        case float_round_ties_away:
5747            if (aExp == 0x3FFE) {
5748                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5749            }
5750            break;
5751         case float_round_down:
5752            return
5753                  aSign ?
5754                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5755                : packFloatx80( 0, 0, 0 );
5756         case float_round_up:
5757            return
5758                  aSign ? packFloatx80( 1, 0, 0 )
5759                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5760        }
5761        return packFloatx80( aSign, 0, 0 );
5762    }
5763    lastBitMask = 1;
5764    lastBitMask <<= 0x403E - aExp;
5765    roundBitsMask = lastBitMask - 1;
5766    z = a;
5767    switch (status->float_rounding_mode) {
5768    case float_round_nearest_even:
5769        z.low += lastBitMask>>1;
5770        if ((z.low & roundBitsMask) == 0) {
5771            z.low &= ~lastBitMask;
5772        }
5773        break;
5774    case float_round_ties_away:
5775        z.low += lastBitMask >> 1;
5776        break;
5777    case float_round_to_zero:
5778        break;
5779    case float_round_up:
5780        if (!extractFloatx80Sign(z)) {
5781            z.low += roundBitsMask;
5782        }
5783        break;
5784    case float_round_down:
5785        if (extractFloatx80Sign(z)) {
5786            z.low += roundBitsMask;
5787        }
5788        break;
5789    default:
5790        abort();
5791    }
5792    z.low &= ~ roundBitsMask;
5793    if ( z.low == 0 ) {
5794        ++z.high;
5795        z.low = LIT64( 0x8000000000000000 );
5796    }
5797    if (z.low != a.low) {
5798        status->float_exception_flags |= float_flag_inexact;
5799    }
5800    return z;
5801
5802}
5803
5804/*----------------------------------------------------------------------------
5805| Returns the result of adding the absolute values of the extended double-
5806| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5807| negated before being returned.  `zSign' is ignored if the result is a NaN.
5808| The addition is performed according to the IEC/IEEE Standard for Binary
5809| Floating-Point Arithmetic.
5810*----------------------------------------------------------------------------*/
5811
5812static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5813                                float_status *status)
5814{
5815    int32_t aExp, bExp, zExp;
5816    uint64_t aSig, bSig, zSig0, zSig1;
5817    int32_t expDiff;
5818
5819    aSig = extractFloatx80Frac( a );
5820    aExp = extractFloatx80Exp( a );
5821    bSig = extractFloatx80Frac( b );
5822    bExp = extractFloatx80Exp( b );
5823    expDiff = aExp - bExp;
5824    if ( 0 < expDiff ) {
5825        if ( aExp == 0x7FFF ) {
5826            if ((uint64_t)(aSig << 1)) {
5827                return propagateFloatx80NaN(a, b, status);
5828            }
5829            return a;
5830        }
5831        if ( bExp == 0 ) --expDiff;
5832        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5833        zExp = aExp;
5834    }
5835    else if ( expDiff < 0 ) {
5836        if ( bExp == 0x7FFF ) {
5837            if ((uint64_t)(bSig << 1)) {
5838                return propagateFloatx80NaN(a, b, status);
5839            }
5840            return packFloatx80(zSign,
5841                                floatx80_infinity_high,
5842                                floatx80_infinity_low);
5843        }
5844        if ( aExp == 0 ) ++expDiff;
5845        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5846        zExp = bExp;
5847    }
5848    else {
5849        if ( aExp == 0x7FFF ) {
5850            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5851                return propagateFloatx80NaN(a, b, status);
5852            }
5853            return a;
5854        }
5855        zSig1 = 0;
5856        zSig0 = aSig + bSig;
5857        if ( aExp == 0 ) {
5858            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5859            goto roundAndPack;
5860        }
5861        zExp = aExp;
5862        goto shiftRight1;
5863    }
5864    zSig0 = aSig + bSig;
5865    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5866 shiftRight1:
5867    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5868    zSig0 |= LIT64( 0x8000000000000000 );
5869    ++zExp;
5870 roundAndPack:
5871    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5872                                zSign, zExp, zSig0, zSig1, status);
5873}
5874
5875/*----------------------------------------------------------------------------
5876| Returns the result of subtracting the absolute values of the extended
5877| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5878| difference is negated before being returned.  `zSign' is ignored if the
5879| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5880| Standard for Binary Floating-Point Arithmetic.
5881*----------------------------------------------------------------------------*/
5882
5883static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5884                                float_status *status)
5885{
5886    int32_t aExp, bExp, zExp;
5887    uint64_t aSig, bSig, zSig0, zSig1;
5888    int32_t expDiff;
5889
5890    aSig = extractFloatx80Frac( a );
5891    aExp = extractFloatx80Exp( a );
5892    bSig = extractFloatx80Frac( b );
5893    bExp = extractFloatx80Exp( b );
5894    expDiff = aExp - bExp;
5895    if ( 0 < expDiff ) goto aExpBigger;
5896    if ( expDiff < 0 ) goto bExpBigger;
5897    if ( aExp == 0x7FFF ) {
5898        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5899            return propagateFloatx80NaN(a, b, status);
5900        }
5901        float_raise(float_flag_invalid, status);
5902        return floatx80_default_nan(status);
5903    }
5904    if ( aExp == 0 ) {
5905        aExp = 1;
5906        bExp = 1;
5907    }
5908    zSig1 = 0;
5909    if ( bSig < aSig ) goto aBigger;
5910    if ( aSig < bSig ) goto bBigger;
5911    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5912 bExpBigger:
5913    if ( bExp == 0x7FFF ) {
5914        if ((uint64_t)(bSig << 1)) {
5915            return propagateFloatx80NaN(a, b, status);
5916        }
5917        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5918                            floatx80_infinity_low);
5919    }
5920    if ( aExp == 0 ) ++expDiff;
5921    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5922 bBigger:
5923    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5924    zExp = bExp;
5925    zSign ^= 1;
5926    goto normalizeRoundAndPack;
5927 aExpBigger:
5928    if ( aExp == 0x7FFF ) {
5929        if ((uint64_t)(aSig << 1)) {
5930            return propagateFloatx80NaN(a, b, status);
5931        }
5932        return a;
5933    }
5934    if ( bExp == 0 ) --expDiff;
5935    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5936 aBigger:
5937    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5938    zExp = aExp;
5939 normalizeRoundAndPack:
5940    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5941                                         zSign, zExp, zSig0, zSig1, status);
5942}
5943
5944/*----------------------------------------------------------------------------
5945| Returns the result of adding the extended double-precision floating-point
5946| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5947| Standard for Binary Floating-Point Arithmetic.
5948*----------------------------------------------------------------------------*/
5949
5950floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5951{
5952    flag aSign, bSign;
5953
5954    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5955        float_raise(float_flag_invalid, status);
5956        return floatx80_default_nan(status);
5957    }
5958    aSign = extractFloatx80Sign( a );
5959    bSign = extractFloatx80Sign( b );
5960    if ( aSign == bSign ) {
5961        return addFloatx80Sigs(a, b, aSign, status);
5962    }
5963    else {
5964        return subFloatx80Sigs(a, b, aSign, status);
5965    }
5966
5967}
5968
5969/*----------------------------------------------------------------------------
5970| Returns the result of subtracting the extended double-precision floating-
5971| point values `a' and `b'.  The operation is performed according to the
5972| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5973*----------------------------------------------------------------------------*/
5974
5975floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5976{
5977    flag aSign, bSign;
5978
5979    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5980        float_raise(float_flag_invalid, status);
5981        return floatx80_default_nan(status);
5982    }
5983    aSign = extractFloatx80Sign( a );
5984    bSign = extractFloatx80Sign( b );
5985    if ( aSign == bSign ) {
5986        return subFloatx80Sigs(a, b, aSign, status);
5987    }
5988    else {
5989        return addFloatx80Sigs(a, b, aSign, status);
5990    }
5991
5992}
5993
5994/*----------------------------------------------------------------------------
5995| Returns the result of multiplying the extended double-precision floating-
5996| point values `a' and `b'.  The operation is performed according to the
5997| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5998*----------------------------------------------------------------------------*/
5999
6000floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6001{
6002    flag aSign, bSign, zSign;
6003    int32_t aExp, bExp, zExp;
6004    uint64_t aSig, bSig, zSig0, zSig1;
6005
6006    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6007        float_raise(float_flag_invalid, status);
6008        return floatx80_default_nan(status);
6009    }
6010    aSig = extractFloatx80Frac( a );
6011    aExp = extractFloatx80Exp( a );
6012    aSign = extractFloatx80Sign( a );
6013    bSig = extractFloatx80Frac( b );
6014    bExp = extractFloatx80Exp( b );
6015    bSign = extractFloatx80Sign( b );
6016    zSign = aSign ^ bSign;
6017    if ( aExp == 0x7FFF ) {
6018        if (    (uint64_t) ( aSig<<1 )
6019             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6020            return propagateFloatx80NaN(a, b, status);
6021        }
6022        if ( ( bExp | bSig ) == 0 ) goto invalid;
6023        return packFloatx80(zSign, floatx80_infinity_high,
6024                                   floatx80_infinity_low);
6025    }
6026    if ( bExp == 0x7FFF ) {
6027        if ((uint64_t)(bSig << 1)) {
6028            return propagateFloatx80NaN(a, b, status);
6029        }
6030        if ( ( aExp | aSig ) == 0 ) {
6031 invalid:
6032            float_raise(float_flag_invalid, status);
6033            return floatx80_default_nan(status);
6034        }
6035        return packFloatx80(zSign, floatx80_infinity_high,
6036                                   floatx80_infinity_low);
6037    }
6038    if ( aExp == 0 ) {
6039        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6040        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6041    }
6042    if ( bExp == 0 ) {
6043        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6044        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6045    }
6046    zExp = aExp + bExp - 0x3FFE;
6047    mul64To128( aSig, bSig, &zSig0, &zSig1 );
6048    if ( 0 < (int64_t) zSig0 ) {
6049        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6050        --zExp;
6051    }
6052    return roundAndPackFloatx80(status->floatx80_rounding_precision,
6053                                zSign, zExp, zSig0, zSig1, status);
6054}
6055
6056/*----------------------------------------------------------------------------
6057| Returns the result of dividing the extended double-precision floating-point
6058| value `a' by the corresponding value `b'.  The operation is performed
6059| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6060*----------------------------------------------------------------------------*/
6061
6062floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6063{
6064    flag aSign, bSign, zSign;
6065    int32_t aExp, bExp, zExp;
6066    uint64_t aSig, bSig, zSig0, zSig1;
6067    uint64_t rem0, rem1, rem2, term0, term1, term2;
6068
6069    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6070        float_raise(float_flag_invalid, status);
6071        return floatx80_default_nan(status);
6072    }
6073    aSig = extractFloatx80Frac( a );
6074    aExp = extractFloatx80Exp( a );
6075    aSign = extractFloatx80Sign( a );
6076    bSig = extractFloatx80Frac( b );
6077    bExp = extractFloatx80Exp( b );
6078    bSign = extractFloatx80Sign( b );
6079    zSign = aSign ^ bSign;
6080    if ( aExp == 0x7FFF ) {
6081        if ((uint64_t)(aSig << 1)) {
6082            return propagateFloatx80NaN(a, b, status);
6083        }
6084        if ( bExp == 0x7FFF ) {
6085            if ((uint64_t)(bSig << 1)) {
6086                return propagateFloatx80NaN(a, b, status);
6087            }
6088            goto invalid;
6089        }
6090        return packFloatx80(zSign, floatx80_infinity_high,
6091                                   floatx80_infinity_low);
6092    }
6093    if ( bExp == 0x7FFF ) {
6094        if ((uint64_t)(bSig << 1)) {
6095            return propagateFloatx80NaN(a, b, status);
6096        }
6097        return packFloatx80( zSign, 0, 0 );
6098    }
6099    if ( bExp == 0 ) {
6100        if ( bSig == 0 ) {
6101            if ( ( aExp | aSig ) == 0 ) {
6102 invalid:
6103                float_raise(float_flag_invalid, status);
6104                return floatx80_default_nan(status);
6105            }
6106            float_raise(float_flag_divbyzero, status);
6107            return packFloatx80(zSign, floatx80_infinity_high,
6108                                       floatx80_infinity_low);
6109        }
6110        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6111    }
6112    if ( aExp == 0 ) {
6113        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6114        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6115    }
6116    zExp = aExp - bExp + 0x3FFE;
6117    rem1 = 0;
6118    if ( bSig <= aSig ) {
6119        shift128Right( aSig, 0, 1, &aSig, &rem1 );
6120        ++zExp;
6121    }
6122    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6123    mul64To128( bSig, zSig0, &term0, &term1 );
6124    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6125    while ( (int64_t) rem0 < 0 ) {
6126        --zSig0;
6127        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6128    }
6129    zSig1 = estimateDiv128To64( rem1, 0, bSig );
6130    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6131        mul64To128( bSig, zSig1, &term1, &term2 );
6132        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6133        while ( (int64_t) rem1 < 0 ) {
6134            --zSig1;
6135            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6136        }
6137        zSig1 |= ( ( rem1 | rem2 ) != 0 );
6138    }
6139    return roundAndPackFloatx80(status->floatx80_rounding_precision,
6140                                zSign, zExp, zSig0, zSig1, status);
6141}
6142
6143/*----------------------------------------------------------------------------
6144| Returns the remainder of the extended double-precision floating-point value
6145| `a' with respect to the corresponding value `b'.  The operation is performed
6146| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6147*----------------------------------------------------------------------------*/
6148
6149floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6150{
6151    flag aSign, zSign;
6152    int32_t aExp, bExp, expDiff;
6153    uint64_t aSig0, aSig1, bSig;
6154    uint64_t q, term0, term1, alternateASig0, alternateASig1;
6155
6156    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6157        float_raise(float_flag_invalid, status);
6158        return floatx80_default_nan(status);
6159    }
6160    aSig0 = extractFloatx80Frac( a );
6161    aExp = extractFloatx80Exp( a );
6162    aSign = extractFloatx80Sign( a );
6163    bSig = extractFloatx80Frac( b );
6164    bExp = extractFloatx80Exp( b );
6165    if ( aExp == 0x7FFF ) {
6166        if (    (uint64_t) ( aSig0<<1 )
6167             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6168            return propagateFloatx80NaN(a, b, status);
6169        }
6170        goto invalid;
6171    }
6172    if ( bExp == 0x7FFF ) {
6173        if ((uint64_t)(bSig << 1)) {
6174            return propagateFloatx80NaN(a, b, status);
6175        }
6176        return a;
6177    }
6178    if ( bExp == 0 ) {
6179        if ( bSig == 0 ) {
6180 invalid:
6181            float_raise(float_flag_invalid, status);
6182            return floatx80_default_nan(status);
6183        }
6184        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6185    }
6186    if ( aExp == 0 ) {
6187        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6188        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6189    }
6190    bSig |= LIT64( 0x8000000000000000 );
6191    zSign = aSign;
6192    expDiff = aExp - bExp;
6193    aSig1 = 0;
6194    if ( expDiff < 0 ) {
6195        if ( expDiff < -1 ) return a;
6196        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6197        expDiff = 0;
6198    }
6199    q = ( bSig <= aSig0 );
6200    if ( q ) aSig0 -= bSig;
6201    expDiff -= 64;
6202    while ( 0 < expDiff ) {
6203        q = estimateDiv128To64( aSig0, aSig1, bSig );
6204        q = ( 2 < q ) ? q - 2 : 0;
6205        mul64To128( bSig, q, &term0, &term1 );
6206        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6207        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6208        expDiff -= 62;
6209    }
6210    expDiff += 64;
6211    if ( 0 < expDiff ) {
6212        q = estimateDiv128To64( aSig0, aSig1, bSig );
6213        q = ( 2 < q ) ? q - 2 : 0;
6214        q >>= 64 - expDiff;
6215        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6216        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6217        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6218        while ( le128( term0, term1, aSig0, aSig1 ) ) {
6219            ++q;
6220            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6221        }
6222    }
6223    else {
6224        term1 = 0;
6225        term0 = bSig;
6226    }
6227    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6228    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6229         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6230              && ( q & 1 ) )
6231       ) {
6232        aSig0 = alternateASig0;
6233        aSig1 = alternateASig1;
6234        zSign = ! zSign;
6235    }
6236    return
6237        normalizeRoundAndPackFloatx80(
6238            80, zSign, bExp + expDiff, aSig0, aSig1, status);
6239
6240}
6241
6242/*----------------------------------------------------------------------------
6243| Returns the square root of the extended double-precision floating-point
6244| value `a'.  The operation is performed according to the IEC/IEEE Standard
6245| for Binary Floating-Point Arithmetic.
6246*----------------------------------------------------------------------------*/
6247
6248floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6249{
6250    flag aSign;
6251    int32_t aExp, zExp;
6252    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6253    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6254
6255    if (floatx80_invalid_encoding(a)) {
6256        float_raise(float_flag_invalid, status);
6257        return floatx80_default_nan(status);
6258    }
6259    aSig0 = extractFloatx80Frac( a );
6260    aExp = extractFloatx80Exp( a );
6261    aSign = extractFloatx80Sign( a );
6262    if ( aExp == 0x7FFF ) {
6263        if ((uint64_t)(aSig0 << 1)) {
6264            return propagateFloatx80NaN(a, a, status);
6265        }
6266        if ( ! aSign ) return a;
6267        goto invalid;
6268    }
6269    if ( aSign ) {
6270        if ( ( aExp | aSig0 ) == 0 ) return a;
6271 invalid:
6272        float_raise(float_flag_invalid, status);
6273        return floatx80_default_nan(status);
6274    }
6275    if ( aExp == 0 ) {
6276        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6277        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6278    }
6279    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6280    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6281    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6282    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6283    doubleZSig0 = zSig0<<1;
6284    mul64To128( zSig0, zSig0, &term0, &term1 );
6285    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6286    while ( (int64_t) rem0 < 0 ) {
6287        --zSig0;
6288        doubleZSig0 -= 2;
6289        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6290    }
6291    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6292    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6293        if ( zSig1 == 0 ) zSig1 = 1;
6294        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6295        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6296        mul64To128( zSig1, zSig1, &term2, &term3 );
6297        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6298        while ( (int64_t) rem1 < 0 ) {
6299            --zSig1;
6300            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6301            term3 |= 1;
6302            term2 |= doubleZSig0;
6303            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6304        }
6305        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6306    }
6307    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6308    zSig0 |= doubleZSig0;
6309    return roundAndPackFloatx80(status->floatx80_rounding_precision,
6310                                0, zExp, zSig0, zSig1, status);
6311}
6312
6313/*----------------------------------------------------------------------------
6314| Returns 1 if the extended double-precision floating-point value `a' is equal
6315| to the corresponding value `b', and 0 otherwise.  The invalid exception is
6316| raised if either operand is a NaN.  Otherwise, the comparison is performed
6317| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6318*----------------------------------------------------------------------------*/
6319
6320int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6321{
6322
6323    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6324        || (extractFloatx80Exp(a) == 0x7FFF
6325            && (uint64_t) (extractFloatx80Frac(a) << 1))
6326        || (extractFloatx80Exp(b) == 0x7FFF
6327            && (uint64_t) (extractFloatx80Frac(b) << 1))
6328       ) {
6329        float_raise(float_flag_invalid, status);
6330        return 0;
6331    }
6332    return
6333           ( a.low == b.low )
6334        && (    ( a.high == b.high )
6335             || (    ( a.low == 0 )
6336                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6337           );
6338
6339}
6340
6341/*----------------------------------------------------------------------------
6342| Returns 1 if the extended double-precision floating-point value `a' is
6343| less than or equal to the corresponding value `b', and 0 otherwise.  The
6344| invalid exception is raised if either operand is a NaN.  The comparison is
6345| performed according to the IEC/IEEE Standard for Binary Floating-Point
6346| Arithmetic.
6347*----------------------------------------------------------------------------*/
6348
6349int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6350{
6351    flag aSign, bSign;
6352
6353    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6354        || (extractFloatx80Exp(a) == 0x7FFF
6355            && (uint64_t) (extractFloatx80Frac(a) << 1))
6356        || (extractFloatx80Exp(b) == 0x7FFF
6357            && (uint64_t) (extractFloatx80Frac(b) << 1))
6358       ) {
6359        float_raise(float_flag_invalid, status);
6360        return 0;
6361    }
6362    aSign = extractFloatx80Sign( a );
6363    bSign = extractFloatx80Sign( b );
6364    if ( aSign != bSign ) {
6365        return
6366               aSign
6367            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6368                 == 0 );
6369    }
6370    return
6371          aSign ? le128( b.high, b.low, a.high, a.low )
6372        : le128( a.high, a.low, b.high, b.low );
6373
6374}
6375
6376/*----------------------------------------------------------------------------
6377| Returns 1 if the extended double-precision floating-point value `a' is
6378| less than the corresponding value `b', and 0 otherwise.  The invalid
6379| exception is raised if either operand is a NaN.  The comparison is performed
6380| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6381*----------------------------------------------------------------------------*/
6382
6383int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6384{
6385    flag aSign, bSign;
6386
6387    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6388        || (extractFloatx80Exp(a) == 0x7FFF
6389            && (uint64_t) (extractFloatx80Frac(a) << 1))
6390        || (extractFloatx80Exp(b) == 0x7FFF
6391            && (uint64_t) (extractFloatx80Frac(b) << 1))
6392       ) {
6393        float_raise(float_flag_invalid, status);
6394        return 0;
6395    }
6396    aSign = extractFloatx80Sign( a );
6397    bSign = extractFloatx80Sign( b );
6398    if ( aSign != bSign ) {
6399        return
6400               aSign
6401            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6402                 != 0 );
6403    }
6404    return
6405          aSign ? lt128( b.high, b.low, a.high, a.low )
6406        : lt128( a.high, a.low, b.high, b.low );
6407
6408}
6409
6410/*----------------------------------------------------------------------------
6411| Returns 1 if the extended double-precision floating-point values `a' and `b'
6412| cannot be compared, and 0 otherwise.  The invalid exception is raised if
6413| either operand is a NaN.   The comparison is performed according to the
6414| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415*----------------------------------------------------------------------------*/
6416int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6417{
6418    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6419        || (extractFloatx80Exp(a) == 0x7FFF
6420            && (uint64_t) (extractFloatx80Frac(a) << 1))
6421        || (extractFloatx80Exp(b) == 0x7FFF
6422            && (uint64_t) (extractFloatx80Frac(b) << 1))
6423       ) {
6424        float_raise(float_flag_invalid, status);
6425        return 1;
6426    }
6427    return 0;
6428}
6429
6430/*----------------------------------------------------------------------------
6431| Returns 1 if the extended double-precision floating-point value `a' is
6432| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6433| cause an exception.  The comparison is performed according to the IEC/IEEE
6434| Standard for Binary Floating-Point Arithmetic.
6435*----------------------------------------------------------------------------*/
6436
6437int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6438{
6439
6440    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6441        float_raise(float_flag_invalid, status);
6442        return 0;
6443    }
6444    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6445              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6446         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6447              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6448       ) {
6449        if (floatx80_is_signaling_nan(a, status)
6450         || floatx80_is_signaling_nan(b, status)) {
6451            float_raise(float_flag_invalid, status);
6452        }
6453        return 0;
6454    }
6455    return
6456           ( a.low == b.low )
6457        && (    ( a.high == b.high )
6458             || (    ( a.low == 0 )
6459                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6460           );
6461
6462}
6463
6464/*----------------------------------------------------------------------------
6465| Returns 1 if the extended double-precision floating-point value `a' is less
6466| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6467| do not cause an exception.  Otherwise, the comparison is performed according
6468| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6469*----------------------------------------------------------------------------*/
6470
6471int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6472{
6473    flag aSign, bSign;
6474
6475    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6476        float_raise(float_flag_invalid, status);
6477        return 0;
6478    }
6479    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6480              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6481         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6482              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6483       ) {
6484        if (floatx80_is_signaling_nan(a, status)
6485         || floatx80_is_signaling_nan(b, status)) {
6486            float_raise(float_flag_invalid, status);
6487        }
6488        return 0;
6489    }
6490    aSign = extractFloatx80Sign( a );
6491    bSign = extractFloatx80Sign( b );
6492    if ( aSign != bSign ) {
6493        return
6494               aSign
6495            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6496                 == 0 );
6497    }
6498    return
6499          aSign ? le128( b.high, b.low, a.high, a.low )
6500        : le128( a.high, a.low, b.high, b.low );
6501
6502}
6503
6504/*----------------------------------------------------------------------------
6505| Returns 1 if the extended double-precision floating-point value `a' is less
6506| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6507| an exception.  Otherwise, the comparison is performed according to the
6508| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6509*----------------------------------------------------------------------------*/
6510
6511int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6512{
6513    flag aSign, bSign;
6514
6515    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6516        float_raise(float_flag_invalid, status);
6517        return 0;
6518    }
6519    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6520              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6521         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6522              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6523       ) {
6524        if (floatx80_is_signaling_nan(a, status)
6525         || floatx80_is_signaling_nan(b, status)) {
6526            float_raise(float_flag_invalid, status);
6527        }
6528        return 0;
6529    }
6530    aSign = extractFloatx80Sign( a );
6531    bSign = extractFloatx80Sign( b );
6532    if ( aSign != bSign ) {
6533        return
6534               aSign
6535            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6536                 != 0 );
6537    }
6538    return
6539          aSign ? lt128( b.high, b.low, a.high, a.low )
6540        : lt128( a.high, a.low, b.high, b.low );
6541
6542}
6543
6544/*----------------------------------------------------------------------------
6545| Returns 1 if the extended double-precision floating-point values `a' and `b'
6546| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6547| The comparison is performed according to the IEC/IEEE Standard for Binary
6548| Floating-Point Arithmetic.
6549*----------------------------------------------------------------------------*/
6550int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6551{
6552    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6553        float_raise(float_flag_invalid, status);
6554        return 1;
6555    }
6556    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6557              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6558         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6559              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6560       ) {
6561        if (floatx80_is_signaling_nan(a, status)
6562         || floatx80_is_signaling_nan(b, status)) {
6563            float_raise(float_flag_invalid, status);
6564        }
6565        return 1;
6566    }
6567    return 0;
6568}
6569
6570/*----------------------------------------------------------------------------
6571| Returns the result of converting the quadruple-precision floating-point
6572| value `a' to the 32-bit two's complement integer format.  The conversion
6573| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6574| Arithmetic---which means in particular that the conversion is rounded
6575| according to the current rounding mode.  If `a' is a NaN, the largest
6576| positive integer is returned.  Otherwise, if the conversion overflows, the
6577| largest integer with the same sign as `a' is returned.
6578*----------------------------------------------------------------------------*/
6579
6580int32_t float128_to_int32(float128 a, float_status *status)
6581{
6582    flag aSign;
6583    int32_t aExp, shiftCount;
6584    uint64_t aSig0, aSig1;
6585
6586    aSig1 = extractFloat128Frac1( a );
6587    aSig0 = extractFloat128Frac0( a );
6588    aExp = extractFloat128Exp( a );
6589    aSign = extractFloat128Sign( a );
6590    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6591    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6592    aSig0 |= ( aSig1 != 0 );
6593    shiftCount = 0x4028 - aExp;
6594    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6595    return roundAndPackInt32(aSign, aSig0, status);
6596
6597}
6598
6599/*----------------------------------------------------------------------------
6600| Returns the result of converting the quadruple-precision floating-point
6601| value `a' to the 32-bit two's complement integer format.  The conversion
6602| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6603| Arithmetic, except that the conversion is always rounded toward zero.  If
6604| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6605| conversion overflows, the largest integer with the same sign as `a' is
6606| returned.
6607*----------------------------------------------------------------------------*/
6608
6609int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6610{
6611    flag aSign;
6612    int32_t aExp, shiftCount;
6613    uint64_t aSig0, aSig1, savedASig;
6614    int32_t z;
6615
6616    aSig1 = extractFloat128Frac1( a );
6617    aSig0 = extractFloat128Frac0( a );
6618    aExp = extractFloat128Exp( a );
6619    aSign = extractFloat128Sign( a );
6620    aSig0 |= ( aSig1 != 0 );
6621    if ( 0x401E < aExp ) {
6622        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6623        goto invalid;
6624    }
6625    else if ( aExp < 0x3FFF ) {
6626        if (aExp || aSig0) {
6627            status->float_exception_flags |= float_flag_inexact;
6628        }
6629        return 0;
6630    }
6631    aSig0 |= LIT64( 0x0001000000000000 );
6632    shiftCount = 0x402F - aExp;
6633    savedASig = aSig0;
6634    aSig0 >>= shiftCount;
6635    z = aSig0;
6636    if ( aSign ) z = - z;
6637    if ( ( z < 0 ) ^ aSign ) {
6638 invalid:
6639        float_raise(float_flag_invalid, status);
6640        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6641    }
6642    if ( ( aSig0<<shiftCount ) != savedASig ) {
6643        status->float_exception_flags |= float_flag_inexact;
6644    }
6645    return z;
6646
6647}
6648
6649/*----------------------------------------------------------------------------
6650| Returns the result of converting the quadruple-precision floating-point
6651| value `a' to the 64-bit two's complement integer format.  The conversion
6652| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6653| Arithmetic---which means in particular that the conversion is rounded
6654| according to the current rounding mode.  If `a' is a NaN, the largest
6655| positive integer is returned.  Otherwise, if the conversion overflows, the
6656| largest integer with the same sign as `a' is returned.
6657*----------------------------------------------------------------------------*/
6658
6659int64_t float128_to_int64(float128 a, float_status *status)
6660{
6661    flag aSign;
6662    int32_t aExp, shiftCount;
6663    uint64_t aSig0, aSig1;
6664
6665    aSig1 = extractFloat128Frac1( a );
6666    aSig0 = extractFloat128Frac0( a );
6667    aExp = extractFloat128Exp( a );
6668    aSign = extractFloat128Sign( a );
6669    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6670    shiftCount = 0x402F - aExp;
6671    if ( shiftCount <= 0 ) {
6672        if ( 0x403E < aExp ) {
6673            float_raise(float_flag_invalid, status);
6674            if (    ! aSign
6675                 || (    ( aExp == 0x7FFF )
6676                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6677                    )
6678               ) {
6679                return LIT64( 0x7FFFFFFFFFFFFFFF );
6680            }
6681            return (int64_t) LIT64( 0x8000000000000000 );
6682        }
6683        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6684    }
6685    else {
6686        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6687    }
6688    return roundAndPackInt64(aSign, aSig0, aSig1, status);
6689
6690}
6691
6692/*----------------------------------------------------------------------------
6693| Returns the result of converting the quadruple-precision floating-point
6694| value `a' to the 64-bit two's complement integer format.  The conversion
6695| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6696| Arithmetic, except that the conversion is always rounded toward zero.
6697| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6698| the conversion overflows, the largest integer with the same sign as `a' is
6699| returned.
6700*----------------------------------------------------------------------------*/
6701
6702int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6703{
6704    flag aSign;
6705    int32_t aExp, shiftCount;
6706    uint64_t aSig0, aSig1;
6707    int64_t z;
6708
6709    aSig1 = extractFloat128Frac1( a );
6710    aSig0 = extractFloat128Frac0( a );
6711    aExp = extractFloat128Exp( a );
6712    aSign = extractFloat128Sign( a );
6713    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6714    shiftCount = aExp - 0x402F;
6715    if ( 0 < shiftCount ) {
6716        if ( 0x403E <= aExp ) {
6717            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6718            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6719                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6720                if (aSig1) {
6721                    status->float_exception_flags |= float_flag_inexact;
6722                }
6723            }
6724            else {
6725                float_raise(float_flag_invalid, status);
6726                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6727                    return LIT64( 0x7FFFFFFFFFFFFFFF );
6728                }
6729            }
6730            return (int64_t) LIT64( 0x8000000000000000 );
6731        }
6732        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6733        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6734            status->float_exception_flags |= float_flag_inexact;
6735        }
6736    }
6737    else {
6738        if ( aExp < 0x3FFF ) {
6739            if ( aExp | aSig0 | aSig1 ) {
6740                status->float_exception_flags |= float_flag_inexact;
6741            }
6742            return 0;
6743        }
6744        z = aSig0>>( - shiftCount );
6745        if (    aSig1
6746             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6747            status->float_exception_flags |= float_flag_inexact;
6748        }
6749    }
6750    if ( aSign ) z = - z;
6751    return z;
6752
6753}
6754
6755/*----------------------------------------------------------------------------
6756| Returns the result of converting the quadruple-precision floating-point value
6757| `a' to the 64-bit unsigned integer format.  The conversion is
6758| performed according to the IEC/IEEE Standard for Binary Floating-Point
6759| Arithmetic---which means in particular that the conversion is rounded
6760| according to the current rounding mode.  If `a' is a NaN, the largest
6761| positive integer is returned.  If the conversion overflows, the
6762| largest unsigned integer is returned.  If 'a' is negative, the value is
6763| rounded and zero is returned; negative values that do not round to zero
6764| will raise the inexact exception.
6765*----------------------------------------------------------------------------*/
6766
6767uint64_t float128_to_uint64(float128 a, float_status *status)
6768{
6769    flag aSign;
6770    int aExp;
6771    int shiftCount;
6772    uint64_t aSig0, aSig1;
6773
6774    aSig0 = extractFloat128Frac0(a);
6775    aSig1 = extractFloat128Frac1(a);
6776    aExp = extractFloat128Exp(a);
6777    aSign = extractFloat128Sign(a);
6778    if (aSign && (aExp > 0x3FFE)) {
6779        float_raise(float_flag_invalid, status);
6780        if (float128_is_any_nan(a)) {
6781            return LIT64(0xFFFFFFFFFFFFFFFF);
6782        } else {
6783            return 0;
6784        }
6785    }
6786    if (aExp) {
6787        aSig0 |= LIT64(0x0001000000000000);
6788    }
6789    shiftCount = 0x402F - aExp;
6790    if (shiftCount <= 0) {
6791        if (0x403E < aExp) {
6792            float_raise(float_flag_invalid, status);
6793            return LIT64(0xFFFFFFFFFFFFFFFF);
6794        }
6795        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6796    } else {
6797        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6798    }
6799    return roundAndPackUint64(aSign, aSig0, aSig1, status);
6800}
6801
6802uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6803{
6804    uint64_t v;
6805    signed char current_rounding_mode = status->float_rounding_mode;
6806
6807    set_float_rounding_mode(float_round_to_zero, status);
6808    v = float128_to_uint64(a, status);
6809    set_float_rounding_mode(current_rounding_mode, status);
6810
6811    return v;
6812}
6813
6814/*----------------------------------------------------------------------------
6815| Returns the result of converting the quadruple-precision floating-point
6816| value `a' to the 32-bit unsigned integer format.  The conversion
6817| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6818| Arithmetic except that the conversion is always rounded toward zero.
6819| If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6820| if the conversion overflows, the largest unsigned integer is returned.
6821| If 'a' is negative, the value is rounded and zero is returned; negative
6822| values that do not round to zero will raise the inexact exception.
6823*----------------------------------------------------------------------------*/
6824
6825uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6826{
6827    uint64_t v;
6828    uint32_t res;
6829    int old_exc_flags = get_float_exception_flags(status);
6830
6831    v = float128_to_uint64_round_to_zero(a, status);
6832    if (v > 0xffffffff) {
6833        res = 0xffffffff;
6834    } else {
6835        return v;
6836    }
6837    set_float_exception_flags(old_exc_flags, status);
6838    float_raise(float_flag_invalid, status);
6839    return res;
6840}
6841
6842/*----------------------------------------------------------------------------
6843| Returns the result of converting the quadruple-precision floating-point value
6844| `a' to the 32-bit unsigned integer format.  The conversion is
6845| performed according to the IEC/IEEE Standard for Binary Floating-Point
6846| Arithmetic---which means in particular that the conversion is rounded
6847| according to the current rounding mode.  If `a' is a NaN, the largest
6848| positive integer is returned.  If the conversion overflows, the
6849| largest unsigned integer is returned.  If 'a' is negative, the value is
6850| rounded and zero is returned; negative values that do not round to zero
6851| will raise the inexact exception.
6852*----------------------------------------------------------------------------*/
6853
6854uint32_t float128_to_uint32(float128 a, float_status *status)
6855{
6856    uint64_t v;
6857    uint32_t res;
6858    int old_exc_flags = get_float_exception_flags(status);
6859
6860    v = float128_to_uint64(a, status);
6861    if (v > 0xffffffff) {
6862        res = 0xffffffff;
6863    } else {
6864        return v;
6865    }
6866    set_float_exception_flags(old_exc_flags, status);
6867    float_raise(float_flag_invalid, status);
6868    return res;
6869}
6870
6871/*----------------------------------------------------------------------------
6872| Returns the result of converting the quadruple-precision floating-point
6873| value `a' to the single-precision floating-point format.  The conversion
6874| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6875| Arithmetic.
6876*----------------------------------------------------------------------------*/
6877
6878float32 float128_to_float32(float128 a, float_status *status)
6879{
6880    flag aSign;
6881    int32_t aExp;
6882    uint64_t aSig0, aSig1;
6883    uint32_t zSig;
6884
6885    aSig1 = extractFloat128Frac1( a );
6886    aSig0 = extractFloat128Frac0( a );
6887    aExp = extractFloat128Exp( a );
6888    aSign = extractFloat128Sign( a );
6889    if ( aExp == 0x7FFF ) {
6890        if ( aSig0 | aSig1 ) {
6891            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6892        }
6893        return packFloat32( aSign, 0xFF, 0 );
6894    }
6895    aSig0 |= ( aSig1 != 0 );
6896    shift64RightJamming( aSig0, 18, &aSig0 );
6897    zSig = aSig0;
6898    if ( aExp || zSig ) {
6899        zSig |= 0x40000000;
6900        aExp -= 0x3F81;
6901    }
6902    return roundAndPackFloat32(aSign, aExp, zSig, status);
6903
6904}
6905
6906/*----------------------------------------------------------------------------
6907| Returns the result of converting the quadruple-precision floating-point
6908| value `a' to the double-precision floating-point format.  The conversion
6909| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6910| Arithmetic.
6911*----------------------------------------------------------------------------*/
6912
6913float64 float128_to_float64(float128 a, float_status *status)
6914{
6915    flag aSign;
6916    int32_t aExp;
6917    uint64_t aSig0, aSig1;
6918
6919    aSig1 = extractFloat128Frac1( a );
6920    aSig0 = extractFloat128Frac0( a );
6921    aExp = extractFloat128Exp( a );
6922    aSign = extractFloat128Sign( a );
6923    if ( aExp == 0x7FFF ) {
6924        if ( aSig0 | aSig1 ) {
6925            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6926        }
6927        return packFloat64( aSign, 0x7FF, 0 );
6928    }
6929    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6930    aSig0 |= ( aSig1 != 0 );
6931    if ( aExp || aSig0 ) {
6932        aSig0 |= LIT64( 0x4000000000000000 );
6933        aExp -= 0x3C01;
6934    }
6935    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6936
6937}
6938
6939/*----------------------------------------------------------------------------
6940| Returns the result of converting the quadruple-precision floating-point
6941| value `a' to the extended double-precision floating-point format.  The
6942| conversion is performed according to the IEC/IEEE Standard for Binary
6943| Floating-Point Arithmetic.
6944*----------------------------------------------------------------------------*/
6945
6946floatx80 float128_to_floatx80(float128 a, float_status *status)
6947{
6948    flag aSign;
6949    int32_t aExp;
6950    uint64_t aSig0, aSig1;
6951
6952    aSig1 = extractFloat128Frac1( a );
6953    aSig0 = extractFloat128Frac0( a );
6954    aExp = extractFloat128Exp( a );
6955    aSign = extractFloat128Sign( a );
6956    if ( aExp == 0x7FFF ) {
6957        if ( aSig0 | aSig1 ) {
6958            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6959        }
6960        return packFloatx80(aSign, floatx80_infinity_high,
6961                                   floatx80_infinity_low);
6962    }
6963    if ( aExp == 0 ) {
6964        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6965        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6966    }
6967    else {
6968        aSig0 |= LIT64( 0x0001000000000000 );
6969    }
6970    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6971    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6972
6973}
6974
6975/*----------------------------------------------------------------------------
6976| Rounds the quadruple-precision floating-point value `a' to an integer, and
6977| returns the result as a quadruple-precision floating-point value.  The
6978| operation is performed according to the IEC/IEEE Standard for Binary
6979| Floating-Point Arithmetic.
6980*----------------------------------------------------------------------------*/
6981
6982float128 float128_round_to_int(float128 a, float_status *status)
6983{
6984    flag aSign;
6985    int32_t aExp;
6986    uint64_t lastBitMask, roundBitsMask;
6987    float128 z;
6988
6989    aExp = extractFloat128Exp( a );
6990    if ( 0x402F <= aExp ) {
6991        if ( 0x406F <= aExp ) {
6992            if (    ( aExp == 0x7FFF )
6993                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6994               ) {
6995                return propagateFloat128NaN(a, a, status);
6996            }
6997            return a;
6998        }
6999        lastBitMask = 1;
7000        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7001        roundBitsMask = lastBitMask - 1;
7002        z = a;
7003        switch (status->float_rounding_mode) {
7004        case float_round_nearest_even:
7005            if ( lastBitMask ) {
7006                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7007                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7008            }
7009            else {
7010                if ( (int64_t) z.low < 0 ) {
7011                    ++z.high;
7012                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7013                }
7014            }
7015            break;
7016        case float_round_ties_away:
7017            if (lastBitMask) {
7018                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7019            } else {
7020                if ((int64_t) z.low < 0) {
7021                    ++z.high;
7022                }
7023            }
7024            break;
7025        case float_round_to_zero:
7026            break;
7027        case float_round_up:
7028            if (!extractFloat128Sign(z)) {
7029                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7030            }
7031            break;
7032        case float_round_down:
7033            if (extractFloat128Sign(z)) {
7034                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7035            }
7036            break;
7037        case float_round_to_odd:
7038            /*
7039             * Note that if lastBitMask == 0, the last bit is the lsb
7040             * of high, and roundBitsMask == -1.
7041             */
7042            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7043                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7044            }
7045            break;
7046        default:
7047            abort();
7048        }
7049        z.low &= ~ roundBitsMask;
7050    }
7051    else {
7052        if ( aExp < 0x3FFF ) {
7053            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7054            status->float_exception_flags |= float_flag_inexact;
7055            aSign = extractFloat128Sign( a );
7056            switch (status->float_rounding_mode) {
7057            case float_round_nearest_even:
7058                if (    ( aExp == 0x3FFE )
7059                     && (   extractFloat128Frac0( a )
7060                          | extractFloat128Frac1( a ) )
7061                   ) {
7062                    return packFloat128( aSign, 0x3FFF, 0, 0 );
7063                }
7064                break;
7065            case float_round_ties_away:
7066                if (aExp == 0x3FFE) {
7067                    return packFloat128(aSign, 0x3FFF, 0, 0);
7068                }
7069                break;
7070            case float_round_down:
7071                return
7072                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7073                    : packFloat128( 0, 0, 0, 0 );
7074            case float_round_up:
7075                return
7076                      aSign ? packFloat128( 1, 0, 0, 0 )
7077                    : packFloat128( 0, 0x3FFF, 0, 0 );
7078
7079            case float_round_to_odd:
7080                return packFloat128(aSign, 0x3FFF, 0, 0);
7081            }
7082            return packFloat128( aSign, 0, 0, 0 );
7083        }
7084        lastBitMask = 1;
7085        lastBitMask <<= 0x402F - aExp;
7086        roundBitsMask = lastBitMask - 1;
7087        z.low = 0;
7088        z.high = a.high;
7089        switch (status->float_rounding_mode) {
7090        case float_round_nearest_even:
7091            z.high += lastBitMask>>1;
7092            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7093                z.high &= ~ lastBitMask;
7094            }
7095            break;
7096        case float_round_ties_away:
7097            z.high += lastBitMask>>1;
7098            break;
7099        case float_round_to_zero:
7100            break;
7101        case float_round_up:
7102            if (!extractFloat128Sign(z)) {
7103                z.high |= ( a.low != 0 );
7104                z.high += roundBitsMask;
7105            }
7106            break;
7107        case float_round_down:
7108            if (extractFloat128Sign(z)) {
7109                z.high |= (a.low != 0);
7110                z.high += roundBitsMask;
7111            }
7112            break;
7113        case float_round_to_odd:
7114            if ((z.high & lastBitMask) == 0) {
7115                z.high |= (a.low != 0);
7116                z.high += roundBitsMask;
7117            }
7118            break;
7119        default:
7120            abort();
7121        }
7122        z.high &= ~ roundBitsMask;
7123    }
7124    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7125        status->float_exception_flags |= float_flag_inexact;
7126    }
7127    return z;
7128
7129}
7130
7131/*----------------------------------------------------------------------------
7132| Returns the result of adding the absolute values of the quadruple-precision
7133| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7134| before being returned.  `zSign' is ignored if the result is a NaN.
7135| The addition is performed according to the IEC/IEEE Standard for Binary
7136| Floating-Point Arithmetic.
7137*----------------------------------------------------------------------------*/
7138
7139static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7140                                float_status *status)
7141{
7142    int32_t aExp, bExp, zExp;
7143    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7144    int32_t expDiff;
7145
7146    aSig1 = extractFloat128Frac1( a );
7147    aSig0 = extractFloat128Frac0( a );
7148    aExp = extractFloat128Exp( a );
7149    bSig1 = extractFloat128Frac1( b );
7150    bSig0 = extractFloat128Frac0( b );
7151    bExp = extractFloat128Exp( b );
7152    expDiff = aExp - bExp;
7153    if ( 0 < expDiff ) {
7154        if ( aExp == 0x7FFF ) {
7155            if (aSig0 | aSig1) {
7156                return propagateFloat128NaN(a, b, status);
7157            }
7158            return a;
7159        }
7160        if ( bExp == 0 ) {
7161            --expDiff;
7162        }
7163        else {
7164            bSig0 |= LIT64( 0x0001000000000000 );
7165        }
7166        shift128ExtraRightJamming(
7167            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7168        zExp = aExp;
7169    }
7170    else if ( expDiff < 0 ) {
7171        if ( bExp == 0x7FFF ) {
7172            if (bSig0 | bSig1) {
7173                return propagateFloat128NaN(a, b, status);
7174            }
7175            return packFloat128( zSign, 0x7FFF, 0, 0 );
7176        }
7177        if ( aExp == 0 ) {
7178            ++expDiff;
7179        }
7180        else {
7181            aSig0 |= LIT64( 0x0001000000000000 );
7182        }
7183        shift128ExtraRightJamming(
7184            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7185        zExp = bExp;
7186    }
7187    else {
7188        if ( aExp == 0x7FFF ) {
7189            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7190                return propagateFloat128NaN(a, b, status);
7191            }
7192            return a;
7193        }
7194        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7195        if ( aExp == 0 ) {
7196            if (status->flush_to_zero) {
7197                if (zSig0 | zSig1) {
7198                    float_raise(float_flag_output_denormal, status);
7199                }
7200                return packFloat128(zSign, 0, 0, 0);
7201            }
7202            return packFloat128( zSign, 0, zSig0, zSig1 );
7203        }
7204        zSig2 = 0;
7205        zSig0 |= LIT64( 0x0002000000000000 );
7206        zExp = aExp;
7207        goto shiftRight1;
7208    }
7209    aSig0 |= LIT64( 0x0001000000000000 );
7210    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7211    --zExp;
7212    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7213    ++zExp;
7214 shiftRight1:
7215    shift128ExtraRightJamming(
7216        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7217 roundAndPack:
7218    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7219
7220}
7221
7222/*----------------------------------------------------------------------------
7223| Returns the result of subtracting the absolute values of the quadruple-
7224| precision floating-point values `a' and `b'.  If `zSign' is 1, the
7225| difference is negated before being returned.  `zSign' is ignored if the
7226| result is a NaN.  The subtraction is performed according to the IEC/IEEE
7227| Standard for Binary Floating-Point Arithmetic.
7228*----------------------------------------------------------------------------*/
7229
7230static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7231                                float_status *status)
7232{
7233    int32_t aExp, bExp, zExp;
7234    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7235    int32_t expDiff;
7236
7237    aSig1 = extractFloat128Frac1( a );
7238    aSig0 = extractFloat128Frac0( a );
7239    aExp = extractFloat128Exp( a );
7240    bSig1 = extractFloat128Frac1( b );
7241    bSig0 = extractFloat128Frac0( b );
7242    bExp = extractFloat128Exp( b );
7243    expDiff = aExp - bExp;
7244    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7245    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7246    if ( 0 < expDiff ) goto aExpBigger;
7247    if ( expDiff < 0 ) goto bExpBigger;
7248    if ( aExp == 0x7FFF ) {
7249        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7250            return propagateFloat128NaN(a, b, status);
7251        }
7252        float_raise(float_flag_invalid, status);
7253        return float128_default_nan(status);
7254    }
7255    if ( aExp == 0 ) {
7256        aExp = 1;
7257        bExp = 1;
7258    }
7259    if ( bSig0 < aSig0 ) goto aBigger;
7260    if ( aSig0 < bSig0 ) goto bBigger;
7261    if ( bSig1 < aSig1 ) goto aBigger;
7262    if ( aSig1 < bSig1 ) goto bBigger;
7263    return packFloat128(status->float_rounding_mode == float_round_down,
7264                        0, 0, 0);
7265 bExpBigger:
7266    if ( bExp == 0x7FFF ) {
7267        if (bSig0 | bSig1) {
7268            return propagateFloat128NaN(a, b, status);
7269        }
7270        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7271    }
7272    if ( aExp == 0 ) {
7273        ++expDiff;
7274    }
7275    else {
7276        aSig0 |= LIT64( 0x4000000000000000 );
7277    }
7278    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7279    bSig0 |= LIT64( 0x4000000000000000 );
7280 bBigger:
7281    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7282    zExp = bExp;
7283    zSign ^= 1;
7284    goto normalizeRoundAndPack;
7285 aExpBigger:
7286    if ( aExp == 0x7FFF ) {
7287        if (aSig0 | aSig1) {
7288            return propagateFloat128NaN(a, b, status);
7289        }
7290        return a;
7291    }
7292    if ( bExp == 0 ) {
7293        --expDiff;
7294    }
7295    else {
7296        bSig0 |= LIT64( 0x4000000000000000 );
7297    }
7298    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7299    aSig0 |= LIT64( 0x4000000000000000 );
7300 aBigger:
7301    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7302    zExp = aExp;
7303 normalizeRoundAndPack:
7304    --zExp;
7305    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7306                                         status);
7307
7308}
7309
7310/*----------------------------------------------------------------------------
7311| Returns the result of adding the quadruple-precision floating-point values
7312| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7313| for Binary Floating-Point Arithmetic.
7314*----------------------------------------------------------------------------*/
7315
7316float128 float128_add(float128 a, float128 b, float_status *status)
7317{
7318    flag aSign, bSign;
7319
7320    aSign = extractFloat128Sign( a );
7321    bSign = extractFloat128Sign( b );
7322    if ( aSign == bSign ) {
7323        return addFloat128Sigs(a, b, aSign, status);
7324    }
7325    else {
7326        return subFloat128Sigs(a, b, aSign, status);
7327    }
7328
7329}
7330
7331/*----------------------------------------------------------------------------
7332| Returns the result of subtracting the quadruple-precision floating-point
7333| values `a' and `b'.  The operation is performed according to the IEC/IEEE
7334| Standard for Binary Floating-Point Arithmetic.
7335*----------------------------------------------------------------------------*/
7336
7337float128 float128_sub(float128 a, float128 b, float_status *status)
7338{
7339    flag aSign, bSign;
7340
7341    aSign = extractFloat128Sign( a );
7342    bSign = extractFloat128Sign( b );
7343    if ( aSign == bSign ) {
7344        return subFloat128Sigs(a, b, aSign, status);
7345    }
7346    else {
7347        return addFloat128Sigs(a, b, aSign, status);
7348    }
7349
7350}
7351
7352/*----------------------------------------------------------------------------
7353| Returns the result of multiplying the quadruple-precision floating-point
7354| values `a' and `b'.  The operation is performed according to the IEC/IEEE
7355| Standard for Binary Floating-Point Arithmetic.
7356*----------------------------------------------------------------------------*/
7357
7358float128 float128_mul(float128 a, float128 b, float_status *status)
7359{
7360    flag aSign, bSign, zSign;
7361    int32_t aExp, bExp, zExp;
7362    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7363
7364    aSig1 = extractFloat128Frac1( a );
7365    aSig0 = extractFloat128Frac0( a );
7366    aExp = extractFloat128Exp( a );
7367    aSign = extractFloat128Sign( a );
7368    bSig1 = extractFloat128Frac1( b );
7369    bSig0 = extractFloat128Frac0( b );
7370    bExp = extractFloat128Exp( b );
7371    bSign = extractFloat128Sign( b );
7372    zSign = aSign ^ bSign;
7373    if ( aExp == 0x7FFF ) {
7374        if (    ( aSig0 | aSig1 )
7375             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7376            return propagateFloat128NaN(a, b, status);
7377        }
7378        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7379        return packFloat128( zSign, 0x7FFF, 0, 0 );
7380    }
7381    if ( bExp == 0x7FFF ) {
7382        if (bSig0 | bSig1) {
7383            return propagateFloat128NaN(a, b, status);
7384        }
7385        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7386 invalid:
7387            float_raise(float_flag_invalid, status);
7388            return float128_default_nan(status);
7389        }
7390        return packFloat128( zSign, 0x7FFF, 0, 0 );
7391    }
7392    if ( aExp == 0 ) {
7393        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7394        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7395    }
7396    if ( bExp == 0 ) {
7397        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7398        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7399    }
7400    zExp = aExp + bExp - 0x4000;
7401    aSig0 |= LIT64( 0x0001000000000000 );
7402    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7403    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7404    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7405    zSig2 |= ( zSig3 != 0 );
7406    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7407        shift128ExtraRightJamming(
7408            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7409        ++zExp;
7410    }
7411    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7412
7413}
7414
7415/*----------------------------------------------------------------------------
7416| Returns the result of dividing the quadruple-precision floating-point value
7417| `a' by the corresponding value `b'.  The operation is performed according to
7418| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7419*----------------------------------------------------------------------------*/
7420
7421float128 float128_div(float128 a, float128 b, float_status *status)
7422{
7423    flag aSign, bSign, zSign;
7424    int32_t aExp, bExp, zExp;
7425    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7426    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7427
7428    aSig1 = extractFloat128Frac1( a );
7429    aSig0 = extractFloat128Frac0( a );
7430    aExp = extractFloat128Exp( a );
7431    aSign = extractFloat128Sign( a );
7432    bSig1 = extractFloat128Frac1( b );
7433    bSig0 = extractFloat128Frac0( b );
7434    bExp = extractFloat128Exp( b );
7435    bSign = extractFloat128Sign( b );
7436    zSign = aSign ^ bSign;
7437    if ( aExp == 0x7FFF ) {
7438        if (aSig0 | aSig1) {
7439            return propagateFloat128NaN(a, b, status);
7440        }
7441        if ( bExp == 0x7FFF ) {
7442            if (bSig0 | bSig1) {
7443                return propagateFloat128NaN(a, b, status);
7444            }
7445            goto invalid;
7446        }
7447        return packFloat128( zSign, 0x7FFF, 0, 0 );
7448    }
7449    if ( bExp == 0x7FFF ) {
7450        if (bSig0 | bSig1) {
7451            return propagateFloat128NaN(a, b, status);
7452        }
7453        return packFloat128( zSign, 0, 0, 0 );
7454    }
7455    if ( bExp == 0 ) {
7456        if ( ( bSig0 | bSig1 ) == 0 ) {
7457            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7458 invalid:
7459                float_raise(float_flag_invalid, status);
7460                return float128_default_nan(status);
7461            }
7462            float_raise(float_flag_divbyzero, status);
7463            return packFloat128( zSign, 0x7FFF, 0, 0 );
7464        }
7465        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7466    }
7467    if ( aExp == 0 ) {
7468        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7469        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7470    }
7471    zExp = aExp - bExp + 0x3FFD;
7472    shortShift128Left(
7473        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7474    shortShift128Left(
7475        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7476    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7477        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7478        ++zExp;
7479    }
7480    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7481    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7482    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7483    while ( (int64_t) rem0 < 0 ) {
7484        --zSig0;
7485        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7486    }
7487    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7488    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7489        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7490        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7491        while ( (int64_t) rem1 < 0 ) {
7492            --zSig1;
7493            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7494        }
7495        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7496    }
7497    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7498    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7499
7500}
7501
7502/*----------------------------------------------------------------------------
7503| Returns the remainder of the quadruple-precision floating-point value `a'
7504| with respect to the corresponding value `b'.  The operation is performed
7505| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7506*----------------------------------------------------------------------------*/
7507
7508float128 float128_rem(float128 a, float128 b, float_status *status)
7509{
        /*
         * aSig0:aSig1 and bSig0:bSig1 hold the operands' significands as
         * (high word : low word) pairs.  q carries the most recent quotient
         * estimate; only its low bit is consulted at the very end, to break
         * a tie between the two candidate remainders.
         */
7510    flag aSign, zSign;
7511    int32_t aExp, bExp, expDiff;
7512    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7513    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7514    int64_t sigMean0;
7515
        /* Split both operands into fraction words, exponent and (for a) sign. */
7516    aSig1 = extractFloat128Frac1( a );
7517    aSig0 = extractFloat128Frac0( a );
7518    aExp = extractFloat128Exp( a );
7519    aSign = extractFloat128Sign( a );
7520    bSig1 = extractFloat128Frac1( b );
7521    bSig0 = extractFloat128Frac0( b );
7522    bExp = extractFloat128Exp( b );
        /*
         * a has the maximum exponent.  If a has a non-zero fraction (NaN) or
         * b is a NaN, the NaN is propagated; otherwise a has a zero fraction
         * (an infinity) and the operation is invalid.
         */
7523    if ( aExp == 0x7FFF ) {
7524        if (    ( aSig0 | aSig1 )
7525             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7526            return propagateFloat128NaN(a, b, status);
7527        }
7528        goto invalid;
7529    }
        /*
         * b has the maximum exponent: a NaN b is propagated, while for a b
         * with zero fraction (an infinity) the finite a is returned unchanged.
         */
7530    if ( bExp == 0x7FFF ) {
7531        if (bSig0 | bSig1) {
7532            return propagateFloat128NaN(a, b, status);
7533        }
7534        return a;
7535    }
        /*
         * b has a zero exponent: a zero fraction as well means b is zero, which
         * is invalid.  The `invalid' label is also the target of the goto in
         * the a-is-infinity case above.  Any other b is subnormal and is
         * normalized before use.
         */
7536    if ( bExp == 0 ) {
7537        if ( ( bSig0 | bSig1 ) == 0 ) {
7538 invalid:
7539            float_raise(float_flag_invalid, status);
7540            return float128_default_nan(status);
7541        }
7542        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7543    }
        /* A zero a is returned as is; a subnormal a is normalized. */
7544    if ( aExp == 0 ) {
7545        if ( ( aSig0 | aSig1 ) == 0 ) return a;
7546        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7547    }
        /*
         * When a's exponent is more than one below b's, a is returned
         * unchanged (no reduction is performed).
         */
7548    expDiff = aExp - bExp;
7549    if ( expDiff < -1 ) return a;
        /*
         * OR bit 48 (0x0001000000000000) into the high fraction word of each
         * operand -- presumably the hidden leading significand bit -- and shift
         * left: a by 15 bits, or by 14 when expDiff is negative (i.e. -1);
         * b always by 15.
         */
7550    shortShift128Left(
7551        aSig0 | LIT64( 0x0001000000000000 ),
7552        aSig1,
7553        15 - ( expDiff < 0 ),
7554        &aSig0,
7555        &aSig1
7556    );
7557    shortShift128Left(
7558        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
        /* q becomes non-zero when b <= a (128-bit compare); then b is subtracted once. */
7559    q = le128( bSig0, bSig1, aSig0, aSig1 );
7560    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
        /*
         * Consume the exponent difference 61 bits per iteration.  Each raw
         * quotient estimate is lowered by 4 and clamped at 0.
         * NOTE(review): presumably this keeps q*b from exceeding the running
         * remainder -- confirm against estimateDiv128To64's error bound.
         */
7561    expDiff -= 64;
7562    while ( 0 < expDiff ) {
7563        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7564        q = ( 4 < q ) ? q - 4 : 0;
            /*
             * term0:term1:term2 = b * q.  The 61-bit left shift writes its
             * three output words to term1, term2 and allZero; allZero is never
             * read afterwards.  Likewise only the first output word of the
             * shifted a is kept (in aSig0), and the subtraction uses aSig0:0.
             */
7565        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7566        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7567        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7568        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7569        expDiff -= 61;
7570    }
        /*
         * Final partial step, taken when the remaining (non-positive) expDiff
         * is above -64: the reduced estimate is shifted right by -expDiff,
         * b is shifted right by 12, a is realigned by expDiff + 52 (right when
         * that is negative, left otherwise), and the low 128 bits of b * q
         * (term1:term2) are subtracted from a.  Otherwise a and b are simply
         * both shifted right by 12.
         */
7571    if ( -64 < expDiff ) {
7572        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7573        q = ( 4 < q ) ? q - 4 : 0;
7574        q >>= - expDiff;
7575        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7576        expDiff += 52;
7577        if ( expDiff < 0 ) {
7578            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7579        }
7580        else {
7581            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7582        }
7583        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7584        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7585    }
7586    else {
7587        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7588        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7589    }
        /*
         * Keep subtracting b, incrementing q each time and remembering the
         * value before the subtraction in alternateASig0:alternateASig1, until
         * a becomes negative (top bit of aSig0 set).
         */
7590    do {
7591        alternateASig0 = aSig0;
7592        alternateASig1 = aSig1;
7593        ++q;
7594        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7595    } while ( 0 <= (int64_t) aSig0 );
        /*
         * sigMean = (negative candidate) + (last non-negative candidate).
         * If that sum is negative, or is exactly zero while q is odd, the
         * non-negative candidate is selected; otherwise the negative one
         * stays in aSig0:aSig1.
         */
7596    add128(
7597        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7598    if (    ( sigMean0 < 0 )
7599         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7600        aSig0 = alternateASig0;
7601        aSig1 = alternateASig1;
7602    }
        /*
         * If the chosen candidate is negative, negate it to a magnitude and
         * record that in zSign.  The result is packed with sign aSign ^ zSign
         * and starting exponent bExp - 4.
         */
7603    zSign = ( (int64_t) aSig0 < 0 );
7604    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7605    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7606                                         status);
7607}
7608
7609/*----------------------------------------------------------------------------
7610| Returns the square root of the quadruple-precision floating-point value `a'.
7611| The operation is performed according to the IEC/IEEE Standard for Binary
7612| Floating-Point Arithmetic.
7613*----------------------------------------------------------------------------*/
7614
7615float128 float128_sqrt(float128 a, float_status *status)
7616{
        /*
         * zSig0:zSig1 accumulate the root's significand (zSig2 receives the
         * bits shifted out at the end); rem0..rem3 hold the running remainder
         * and term0..term3 the products subtracted from / added back to it.
         */
7617    flag aSign;
7618    int32_t aExp, zExp;
7619    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7620    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7621
7622    aSig1 = extractFloat128Frac1( a );
7623    aSig0 = extractFloat128Frac0( a );
7624    aExp = extractFloat128Exp( a );
7625    aSign = extractFloat128Sign( a );
        /*
         * Maximum exponent: a non-zero fraction (NaN) is propagated; with a
         * zero fraction (an infinity) a positive a is returned as is and a
         * negative a is invalid.
         */
7626    if ( aExp == 0x7FFF ) {
7627        if (aSig0 | aSig1) {
7628            return propagateFloat128NaN(a, a, status);
7629        }
7630        if ( ! aSign ) return a;
7631        goto invalid;
7632    }
        /*
         * Negative input: if exponent and both fraction words are all zero
         * (negative zero) a is returned unchanged; every other negative value
         * raises the invalid flag and yields the default NaN.
         */
7633    if ( aSign ) {
7634        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7635 invalid:
7636        float_raise(float_flag_invalid, status);
7637        return float128_default_nan(status);
7638    }
        /*
         * Zero exponent: a zero fraction returns packFloat128( 0, 0, 0, 0 );
         * any other value is subnormal and is normalized first.
         */
7639    if ( aExp == 0 ) {
7640        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7641        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7642    }
        /* Result exponent: half of the offset from 0x3FFF, re-offset by 0x3FFE. */
7643    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
        /* OR in bit 48 -- presumably the hidden leading significand bit. */
7644    aSig0 |= LIT64( 0x0001000000000000 );
        /*
         * Start from estimateSqrt32() of the top significand bits, shift a
         * left by 13 (12 when aExp is odd), then refine the estimate with one
         * 128-by-64 division step.
         */
7645    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7646    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7647    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
        /*
         * rem0:rem1 = a - zSig0^2.  While that is negative, zSig0 is lowered
         * by one (doubleZSig0 by two) and ( zSig0>>63 : doubleZSig0 | 1 ) is
         * added back to the remainder.
         */
7648    doubleZSig0 = zSig0<<1;
7649    mul64To128( zSig0, zSig0, &term0, &term1 );
7650    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7651    while ( (int64_t) rem0 < 0 ) {
7652        --zSig0;
7653        doubleZSig0 -= 2;
7654        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7655    }
        /*
         * Low word of the root: estimated as rem1 / doubleZSig0.  The exact
         * correction below is only carried out when the low 13 bits of the
         * estimate are <= 5.
         * NOTE(review): presumably those are the cases where the estimate's
         * error could affect rounding -- confirm.
         */
7656    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7657    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
            /* A zero estimate is bumped to 1 before the correction. */
7658        if ( zSig1 == 0 ) zSig1 = 1;
            /* Subtract doubleZSig0 * zSig1 and then zSig1^2 from the remainder. */
7659        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7660        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7661        mul64To128( zSig1, zSig1, &term2, &term3 );
7662        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
            /*
             * While the remainder is negative, lower zSig1 by one and add
             * back ( doubleZSig0 combined with 2 * zSig1 + 1 ).
             */
7663        while ( (int64_t) rem1 < 0 ) {
7664            --zSig1;
7665            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7666            term3 |= 1;
7667            term2 |= doubleZSig0;
7668            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7669        }
            /* Set the low bit of zSig1 when any remainder bits are left over. */
7670        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7671    }
        /*
         * Shift the 128-bit root right by 14 (shifted-out bits go to zSig2)
         * and round/pack with a positive sign.
         */
7672    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7673    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7674
7675}
7676
7677/*----------------------------------------------------------------------------
7678| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7679| the corresponding value `b', and 0 otherwise.  The invalid exception is
7680| raised if either operand is a NaN.  Otherwise, the comparison is performed
7681| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7682*----------------------------------------------------------------------------*/
7683
7684int float128_eq(float128 a, float128 b, float_status *status)
7685{
7686
7687    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7688              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7689         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7690              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7691       ) {
7692        float_raise(float_flag_invalid, status);
7693        return 0;
7694    }
7695    return
7696           ( a.low == b.low )
7697        && (    ( a.high == b.high )
7698             || (    ( a.low == 0 )
7699                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7700           );
7701
7702}
7703
7704/*----------------------------------------------------------------------------
7705| Returns 1 if the quadruple-precision floating-point value `a' is less than
7706| or equal to the corresponding value `b', and 0 otherwise.  The invalid
7707| exception is raised if either operand is a NaN.  The comparison is performed
7708| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7709*----------------------------------------------------------------------------*/
7710
7711int float128_le(float128 a, float128 b, float_status *status)
7712{
7713    flag aSign, bSign;
7714
7715    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7716              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7717         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7718              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7719       ) {
7720        float_raise(float_flag_invalid, status);
7721        return 0;
7722    }
7723    aSign = extractFloat128Sign( a );
7724    bSign = extractFloat128Sign( b );
7725    if ( aSign != bSign ) {
7726        return
7727               aSign
7728            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7729                 == 0 );
7730    }
7731    return
7732          aSign ? le128( b.high, b.low, a.high, a.low )
7733        : le128( a.high, a.low, b.high, b.low );
7734
7735}
7736
7737/*----------------------------------------------------------------------------
7738| Returns 1 if the quadruple-precision floating-point value `a' is less than
7739| the corresponding value `b', and 0 otherwise.  The invalid exception is
7740| raised if either operand is a NaN.  The comparison is performed according
7741| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7742*----------------------------------------------------------------------------*/
7743
7744int float128_lt(float128 a, float128 b, float_status *status)
7745{
7746    flag aSign, bSign;
7747
7748    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7749              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7750         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7751              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7752       ) {
7753        float_raise(float_flag_invalid, status);
7754        return 0;
7755    }
7756    aSign = extractFloat128Sign( a );
7757    bSign = extractFloat128Sign( b );
7758    if ( aSign != bSign ) {
7759        return
7760               aSign
7761            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7762                 != 0 );
7763    }
7764    return
7765          aSign ? lt128( b.high, b.low, a.high, a.low )
7766        : lt128( a.high, a.low, b.high, b.low );
7767
7768}
7769
7770/*----------------------------------------------------------------------------
7771| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7772| be compared, and 0 otherwise.  The invalid exception is raised if either
7773| operand is a NaN. The comparison is performed according to the IEC/IEEE
7774| Standard for Binary Floating-Point Arithmetic.
7775*----------------------------------------------------------------------------*/
7776
7777int float128_unordered(float128 a, float128 b, float_status *status)
7778{
7779    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7780              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7781         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7782              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7783       ) {
7784        float_raise(float_flag_invalid, status);
7785        return 1;
7786    }
7787    return 0;
7788}
7789
7790/*----------------------------------------------------------------------------
7791| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7792| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7793| exception.  The comparison is performed according to the IEC/IEEE Standard
7794| for Binary Floating-Point Arithmetic.
7795*----------------------------------------------------------------------------*/
7796
7797int float128_eq_quiet(float128 a, float128 b, float_status *status)
7798{
7799
7800    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7801              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7802         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7803              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7804       ) {
7805        if (float128_is_signaling_nan(a, status)
7806         || float128_is_signaling_nan(b, status)) {
7807            float_raise(float_flag_invalid, status);
7808        }
7809        return 0;
7810    }
7811    return
7812           ( a.low == b.low )
7813        && (    ( a.high == b.high )
7814             || (    ( a.low == 0 )
7815                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7816           );
7817
7818}
7819
7820/*----------------------------------------------------------------------------
7821| Returns 1 if the quadruple-precision floating-point value `a' is less than
7822| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7823| cause an exception.  Otherwise, the comparison is performed according to the
7824| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7825*----------------------------------------------------------------------------*/
7826
7827int float128_le_quiet(float128 a, float128 b, float_status *status)
7828{
7829    flag aSign, bSign;
7830
7831    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7832              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7833         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7834              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7835       ) {
7836        if (float128_is_signaling_nan(a, status)
7837         || float128_is_signaling_nan(b, status)) {
7838            float_raise(float_flag_invalid, status);
7839        }
7840        return 0;
7841    }
7842    aSign = extractFloat128Sign( a );
7843    bSign = extractFloat128Sign( b );
7844    if ( aSign != bSign ) {
7845        return
7846               aSign
7847            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7848                 == 0 );
7849    }
7850    return
7851          aSign ? le128( b.high, b.low, a.high, a.low )
7852        : le128( a.high, a.low, b.high, b.low );
7853
7854}
7855
7856/*----------------------------------------------------------------------------
7857| Returns 1 if the quadruple-precision floating-point value `a' is less than
7858| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7859| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7860| Standard for Binary Floating-Point Arithmetic.
7861*----------------------------------------------------------------------------*/
7862
7863int float128_lt_quiet(float128 a, float128 b, float_status *status)
7864{
7865    flag aSign, bSign;
7866
7867    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7868              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7869         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7870              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7871       ) {
7872        if (float128_is_signaling_nan(a, status)
7873         || float128_is_signaling_nan(b, status)) {
7874            float_raise(float_flag_invalid, status);
7875        }
7876        return 0;
7877    }
7878    aSign = extractFloat128Sign( a );
7879    bSign = extractFloat128Sign( b );
7880    if ( aSign != bSign ) {
7881        return
7882               aSign
7883            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7884                 != 0 );
7885    }
7886    return
7887          aSign ? lt128( b.high, b.low, a.high, a.low )
7888        : lt128( a.high, a.low, b.high, b.low );
7889
7890}
7891
7892/*----------------------------------------------------------------------------
7893| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7894| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7895| comparison is performed according to the IEC/IEEE Standard for Binary
7896| Floating-Point Arithmetic.
7897*----------------------------------------------------------------------------*/
7898
7899int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7900{
7901    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7902              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7903         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7904              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7905       ) {
7906        if (float128_is_signaling_nan(a, status)
7907         || float128_is_signaling_nan(b, status)) {
7908            float_raise(float_flag_invalid, status);
7909        }
7910        return 1;
7911    }
7912    return 0;
7913}
7914
7915static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7916                                            int is_quiet, float_status *status)
7917{
7918    flag aSign, bSign;
7919
7920    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7921        float_raise(float_flag_invalid, status);
7922        return float_relation_unordered;
7923    }
7924    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7925          ( extractFloatx80Frac( a )<<1 ) ) ||
7926        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7927          ( extractFloatx80Frac( b )<<1 ) )) {
7928        if (!is_quiet ||
7929            floatx80_is_signaling_nan(a, status) ||
7930            floatx80_is_signaling_nan(b, status)) {
7931            float_raise(float_flag_invalid, status);
7932        }
7933        return float_relation_unordered;
7934    }
7935    aSign = extractFloatx80Sign( a );
7936    bSign = extractFloatx80Sign( b );
7937    if ( aSign != bSign ) {
7938
7939        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7940             ( ( a.low | b.low ) == 0 ) ) {
7941            /* zero case */
7942            return float_relation_equal;
7943        } else {
7944            return 1 - (2 * aSign);
7945        }
7946    } else {
7947        if (a.low == b.low && a.high == b.high) {
7948            return float_relation_equal;
7949        } else {
7950            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7951        }
7952    }
7953}
7954
7955int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7956{
7957    return floatx80_compare_internal(a, b, 0, status);
7958}
7959
7960int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7961{
7962    return floatx80_compare_internal(a, b, 1, status);
7963}
7964
7965static inline int float128_compare_internal(float128 a, float128 b,
7966                                            int is_quiet, float_status *status)
7967{
7968    flag aSign, bSign;
7969
7970    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7971          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7972        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7973          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7974        if (!is_quiet ||
7975            float128_is_signaling_nan(a, status) ||
7976            float128_is_signaling_nan(b, status)) {
7977            float_raise(float_flag_invalid, status);
7978        }
7979        return float_relation_unordered;
7980    }
7981    aSign = extractFloat128Sign( a );
7982    bSign = extractFloat128Sign( b );
7983    if ( aSign != bSign ) {
7984        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7985            /* zero case */
7986            return float_relation_equal;
7987        } else {
7988            return 1 - (2 * aSign);
7989        }
7990    } else {
7991        if (a.low == b.low && a.high == b.high) {
7992            return float_relation_equal;
7993        } else {
7994            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7995        }
7996    }
7997}
7998
7999int float128_compare(float128 a, float128 b, float_status *status)
8000{
8001    return float128_compare_internal(a, b, 0, status);
8002}
8003
8004int float128_compare_quiet(float128 a, float128 b, float_status *status)
8005{
8006    return float128_compare_internal(a, b, 1, status);
8007}
8008
8009floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8010{
8011    flag aSign;
8012    int32_t aExp;
8013    uint64_t aSig;
8014
8015    if (floatx80_invalid_encoding(a)) {
8016        float_raise(float_flag_invalid, status);
8017        return floatx80_default_nan(status);
8018    }
8019    aSig = extractFloatx80Frac( a );
8020    aExp = extractFloatx80Exp( a );
8021    aSign = extractFloatx80Sign( a );
8022
8023    if ( aExp == 0x7FFF ) {
8024        if ( aSig<<1 ) {
8025            return propagateFloatx80NaN(a, a, status);
8026        }
8027        return a;
8028    }
8029
8030    if (aExp == 0) {
8031        if (aSig == 0) {
8032            return a;
8033        }
8034        aExp++;
8035    }
8036
8037    if (n > 0x10000) {
8038        n = 0x10000;
8039    } else if (n < -0x10000) {
8040        n = -0x10000;
8041    }
8042
8043    aExp += n;
8044    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8045                                         aSign, aExp, aSig, 0, status);
8046}
8047
8048float128 float128_scalbn(float128 a, int n, float_status *status)
8049{
8050    flag aSign;
8051    int32_t aExp;
8052    uint64_t aSig0, aSig1;
8053
8054    aSig1 = extractFloat128Frac1( a );
8055    aSig0 = extractFloat128Frac0( a );
8056    aExp = extractFloat128Exp( a );
8057    aSign = extractFloat128Sign( a );
8058    if ( aExp == 0x7FFF ) {
8059        if ( aSig0 | aSig1 ) {
8060            return propagateFloat128NaN(a, a, status);
8061        }
8062        return a;
8063    }
8064    if (aExp != 0) {
8065        aSig0 |= LIT64( 0x0001000000000000 );
8066    } else if (aSig0 == 0 && aSig1 == 0) {
8067        return a;
8068    } else {
8069        aExp++;
8070    }
8071
8072    if (n > 0x10000) {
8073        n = 0x10000;
8074    } else if (n < -0x10000) {
8075        n = -0x10000;
8076    }
8077
8078    aExp += n - 1;
8079    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8080                                         , status);
8081
8082}
8083
8084static void __attribute__((constructor)) softfloat_init(void)
8085{
8086    union_float64 ua, ub, uc, ur;
8087
8088    if (QEMU_NO_HARDFLOAT) {
8089        return;
8090    }
8091    /*
8092     * Test that the host's FMA is not obviously broken. For example,
8093     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8094     *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8095     */
8096    ua.s = 0x0020000000000001ULL;
8097    ub.s = 0x3ca0000000000000ULL;
8098    uc.s = 0x0020000000000000ULL;
8099    ur.h = fma(ua.h, ub.h, uc.h);
8100    if (ur.s != 0x0020000000000001ULL) {
8101        force_soft_fma = true;
8102    }
8103}
8104