qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86#include "qemu/bitops.h"
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "fpu/softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Returns the fraction bits of the half-precision floating-point value `a'.
 100*----------------------------------------------------------------------------*/
 101
 102static inline uint32_t extractFloat16Frac(float16 a)
 103{
 104    return float16_val(a) & 0x3ff;
 105}
 106
 107/*----------------------------------------------------------------------------
 108| Returns the exponent bits of the half-precision floating-point value `a'.
 109*----------------------------------------------------------------------------*/
 110
 111static inline int extractFloat16Exp(float16 a)
 112{
 113    return (float16_val(a) >> 10) & 0x1f;
 114}
 115
 116/*----------------------------------------------------------------------------
 117| Returns the fraction bits of the single-precision floating-point value `a'.
 118*----------------------------------------------------------------------------*/
 119
 120static inline uint32_t extractFloat32Frac(float32 a)
 121{
 122    return float32_val(a) & 0x007FFFFF;
 123}
 124
 125/*----------------------------------------------------------------------------
 126| Returns the exponent bits of the single-precision floating-point value `a'.
 127*----------------------------------------------------------------------------*/
 128
 129static inline int extractFloat32Exp(float32 a)
 130{
 131    return (float32_val(a) >> 23) & 0xFF;
 132}
 133
 134/*----------------------------------------------------------------------------
 135| Returns the sign bit of the single-precision floating-point value `a'.
 136*----------------------------------------------------------------------------*/
 137
 138static inline flag extractFloat32Sign(float32 a)
 139{
 140    return float32_val(a) >> 31;
 141}
 142
 143/*----------------------------------------------------------------------------
 144| Returns the fraction bits of the double-precision floating-point value `a'.
 145*----------------------------------------------------------------------------*/
 146
 147static inline uint64_t extractFloat64Frac(float64 a)
 148{
 149    return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 150}
 151
 152/*----------------------------------------------------------------------------
 153| Returns the exponent bits of the double-precision floating-point value `a'.
 154*----------------------------------------------------------------------------*/
 155
 156static inline int extractFloat64Exp(float64 a)
 157{
 158    return (float64_val(a) >> 52) & 0x7FF;
 159}
 160
 161/*----------------------------------------------------------------------------
 162| Returns the sign bit of the double-precision floating-point value `a'.
 163*----------------------------------------------------------------------------*/
 164
 165static inline flag extractFloat64Sign(float64 a)
 166{
 167    return float64_val(a) >> 63;
 168}
 169
 170/*
 171 * Classify a floating point number. Everything above float_class_qnan
 172 * is a NaN so cls >= float_class_qnan is any NaN.
 173 */
 174
 175typedef enum __attribute__ ((__packed__)) {
 176    float_class_unclassified,
 177    float_class_zero,
 178    float_class_normal,
 179    float_class_inf,
 180    float_class_qnan,  /* all NaNs from here */
 181    float_class_snan,
 182} FloatClass;
 183
 184/* Simple helpers for checking if, or what kind of, NaN we have */
 185static inline __attribute__((unused)) bool is_nan(FloatClass c)
 186{
 187    return unlikely(c >= float_class_qnan);
 188}
 189
 190static inline __attribute__((unused)) bool is_snan(FloatClass c)
 191{
 192    return c == float_class_snan;
 193}
 194
 195static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 196{
 197    return c == float_class_qnan;
 198}
 199
 200/*
 201 * Structure holding all of the decomposed parts of a float. The
 202 * exponent is unbiased and the fraction is normalized. All
 203 * calculations are done with a 64 bit fraction and then rounded as
 204 * appropriate for the final format.
 205 *
 206 * Thanks to the packed FloatClass a decent compiler should be able to
 207 * fit the whole structure into registers and avoid using the stack
 208 * for parameter passing.
 209 */
 210
 211typedef struct {
 212    uint64_t frac;
 213    int32_t  exp;
 214    FloatClass cls;
 215    bool sign;
 216} FloatParts;
 217
 218#define DECOMPOSED_BINARY_POINT    (64 - 2)
 219#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 220#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 221
 222/* Structure holding all of the relevant parameters for a format.
 223 *   exp_size: the size of the exponent field
 224 *   exp_bias: the offset applied to the exponent field
 225 *   exp_max: the maximum normalised exponent
 226 *   frac_size: the size of the fraction field
 227 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 228 * The following are computed based the size of fraction
 229 *   frac_lsb: least significant bit of fraction
 230 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 231 *   round_mask/roundeven_mask: masks used for rounding
 232 * The following optional modifiers are available:
 233 *   arm_althp: handle ARM Alternative Half Precision
 234 */
 235typedef struct {
 236    int exp_size;
 237    int exp_bias;
 238    int exp_max;
 239    int frac_size;
 240    int frac_shift;
 241    uint64_t frac_lsb;
 242    uint64_t frac_lsbm1;
 243    uint64_t round_mask;
 244    uint64_t roundeven_mask;
 245    bool arm_althp;
 246} FloatFmt;
 247
 248/* Expand fields based on the size of exponent and fraction */
 249#define FLOAT_PARAMS(E, F)                                           \
 250    .exp_size       = E,                                             \
 251    .exp_bias       = ((1 << E) - 1) >> 1,                           \
 252    .exp_max        = (1 << E) - 1,                                  \
 253    .frac_size      = F,                                             \
 254    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 255    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 256    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 257    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 258    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 259
 260static const FloatFmt float16_params = {
 261    FLOAT_PARAMS(5, 10)
 262};
 263
 264static const FloatFmt float16_params_ahp = {
 265    FLOAT_PARAMS(5, 10),
 266    .arm_althp = true
 267};
 268
 269static const FloatFmt float32_params = {
 270    FLOAT_PARAMS(8, 23)
 271};
 272
 273static const FloatFmt float64_params = {
 274    FLOAT_PARAMS(11, 52)
 275};
 276
 277/* Unpack a float to parts, but do not canonicalize.  */
 278static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 279{
 280    const int sign_pos = fmt.frac_size + fmt.exp_size;
 281
 282    return (FloatParts) {
 283        .cls = float_class_unclassified,
 284        .sign = extract64(raw, sign_pos, 1),
 285        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 286        .frac = extract64(raw, 0, fmt.frac_size),
 287    };
 288}
 289
 290static inline FloatParts float16_unpack_raw(float16 f)
 291{
 292    return unpack_raw(float16_params, f);
 293}
 294
 295static inline FloatParts float32_unpack_raw(float32 f)
 296{
 297    return unpack_raw(float32_params, f);
 298}
 299
 300static inline FloatParts float64_unpack_raw(float64 f)
 301{
 302    return unpack_raw(float64_params, f);
 303}
 304
 305/* Pack a float from parts, but do not canonicalize.  */
 306static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 307{
 308    const int sign_pos = fmt.frac_size + fmt.exp_size;
 309    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 310    return deposit64(ret, sign_pos, 1, p.sign);
 311}
 312
 313static inline float16 float16_pack_raw(FloatParts p)
 314{
 315    return make_float16(pack_raw(float16_params, p));
 316}
 317
 318static inline float32 float32_pack_raw(FloatParts p)
 319{
 320    return make_float32(pack_raw(float32_params, p));
 321}
 322
 323static inline float64 float64_pack_raw(FloatParts p)
 324{
 325    return make_float64(pack_raw(float64_params, p));
 326}
 327
 328/*----------------------------------------------------------------------------
 329| Functions and definitions to determine:  (1) whether tininess for underflow
 330| is detected before or after rounding by default, (2) what (if anything)
 331| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 332| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 333| are propagated from function inputs to output.  These details are target-
 334| specific.
 335*----------------------------------------------------------------------------*/
 336#include "softfloat-specialize.h"
 337
 338/* Canonicalize EXP and FRAC, setting CLS.  */
 339static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 340                               float_status *status)
 341{
 342    if (part.exp == parm->exp_max && !parm->arm_althp) {
 343        if (part.frac == 0) {
 344            part.cls = float_class_inf;
 345        } else {
 346            part.frac <<= parm->frac_shift;
 347            part.cls = (parts_is_snan_frac(part.frac, status)
 348                        ? float_class_snan : float_class_qnan);
 349        }
 350    } else if (part.exp == 0) {
 351        if (likely(part.frac == 0)) {
 352            part.cls = float_class_zero;
 353        } else if (status->flush_inputs_to_zero) {
 354            float_raise(float_flag_input_denormal, status);
 355            part.cls = float_class_zero;
 356            part.frac = 0;
 357        } else {
 358            int shift = clz64(part.frac) - 1;
 359            part.cls = float_class_normal;
 360            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 361            part.frac <<= shift;
 362        }
 363    } else {
 364        part.cls = float_class_normal;
 365        part.exp -= parm->exp_bias;
 366        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 367    }
 368    return part;
 369}
 370
 371/* Round and uncanonicalize a floating-point number by parts. There
 372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 373 * fraction; these bits will be removed. The exponent will be biased
 374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 375 */
 376
 377static FloatParts round_canonical(FloatParts p, float_status *s,
 378                                  const FloatFmt *parm)
 379{
 380    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 381    const uint64_t round_mask = parm->round_mask;
 382    const uint64_t roundeven_mask = parm->roundeven_mask;
 383    const int exp_max = parm->exp_max;
 384    const int frac_shift = parm->frac_shift;
 385    uint64_t frac, inc;
 386    int exp, flags = 0;
 387    bool overflow_norm;
 388
 389    frac = p.frac;
 390    exp = p.exp;
 391
 392    switch (p.cls) {
 393    case float_class_normal:
 394        switch (s->float_rounding_mode) {
 395        case float_round_nearest_even:
 396            overflow_norm = false;
 397            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 398            break;
 399        case float_round_ties_away:
 400            overflow_norm = false;
 401            inc = frac_lsbm1;
 402            break;
 403        case float_round_to_zero:
 404            overflow_norm = true;
 405            inc = 0;
 406            break;
 407        case float_round_up:
 408            inc = p.sign ? 0 : round_mask;
 409            overflow_norm = p.sign;
 410            break;
 411        case float_round_down:
 412            inc = p.sign ? round_mask : 0;
 413            overflow_norm = !p.sign;
 414            break;
 415        default:
 416            g_assert_not_reached();
 417        }
 418
 419        exp += parm->exp_bias;
 420        if (likely(exp > 0)) {
 421            if (frac & round_mask) {
 422                flags |= float_flag_inexact;
 423                frac += inc;
 424                if (frac & DECOMPOSED_OVERFLOW_BIT) {
 425                    frac >>= 1;
 426                    exp++;
 427                }
 428            }
 429            frac >>= frac_shift;
 430
 431            if (parm->arm_althp) {
 432                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 433                if (unlikely(exp > exp_max)) {
 434                    /* Overflow.  Return the maximum normal.  */
 435                    flags = float_flag_invalid;
 436                    exp = exp_max;
 437                    frac = -1;
 438                }
 439            } else if (unlikely(exp >= exp_max)) {
 440                flags |= float_flag_overflow | float_flag_inexact;
 441                if (overflow_norm) {
 442                    exp = exp_max - 1;
 443                    frac = -1;
 444                } else {
 445                    p.cls = float_class_inf;
 446                    goto do_inf;
 447                }
 448            }
 449        } else if (s->flush_to_zero) {
 450            flags |= float_flag_output_denormal;
 451            p.cls = float_class_zero;
 452            goto do_zero;
 453        } else {
 454            bool is_tiny = (s->float_detect_tininess
 455                            == float_tininess_before_rounding)
 456                        || (exp < 0)
 457                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 458
 459            shift64RightJamming(frac, 1 - exp, &frac);
 460            if (frac & round_mask) {
 461                /* Need to recompute round-to-even.  */
 462                if (s->float_rounding_mode == float_round_nearest_even) {
 463                    inc = ((frac & roundeven_mask) != frac_lsbm1
 464                           ? frac_lsbm1 : 0);
 465                }
 466                flags |= float_flag_inexact;
 467                frac += inc;
 468            }
 469
 470            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 471            frac >>= frac_shift;
 472
 473            if (is_tiny && (flags & float_flag_inexact)) {
 474                flags |= float_flag_underflow;
 475            }
 476            if (exp == 0 && frac == 0) {
 477                p.cls = float_class_zero;
 478            }
 479        }
 480        break;
 481
 482    case float_class_zero:
 483    do_zero:
 484        exp = 0;
 485        frac = 0;
 486        break;
 487
 488    case float_class_inf:
 489    do_inf:
 490        assert(!parm->arm_althp);
 491        exp = exp_max;
 492        frac = 0;
 493        break;
 494
 495    case float_class_qnan:
 496    case float_class_snan:
 497        assert(!parm->arm_althp);
 498        exp = exp_max;
 499        frac >>= parm->frac_shift;
 500        break;
 501
 502    default:
 503        g_assert_not_reached();
 504    }
 505
 506    float_raise(flags, s);
 507    p.exp = exp;
 508    p.frac = frac;
 509    return p;
 510}
 511
 512/* Explicit FloatFmt version */
 513static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 514                                            const FloatFmt *params)
 515{
 516    return canonicalize(float16_unpack_raw(f), params, s);
 517}
 518
 519static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 520{
 521    return float16a_unpack_canonical(f, s, &float16_params);
 522}
 523
 524static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 525                                             const FloatFmt *params)
 526{
 527    return float16_pack_raw(round_canonical(p, s, params));
 528}
 529
 530static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 531{
 532    return float16a_round_pack_canonical(p, s, &float16_params);
 533}
 534
 535static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 536{
 537    return canonicalize(float32_unpack_raw(f), &float32_params, s);
 538}
 539
 540static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 541{
 542    return float32_pack_raw(round_canonical(p, s, &float32_params));
 543}
 544
 545static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 546{
 547    return canonicalize(float64_unpack_raw(f), &float64_params, s);
 548}
 549
 550static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 551{
 552    return float64_pack_raw(round_canonical(p, s, &float64_params));
 553}
 554
 555static FloatParts return_nan(FloatParts a, float_status *s)
 556{
 557    switch (a.cls) {
 558    case float_class_snan:
 559        s->float_exception_flags |= float_flag_invalid;
 560        a = parts_silence_nan(a, s);
 561        /* fall through */
 562    case float_class_qnan:
 563        if (s->default_nan_mode) {
 564            return parts_default_nan(s);
 565        }
 566        break;
 567
 568    default:
 569        g_assert_not_reached();
 570    }
 571    return a;
 572}
 573
 574static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 575{
 576    if (is_snan(a.cls) || is_snan(b.cls)) {
 577        s->float_exception_flags |= float_flag_invalid;
 578    }
 579
 580    if (s->default_nan_mode) {
 581        return parts_default_nan(s);
 582    } else {
 583        if (pickNaN(a.cls, b.cls,
 584                    a.frac > b.frac ||
 585                    (a.frac == b.frac && a.sign < b.sign))) {
 586            a = b;
 587        }
 588        if (is_snan(a.cls)) {
 589            return parts_silence_nan(a, s);
 590        }
 591    }
 592    return a;
 593}
 594
 595static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 596                                  bool inf_zero, float_status *s)
 597{
 598    int which;
 599
 600    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 601        s->float_exception_flags |= float_flag_invalid;
 602    }
 603
 604    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 605
 606    if (s->default_nan_mode) {
 607        /* Note that this check is after pickNaNMulAdd so that function
 608         * has an opportunity to set the Invalid flag.
 609         */
 610        which = 3;
 611    }
 612
 613    switch (which) {
 614    case 0:
 615        break;
 616    case 1:
 617        a = b;
 618        break;
 619    case 2:
 620        a = c;
 621        break;
 622    case 3:
 623        return parts_default_nan(s);
 624    default:
 625        g_assert_not_reached();
 626    }
 627
 628    if (is_snan(a.cls)) {
 629        return parts_silence_nan(a, s);
 630    }
 631    return a;
 632}
 633
 634/*
 635 * Returns the result of adding or subtracting the values of the
 636 * floating-point values `a' and `b'. The operation is performed
 637 * according to the IEC/IEEE Standard for Binary Floating-Point
 638 * Arithmetic.
 639 */
 640
 641static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 642                                float_status *s)
 643{
 644    bool a_sign = a.sign;
 645    bool b_sign = b.sign ^ subtract;
 646
 647    if (a_sign != b_sign) {
 648        /* Subtraction */
 649
 650        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 651            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 652                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 653                a.frac = a.frac - b.frac;
 654            } else {
 655                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 656                a.frac = b.frac - a.frac;
 657                a.exp = b.exp;
 658                a_sign ^= 1;
 659            }
 660
 661            if (a.frac == 0) {
 662                a.cls = float_class_zero;
 663                a.sign = s->float_rounding_mode == float_round_down;
 664            } else {
 665                int shift = clz64(a.frac) - 1;
 666                a.frac = a.frac << shift;
 667                a.exp = a.exp - shift;
 668                a.sign = a_sign;
 669            }
 670            return a;
 671        }
 672        if (is_nan(a.cls) || is_nan(b.cls)) {
 673            return pick_nan(a, b, s);
 674        }
 675        if (a.cls == float_class_inf) {
 676            if (b.cls == float_class_inf) {
 677                float_raise(float_flag_invalid, s);
 678                return parts_default_nan(s);
 679            }
 680            return a;
 681        }
 682        if (a.cls == float_class_zero && b.cls == float_class_zero) {
 683            a.sign = s->float_rounding_mode == float_round_down;
 684            return a;
 685        }
 686        if (a.cls == float_class_zero || b.cls == float_class_inf) {
 687            b.sign = a_sign ^ 1;
 688            return b;
 689        }
 690        if (b.cls == float_class_zero) {
 691            return a;
 692        }
 693    } else {
 694        /* Addition */
 695        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 696            if (a.exp > b.exp) {
 697                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 698            } else if (a.exp < b.exp) {
 699                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 700                a.exp = b.exp;
 701            }
 702            a.frac += b.frac;
 703            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 704                shift64RightJamming(a.frac, 1, &a.frac);
 705                a.exp += 1;
 706            }
 707            return a;
 708        }
 709        if (is_nan(a.cls) || is_nan(b.cls)) {
 710            return pick_nan(a, b, s);
 711        }
 712        if (a.cls == float_class_inf || b.cls == float_class_zero) {
 713            return a;
 714        }
 715        if (b.cls == float_class_inf || a.cls == float_class_zero) {
 716            b.sign = b_sign;
 717            return b;
 718        }
 719    }
 720    g_assert_not_reached();
 721}
 722
 723/*
 724 * Returns the result of adding or subtracting the floating-point
 725 * values `a' and `b'. The operation is performed according to the
 726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 727 */
 728
 729float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
 730{
 731    FloatParts pa = float16_unpack_canonical(a, status);
 732    FloatParts pb = float16_unpack_canonical(b, status);
 733    FloatParts pr = addsub_floats(pa, pb, false, status);
 734
 735    return float16_round_pack_canonical(pr, status);
 736}
 737
 738float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status)
 739{
 740    FloatParts pa = float32_unpack_canonical(a, status);
 741    FloatParts pb = float32_unpack_canonical(b, status);
 742    FloatParts pr = addsub_floats(pa, pb, false, status);
 743
 744    return float32_round_pack_canonical(pr, status);
 745}
 746
 747float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status)
 748{
 749    FloatParts pa = float64_unpack_canonical(a, status);
 750    FloatParts pb = float64_unpack_canonical(b, status);
 751    FloatParts pr = addsub_floats(pa, pb, false, status);
 752
 753    return float64_round_pack_canonical(pr, status);
 754}
 755
 756float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
 757{
 758    FloatParts pa = float16_unpack_canonical(a, status);
 759    FloatParts pb = float16_unpack_canonical(b, status);
 760    FloatParts pr = addsub_floats(pa, pb, true, status);
 761
 762    return float16_round_pack_canonical(pr, status);
 763}
 764
 765float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status)
 766{
 767    FloatParts pa = float32_unpack_canonical(a, status);
 768    FloatParts pb = float32_unpack_canonical(b, status);
 769    FloatParts pr = addsub_floats(pa, pb, true, status);
 770
 771    return float32_round_pack_canonical(pr, status);
 772}
 773
 774float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status)
 775{
 776    FloatParts pa = float64_unpack_canonical(a, status);
 777    FloatParts pb = float64_unpack_canonical(b, status);
 778    FloatParts pr = addsub_floats(pa, pb, true, status);
 779
 780    return float64_round_pack_canonical(pr, status);
 781}
 782
 783/*
 784 * Returns the result of multiplying the floating-point values `a' and
 785 * `b'. The operation is performed according to the IEC/IEEE Standard
 786 * for Binary Floating-Point Arithmetic.
 787 */
 788
 789static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 790{
 791    bool sign = a.sign ^ b.sign;
 792
 793    if (a.cls == float_class_normal && b.cls == float_class_normal) {
 794        uint64_t hi, lo;
 795        int exp = a.exp + b.exp;
 796
 797        mul64To128(a.frac, b.frac, &hi, &lo);
 798        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 799        if (lo & DECOMPOSED_OVERFLOW_BIT) {
 800            shift64RightJamming(lo, 1, &lo);
 801            exp += 1;
 802        }
 803
 804        /* Re-use a */
 805        a.exp = exp;
 806        a.sign = sign;
 807        a.frac = lo;
 808        return a;
 809    }
 810    /* handle all the NaN cases */
 811    if (is_nan(a.cls) || is_nan(b.cls)) {
 812        return pick_nan(a, b, s);
 813    }
 814    /* Inf * Zero == NaN */
 815    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 816        (a.cls == float_class_zero && b.cls == float_class_inf)) {
 817        s->float_exception_flags |= float_flag_invalid;
 818        return parts_default_nan(s);
 819    }
 820    /* Multiply by 0 or Inf */
 821    if (a.cls == float_class_inf || a.cls == float_class_zero) {
 822        a.sign = sign;
 823        return a;
 824    }
 825    if (b.cls == float_class_inf || b.cls == float_class_zero) {
 826        b.sign = sign;
 827        return b;
 828    }
 829    g_assert_not_reached();
 830}
 831
 832float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
 833{
 834    FloatParts pa = float16_unpack_canonical(a, status);
 835    FloatParts pb = float16_unpack_canonical(b, status);
 836    FloatParts pr = mul_floats(pa, pb, status);
 837
 838    return float16_round_pack_canonical(pr, status);
 839}
 840
 841float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
 842{
 843    FloatParts pa = float32_unpack_canonical(a, status);
 844    FloatParts pb = float32_unpack_canonical(b, status);
 845    FloatParts pr = mul_floats(pa, pb, status);
 846
 847    return float32_round_pack_canonical(pr, status);
 848}
 849
 850float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
 851{
 852    FloatParts pa = float64_unpack_canonical(a, status);
 853    FloatParts pb = float64_unpack_canonical(b, status);
 854    FloatParts pr = mul_floats(pa, pb, status);
 855
 856    return float64_round_pack_canonical(pr, status);
 857}
 858
 859/*
 860 * Returns the result of multiplying the floating-point values `a' and
 861 * `b' then adding 'c', with no intermediate rounding step after the
 862 * multiplication. The operation is performed according to the
 863 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 864 * The flags argument allows the caller to select negation of the
 865 * addend, the intermediate product, or the final result. (The
 866 * difference between this and having the caller do a separate
 867 * negation is that negating externally will flip the sign bit on
 868 * NaNs.)
 869 */
 870
 871static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 872                                int flags, float_status *s)
 873{
 874    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 875                    ((1 << float_class_inf) | (1 << float_class_zero));
 876    bool p_sign;
 877    bool sign_flip = flags & float_muladd_negate_result;
 878    FloatClass p_class;
 879    uint64_t hi, lo;
 880    int p_exp;
 881
 882    /* It is implementation-defined whether the cases of (0,inf,qnan)
 883     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 884     * they return if they do), so we have to hand this information
 885     * off to the target-specific pick-a-NaN routine.
 886     */
 887    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 888        return pick_nan_muladd(a, b, c, inf_zero, s);
 889    }
 890
 891    if (inf_zero) {
 892        s->float_exception_flags |= float_flag_invalid;
 893        return parts_default_nan(s);
 894    }
 895
 896    if (flags & float_muladd_negate_c) {
 897        c.sign ^= 1;
 898    }
 899
 900    p_sign = a.sign ^ b.sign;
 901
 902    if (flags & float_muladd_negate_product) {
 903        p_sign ^= 1;
 904    }
 905
 906    if (a.cls == float_class_inf || b.cls == float_class_inf) {
 907        p_class = float_class_inf;
 908    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 909        p_class = float_class_zero;
 910    } else {
 911        p_class = float_class_normal;
 912    }
 913
 914    if (c.cls == float_class_inf) {
 915        if (p_class == float_class_inf && p_sign != c.sign) {
 916            s->float_exception_flags |= float_flag_invalid;
 917            return parts_default_nan(s);
 918        } else {
 919            a.cls = float_class_inf;
 920            a.sign = c.sign ^ sign_flip;
 921            return a;
 922        }
 923    }
 924
 925    if (p_class == float_class_inf) {
 926        a.cls = float_class_inf;
 927        a.sign = p_sign ^ sign_flip;
 928        return a;
 929    }
 930
 931    if (p_class == float_class_zero) {
 932        if (c.cls == float_class_zero) {
 933            if (p_sign != c.sign) {
 934                p_sign = s->float_rounding_mode == float_round_down;
 935            }
 936            c.sign = p_sign;
 937        } else if (flags & float_muladd_halve_result) {
 938            c.exp -= 1;
 939        }
 940        c.sign ^= sign_flip;
 941        return c;
 942    }
 943
 944    /* a & b should be normals now... */
 945    assert(a.cls == float_class_normal &&
 946           b.cls == float_class_normal);
 947
 948    p_exp = a.exp + b.exp;
 949
 950    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 951     * result.
 952     */
 953    mul64To128(a.frac, b.frac, &hi, &lo);
 954    /* binary point now at bit 124 */
 955
 956    /* check for overflow */
 957    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 958        shift128RightJamming(hi, lo, 1, &hi, &lo);
 959        p_exp += 1;
 960    }
 961
 962    /* + add/sub */
 963    if (c.cls == float_class_zero) {
 964        /* move binary point back to 62 */
 965        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 966    } else {
 967        int exp_diff = p_exp - c.exp;
 968        if (p_sign == c.sign) {
 969            /* Addition */
 970            if (exp_diff <= 0) {
 971                shift128RightJamming(hi, lo,
 972                                     DECOMPOSED_BINARY_POINT - exp_diff,
 973                                     &hi, &lo);
 974                lo += c.frac;
 975                p_exp = c.exp;
 976            } else {
 977                uint64_t c_hi, c_lo;
 978                /* shift c to the same binary point as the product (124) */
 979                c_hi = c.frac >> 2;
 980                c_lo = 0;
 981                shift128RightJamming(c_hi, c_lo,
 982                                     exp_diff,
 983                                     &c_hi, &c_lo);
 984                add128(hi, lo, c_hi, c_lo, &hi, &lo);
 985                /* move binary point back to 62 */
 986                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 987            }
 988
 989            if (lo & DECOMPOSED_OVERFLOW_BIT) {
 990                shift64RightJamming(lo, 1, &lo);
 991                p_exp += 1;
 992            }
 993
 994        } else {
 995            /* Subtraction */
 996            uint64_t c_hi, c_lo;
 997            /* make C binary point match product at bit 124 */
 998            c_hi = c.frac >> 2;
 999            c_lo = 0;
1000
1001            if (exp_diff <= 0) {
1002                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1003                if (exp_diff == 0
1004                    &&
1005                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1006                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1007                } else {
1008                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1009                    p_sign ^= 1;
1010                    p_exp = c.exp;
1011                }
1012            } else {
1013                shift128RightJamming(c_hi, c_lo,
1014                                     exp_diff,
1015                                     &c_hi, &c_lo);
1016                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1017            }
1018
1019            if (hi == 0 && lo == 0) {
1020                a.cls = float_class_zero;
1021                a.sign = s->float_rounding_mode == float_round_down;
1022                a.sign ^= sign_flip;
1023                return a;
1024            } else {
1025                int shift;
1026                if (hi != 0) {
1027                    shift = clz64(hi);
1028                } else {
1029                    shift = clz64(lo) + 64;
1030                }
1031                /* Normalizing to a binary point of 124 is the
1032                   correct adjust for the exponent.  However since we're
1033                   shifting, we might as well put the binary point back
1034                   at 62 where we really want it.  Therefore shift as
1035                   if we're leaving 1 bit at the top of the word, but
1036                   adjust the exponent as if we're leaving 3 bits.  */
1037                shift -= 1;
1038                if (shift >= 64) {
1039                    lo = lo << (shift - 64);
1040                } else {
1041                    hi = (hi << shift) | (lo >> (64 - shift));
1042                    lo = hi | ((lo << shift) != 0);
1043                }
1044                p_exp -= shift - 2;
1045            }
1046        }
1047    }
1048
1049    if (flags & float_muladd_halve_result) {
1050        p_exp -= 1;
1051    }
1052
1053    /* finally prepare our result */
1054    a.cls = float_class_normal;
1055    a.sign = p_sign ^ sign_flip;
1056    a.exp = p_exp;
1057    a.frac = lo;
1058
1059    return a;
1060}
1061
1062float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1063                                                int flags, float_status *status)
1064{
1065    FloatParts pa = float16_unpack_canonical(a, status);
1066    FloatParts pb = float16_unpack_canonical(b, status);
1067    FloatParts pc = float16_unpack_canonical(c, status);
1068    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1069
1070    return float16_round_pack_canonical(pr, status);
1071}
1072
1073float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1074                                                int flags, float_status *status)
1075{
1076    FloatParts pa = float32_unpack_canonical(a, status);
1077    FloatParts pb = float32_unpack_canonical(b, status);
1078    FloatParts pc = float32_unpack_canonical(c, status);
1079    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1080
1081    return float32_round_pack_canonical(pr, status);
1082}
1083
1084float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1085                                                int flags, float_status *status)
1086{
1087    FloatParts pa = float64_unpack_canonical(a, status);
1088    FloatParts pb = float64_unpack_canonical(b, status);
1089    FloatParts pc = float64_unpack_canonical(c, status);
1090    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1091
1092    return float64_round_pack_canonical(pr, status);
1093}
1094
1095/*
1096 * Returns the result of dividing the floating-point value `a' by the
1097 * corresponding value `b'. The operation is performed according to
1098 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1099 */
1100
1101static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1102{
1103    bool sign = a.sign ^ b.sign;
1104
1105    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1106        uint64_t n0, n1, q, r;
1107        int exp = a.exp - b.exp;
1108
1109        /*
1110         * We want a 2*N / N-bit division to produce exactly an N-bit
1111         * result, so that we do not lose any precision and so that we
1112         * do not have to renormalize afterward.  If A.frac < B.frac,
1113         * then division would produce an (N-1)-bit result; shift A left
1114         * by one to produce the an N-bit result, and decrement the
1115         * exponent to match.
1116         *
1117         * The udiv_qrnnd algorithm that we're using requires normalization,
1118         * i.e. the msb of the denominator must be set.  Since we know that
1119         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1120         * by one (more), and the remainder must be shifted right by one.
1121         */
1122        if (a.frac < b.frac) {
1123            exp -= 1;
1124            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1125        } else {
1126            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1127        }
1128        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1129
1130        /*
1131         * Set lsb if there is a remainder, to set inexact.
1132         * As mentioned above, to find the actual value of the remainder we
1133         * would need to shift right, but (1) we are only concerned about
1134         * non-zero-ness, and (2) the remainder will always be even because
1135         * both inputs to the division primitive are even.
1136         */
1137        a.frac = q | (r != 0);
1138        a.sign = sign;
1139        a.exp = exp;
1140        return a;
1141    }
1142    /* handle all the NaN cases */
1143    if (is_nan(a.cls) || is_nan(b.cls)) {
1144        return pick_nan(a, b, s);
1145    }
1146    /* 0/0 or Inf/Inf */
1147    if (a.cls == b.cls
1148        &&
1149        (a.cls == float_class_inf || a.cls == float_class_zero)) {
1150        s->float_exception_flags |= float_flag_invalid;
1151        return parts_default_nan(s);
1152    }
1153    /* Inf / x or 0 / x */
1154    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1155        a.sign = sign;
1156        return a;
1157    }
1158    /* Div 0 => Inf */
1159    if (b.cls == float_class_zero) {
1160        s->float_exception_flags |= float_flag_divbyzero;
1161        a.cls = float_class_inf;
1162        a.sign = sign;
1163        return a;
1164    }
1165    /* Div by Inf */
1166    if (b.cls == float_class_inf) {
1167        a.cls = float_class_zero;
1168        a.sign = sign;
1169        return a;
1170    }
1171    g_assert_not_reached();
1172}
1173
1174float16 float16_div(float16 a, float16 b, float_status *status)
1175{
1176    FloatParts pa = float16_unpack_canonical(a, status);
1177    FloatParts pb = float16_unpack_canonical(b, status);
1178    FloatParts pr = div_floats(pa, pb, status);
1179
1180    return float16_round_pack_canonical(pr, status);
1181}
1182
1183float32 float32_div(float32 a, float32 b, float_status *status)
1184{
1185    FloatParts pa = float32_unpack_canonical(a, status);
1186    FloatParts pb = float32_unpack_canonical(b, status);
1187    FloatParts pr = div_floats(pa, pb, status);
1188
1189    return float32_round_pack_canonical(pr, status);
1190}
1191
1192float64 float64_div(float64 a, float64 b, float_status *status)
1193{
1194    FloatParts pa = float64_unpack_canonical(a, status);
1195    FloatParts pb = float64_unpack_canonical(b, status);
1196    FloatParts pr = div_floats(pa, pb, status);
1197
1198    return float64_round_pack_canonical(pr, status);
1199}
1200
1201/*
1202 * Float to Float conversions
1203 *
1204 * Returns the result of converting one float format to another. The
1205 * conversion is performed according to the IEC/IEEE Standard for
1206 * Binary Floating-Point Arithmetic.
1207 *
1208 * The float_to_float helper only needs to take care of raising
1209 * invalid exceptions and handling the conversion on NaNs.
1210 */
1211
1212static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1213                                 float_status *s)
1214{
1215    if (dstf->arm_althp) {
1216        switch (a.cls) {
1217        case float_class_qnan:
1218        case float_class_snan:
1219            /* There is no NaN in the destination format.  Raise Invalid
1220             * and return a zero with the sign of the input NaN.
1221             */
1222            s->float_exception_flags |= float_flag_invalid;
1223            a.cls = float_class_zero;
1224            a.frac = 0;
1225            a.exp = 0;
1226            break;
1227
1228        case float_class_inf:
1229            /* There is no Inf in the destination format.  Raise Invalid
1230             * and return the maximum normal with the correct sign.
1231             */
1232            s->float_exception_flags |= float_flag_invalid;
1233            a.cls = float_class_normal;
1234            a.exp = dstf->exp_max;
1235            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1236            break;
1237
1238        default:
1239            break;
1240        }
1241    } else if (is_nan(a.cls)) {
1242        if (is_snan(a.cls)) {
1243            s->float_exception_flags |= float_flag_invalid;
1244            a = parts_silence_nan(a, s);
1245        }
1246        if (s->default_nan_mode) {
1247            return parts_default_nan(s);
1248        }
1249    }
1250    return a;
1251}
1252
1253float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1254{
1255    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1256    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1257    FloatParts pr = float_to_float(p, &float32_params, s);
1258    return float32_round_pack_canonical(pr, s);
1259}
1260
1261float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1262{
1263    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1264    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1265    FloatParts pr = float_to_float(p, &float64_params, s);
1266    return float64_round_pack_canonical(pr, s);
1267}
1268
1269float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1270{
1271    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1272    FloatParts p = float32_unpack_canonical(a, s);
1273    FloatParts pr = float_to_float(p, fmt16, s);
1274    return float16a_round_pack_canonical(pr, s, fmt16);
1275}
1276
1277float64 float32_to_float64(float32 a, float_status *s)
1278{
1279    FloatParts p = float32_unpack_canonical(a, s);
1280    FloatParts pr = float_to_float(p, &float64_params, s);
1281    return float64_round_pack_canonical(pr, s);
1282}
1283
1284float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1285{
1286    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1287    FloatParts p = float64_unpack_canonical(a, s);
1288    FloatParts pr = float_to_float(p, fmt16, s);
1289    return float16a_round_pack_canonical(pr, s, fmt16);
1290}
1291
1292float32 float64_to_float32(float64 a, float_status *s)
1293{
1294    FloatParts p = float64_unpack_canonical(a, s);
1295    FloatParts pr = float_to_float(p, &float32_params, s);
1296    return float32_round_pack_canonical(pr, s);
1297}
1298
1299/*
1300 * Rounds the floating-point value `a' to an integer, and returns the
1301 * result as a floating-point value. The operation is performed
1302 * according to the IEC/IEEE Standard for Binary Floating-Point
1303 * Arithmetic.
1304 */
1305
1306static FloatParts round_to_int(FloatParts a, int rmode,
1307                               int scale, float_status *s)
1308{
1309    switch (a.cls) {
1310    case float_class_qnan:
1311    case float_class_snan:
1312        return return_nan(a, s);
1313
1314    case float_class_zero:
1315    case float_class_inf:
1316        /* already "integral" */
1317        break;
1318
1319    case float_class_normal:
1320        scale = MIN(MAX(scale, -0x10000), 0x10000);
1321        a.exp += scale;
1322
1323        if (a.exp >= DECOMPOSED_BINARY_POINT) {
1324            /* already integral */
1325            break;
1326        }
1327        if (a.exp < 0) {
1328            bool one;
1329            /* all fractional */
1330            s->float_exception_flags |= float_flag_inexact;
1331            switch (rmode) {
1332            case float_round_nearest_even:
1333                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1334                break;
1335            case float_round_ties_away:
1336                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1337                break;
1338            case float_round_to_zero:
1339                one = false;
1340                break;
1341            case float_round_up:
1342                one = !a.sign;
1343                break;
1344            case float_round_down:
1345                one = a.sign;
1346                break;
1347            default:
1348                g_assert_not_reached();
1349            }
1350
1351            if (one) {
1352                a.frac = DECOMPOSED_IMPLICIT_BIT;
1353                a.exp = 0;
1354            } else {
1355                a.cls = float_class_zero;
1356            }
1357        } else {
1358            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1359            uint64_t frac_lsbm1 = frac_lsb >> 1;
1360            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1361            uint64_t rnd_mask = rnd_even_mask >> 1;
1362            uint64_t inc;
1363
1364            switch (rmode) {
1365            case float_round_nearest_even:
1366                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1367                break;
1368            case float_round_ties_away:
1369                inc = frac_lsbm1;
1370                break;
1371            case float_round_to_zero:
1372                inc = 0;
1373                break;
1374            case float_round_up:
1375                inc = a.sign ? 0 : rnd_mask;
1376                break;
1377            case float_round_down:
1378                inc = a.sign ? rnd_mask : 0;
1379                break;
1380            default:
1381                g_assert_not_reached();
1382            }
1383
1384            if (a.frac & rnd_mask) {
1385                s->float_exception_flags |= float_flag_inexact;
1386                a.frac += inc;
1387                a.frac &= ~rnd_mask;
1388                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1389                    a.frac >>= 1;
1390                    a.exp++;
1391                }
1392            }
1393        }
1394        break;
1395    default:
1396        g_assert_not_reached();
1397    }
1398    return a;
1399}
1400
1401float16 float16_round_to_int(float16 a, float_status *s)
1402{
1403    FloatParts pa = float16_unpack_canonical(a, s);
1404    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1405    return float16_round_pack_canonical(pr, s);
1406}
1407
1408float32 float32_round_to_int(float32 a, float_status *s)
1409{
1410    FloatParts pa = float32_unpack_canonical(a, s);
1411    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1412    return float32_round_pack_canonical(pr, s);
1413}
1414
1415float64 float64_round_to_int(float64 a, float_status *s)
1416{
1417    FloatParts pa = float64_unpack_canonical(a, s);
1418    FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1419    return float64_round_pack_canonical(pr, s);
1420}
1421
1422/*
1423 * Returns the result of converting the floating-point value `a' to
1424 * the two's complement integer format. The conversion is performed
1425 * according to the IEC/IEEE Standard for Binary Floating-Point
1426 * Arithmetic---which means in particular that the conversion is
1427 * rounded according to the current rounding mode. If `a' is a NaN,
1428 * the largest positive integer is returned. Otherwise, if the
1429 * conversion overflows, the largest integer with the same sign as `a'
1430 * is returned.
1431*/
1432
1433static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1434                                     int64_t min, int64_t max,
1435                                     float_status *s)
1436{
1437    uint64_t r;
1438    int orig_flags = get_float_exception_flags(s);
1439    FloatParts p = round_to_int(in, rmode, scale, s);
1440
1441    switch (p.cls) {
1442    case float_class_snan:
1443    case float_class_qnan:
1444        s->float_exception_flags = orig_flags | float_flag_invalid;
1445        return max;
1446    case float_class_inf:
1447        s->float_exception_flags = orig_flags | float_flag_invalid;
1448        return p.sign ? min : max;
1449    case float_class_zero:
1450        return 0;
1451    case float_class_normal:
1452        if (p.exp < DECOMPOSED_BINARY_POINT) {
1453            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1454        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1455            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1456        } else {
1457            r = UINT64_MAX;
1458        }
1459        if (p.sign) {
1460            if (r <= -(uint64_t) min) {
1461                return -r;
1462            } else {
1463                s->float_exception_flags = orig_flags | float_flag_invalid;
1464                return min;
1465            }
1466        } else {
1467            if (r <= max) {
1468                return r;
1469            } else {
1470                s->float_exception_flags = orig_flags | float_flag_invalid;
1471                return max;
1472            }
1473        }
1474    default:
1475        g_assert_not_reached();
1476    }
1477}
1478
1479int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1480                                float_status *s)
1481{
1482    return round_to_int_and_pack(float16_unpack_canonical(a, s),
1483                                 rmode, scale, INT16_MIN, INT16_MAX, s);
1484}
1485
1486int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1487                                float_status *s)
1488{
1489    return round_to_int_and_pack(float16_unpack_canonical(a, s),
1490                                 rmode, scale, INT32_MIN, INT32_MAX, s);
1491}
1492
1493int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1494                                float_status *s)
1495{
1496    return round_to_int_and_pack(float16_unpack_canonical(a, s),
1497                                 rmode, scale, INT64_MIN, INT64_MAX, s);
1498}
1499
1500int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1501                                float_status *s)
1502{
1503    return round_to_int_and_pack(float32_unpack_canonical(a, s),
1504                                 rmode, scale, INT16_MIN, INT16_MAX, s);
1505}
1506
1507int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1508                                float_status *s)
1509{
1510    return round_to_int_and_pack(float32_unpack_canonical(a, s),
1511                                 rmode, scale, INT32_MIN, INT32_MAX, s);
1512}
1513
1514int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1515                                float_status *s)
1516{
1517    return round_to_int_and_pack(float32_unpack_canonical(a, s),
1518                                 rmode, scale, INT64_MIN, INT64_MAX, s);
1519}
1520
1521int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1522                                float_status *s)
1523{
1524    return round_to_int_and_pack(float64_unpack_canonical(a, s),
1525                                 rmode, scale, INT16_MIN, INT16_MAX, s);
1526}
1527
1528int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1529                                float_status *s)
1530{
1531    return round_to_int_and_pack(float64_unpack_canonical(a, s),
1532                                 rmode, scale, INT32_MIN, INT32_MAX, s);
1533}
1534
1535int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1536                                float_status *s)
1537{
1538    return round_to_int_and_pack(float64_unpack_canonical(a, s),
1539                                 rmode, scale, INT64_MIN, INT64_MAX, s);
1540}
1541
1542int16_t float16_to_int16(float16 a, float_status *s)
1543{
1544    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1545}
1546
1547int32_t float16_to_int32(float16 a, float_status *s)
1548{
1549    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1550}
1551
1552int64_t float16_to_int64(float16 a, float_status *s)
1553{
1554    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1555}
1556
1557int16_t float32_to_int16(float32 a, float_status *s)
1558{
1559    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1560}
1561
1562int32_t float32_to_int32(float32 a, float_status *s)
1563{
1564    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1565}
1566
1567int64_t float32_to_int64(float32 a, float_status *s)
1568{
1569    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1570}
1571
1572int16_t float64_to_int16(float64 a, float_status *s)
1573{
1574    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1575}
1576
1577int32_t float64_to_int32(float64 a, float_status *s)
1578{
1579    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1580}
1581
1582int64_t float64_to_int64(float64 a, float_status *s)
1583{
1584    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1585}
1586
1587int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1588{
1589    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1590}
1591
1592int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1593{
1594    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1595}
1596
1597int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1598{
1599    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1600}
1601
1602int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1603{
1604    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1605}
1606
1607int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1608{
1609    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1610}
1611
1612int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1613{
1614    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1615}
1616
1617int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1618{
1619    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1620}
1621
1622int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1623{
1624    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1625}
1626
1627int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1628{
1629    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1630}
1631
1632/*
1633 *  Returns the result of converting the floating-point value `a' to
1634 *  the unsigned integer format. The conversion is performed according
1635 *  to the IEC/IEEE Standard for Binary Floating-Point
1636 *  Arithmetic---which means in particular that the conversion is
1637 *  rounded according to the current rounding mode. If `a' is a NaN,
1638 *  the largest unsigned integer is returned. Otherwise, if the
1639 *  conversion overflows, the largest unsigned integer is returned. If
1640 *  the 'a' is negative, the result is rounded and zero is returned;
1641 *  values that do not round to zero will raise the inexact exception
1642 *  flag.
1643 */
1644
1645static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1646                                       uint64_t max, float_status *s)
1647{
1648    int orig_flags = get_float_exception_flags(s);
1649    FloatParts p = round_to_int(in, rmode, scale, s);
1650    uint64_t r;
1651
1652    switch (p.cls) {
1653    case float_class_snan:
1654    case float_class_qnan:
1655        s->float_exception_flags = orig_flags | float_flag_invalid;
1656        return max;
1657    case float_class_inf:
1658        s->float_exception_flags = orig_flags | float_flag_invalid;
1659        return p.sign ? 0 : max;
1660    case float_class_zero:
1661        return 0;
1662    case float_class_normal:
1663        if (p.sign) {
1664            s->float_exception_flags = orig_flags | float_flag_invalid;
1665            return 0;
1666        }
1667
1668        if (p.exp < DECOMPOSED_BINARY_POINT) {
1669            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1670        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1671            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1672        } else {
1673            s->float_exception_flags = orig_flags | float_flag_invalid;
1674            return max;
1675        }
1676
1677        /* For uint64 this will never trip, but if p.exp is too large
1678         * to shift a decomposed fraction we shall have exited via the
1679         * 3rd leg above.
1680         */
1681        if (r > max) {
1682            s->float_exception_flags = orig_flags | float_flag_invalid;
1683            return max;
1684        }
1685        return r;
1686    default:
1687        g_assert_not_reached();
1688    }
1689}
1690
1691uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1692                                  float_status *s)
1693{
1694    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1695                                  rmode, scale, UINT16_MAX, s);
1696}
1697
1698uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1699                                  float_status *s)
1700{
1701    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1702                                  rmode, scale, UINT32_MAX, s);
1703}
1704
1705uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1706                                  float_status *s)
1707{
1708    return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1709                                  rmode, scale, UINT64_MAX, s);
1710}
1711
1712uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1713                                  float_status *s)
1714{
1715    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1716                                  rmode, scale, UINT16_MAX, s);
1717}
1718
1719uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1720                                  float_status *s)
1721{
1722    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1723                                  rmode, scale, UINT32_MAX, s);
1724}
1725
1726uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1727                                  float_status *s)
1728{
1729    return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1730                                  rmode, scale, UINT64_MAX, s);
1731}
1732
1733uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1734                                  float_status *s)
1735{
1736    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1737                                  rmode, scale, UINT16_MAX, s);
1738}
1739
1740uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1741                                  float_status *s)
1742{
1743    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1744                                  rmode, scale, UINT32_MAX, s);
1745}
1746
1747uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1748                                  float_status *s)
1749{
1750    return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1751                                  rmode, scale, UINT64_MAX, s);
1752}
1753
1754uint16_t float16_to_uint16(float16 a, float_status *s)
1755{
1756    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1757}
1758
1759uint32_t float16_to_uint32(float16 a, float_status *s)
1760{
1761    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1762}
1763
1764uint64_t float16_to_uint64(float16 a, float_status *s)
1765{
1766    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1767}
1768
1769uint16_t float32_to_uint16(float32 a, float_status *s)
1770{
1771    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1772}
1773
1774uint32_t float32_to_uint32(float32 a, float_status *s)
1775{
1776    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1777}
1778
1779uint64_t float32_to_uint64(float32 a, float_status *s)
1780{
1781    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1782}
1783
1784uint16_t float64_to_uint16(float64 a, float_status *s)
1785{
1786    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1787}
1788
1789uint32_t float64_to_uint32(float64 a, float_status *s)
1790{
1791    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1792}
1793
1794uint64_t float64_to_uint64(float64 a, float_status *s)
1795{
1796    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1797}
1798
1799uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1800{
1801    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1802}
1803
1804uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1805{
1806    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1807}
1808
1809uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1810{
1811    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1812}
1813
1814uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1815{
1816    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1817}
1818
1819uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1820{
1821    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1822}
1823
1824uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1825{
1826    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1827}
1828
1829uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1830{
1831    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1832}
1833
1834uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1835{
1836    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1837}
1838
1839uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1840{
1841    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1842}
1843
1844/*
1845 * Integer to float conversions
1846 *
1847 * Returns the result of converting the two's complement integer `a'
1848 * to the floating-point format. The conversion is performed according
1849 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1850 */
1851
1852static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1853{
1854    FloatParts r = { .sign = false };
1855
1856    if (a == 0) {
1857        r.cls = float_class_zero;
1858    } else {
1859        uint64_t f = a;
1860        int shift;
1861
1862        r.cls = float_class_normal;
1863        if (a < 0) {
1864            f = -f;
1865            r.sign = true;
1866        }
1867        shift = clz64(f) - 1;
1868        scale = MIN(MAX(scale, -0x10000), 0x10000);
1869
1870        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1871        r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1872    }
1873
1874    return r;
1875}
1876
1877float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1878{
1879    FloatParts pa = int_to_float(a, scale, status);
1880    return float16_round_pack_canonical(pa, status);
1881}
1882
1883float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1884{
1885    return int64_to_float16_scalbn(a, scale, status);
1886}
1887
1888float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1889{
1890    return int64_to_float16_scalbn(a, scale, status);
1891}
1892
1893float16 int64_to_float16(int64_t a, float_status *status)
1894{
1895    return int64_to_float16_scalbn(a, 0, status);
1896}
1897
1898float16 int32_to_float16(int32_t a, float_status *status)
1899{
1900    return int64_to_float16_scalbn(a, 0, status);
1901}
1902
1903float16 int16_to_float16(int16_t a, float_status *status)
1904{
1905    return int64_to_float16_scalbn(a, 0, status);
1906}
1907
1908float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1909{
1910    FloatParts pa = int_to_float(a, scale, status);
1911    return float32_round_pack_canonical(pa, status);
1912}
1913
1914float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1915{
1916    return int64_to_float32_scalbn(a, scale, status);
1917}
1918
1919float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1920{
1921    return int64_to_float32_scalbn(a, scale, status);
1922}
1923
1924float32 int64_to_float32(int64_t a, float_status *status)
1925{
1926    return int64_to_float32_scalbn(a, 0, status);
1927}
1928
1929float32 int32_to_float32(int32_t a, float_status *status)
1930{
1931    return int64_to_float32_scalbn(a, 0, status);
1932}
1933
1934float32 int16_to_float32(int16_t a, float_status *status)
1935{
1936    return int64_to_float32_scalbn(a, 0, status);
1937}
1938
1939float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1940{
1941    FloatParts pa = int_to_float(a, scale, status);
1942    return float64_round_pack_canonical(pa, status);
1943}
1944
1945float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1946{
1947    return int64_to_float64_scalbn(a, scale, status);
1948}
1949
1950float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1951{
1952    return int64_to_float64_scalbn(a, scale, status);
1953}
1954
1955float64 int64_to_float64(int64_t a, float_status *status)
1956{
1957    return int64_to_float64_scalbn(a, 0, status);
1958}
1959
1960float64 int32_to_float64(int32_t a, float_status *status)
1961{
1962    return int64_to_float64_scalbn(a, 0, status);
1963}
1964
1965float64 int16_to_float64(int16_t a, float_status *status)
1966{
1967    return int64_to_float64_scalbn(a, 0, status);
1968}
1969
1970
1971/*
1972 * Unsigned Integer to float conversions
1973 *
1974 * Returns the result of converting the unsigned integer `a' to the
1975 * floating-point format. The conversion is performed according to the
1976 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1977 */
1978
1979static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1980{
1981    FloatParts r = { .sign = false };
1982
1983    if (a == 0) {
1984        r.cls = float_class_zero;
1985    } else {
1986        scale = MIN(MAX(scale, -0x10000), 0x10000);
1987        r.cls = float_class_normal;
1988        if ((int64_t)a < 0) {
1989            r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1990            shift64RightJamming(a, 1, &a);
1991            r.frac = a;
1992        } else {
1993            int shift = clz64(a) - 1;
1994            r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1995            r.frac = a << shift;
1996        }
1997    }
1998
1999    return r;
2000}
2001
2002float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2003{
2004    FloatParts pa = uint_to_float(a, scale, status);
2005    return float16_round_pack_canonical(pa, status);
2006}
2007
2008float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2009{
2010    return uint64_to_float16_scalbn(a, scale, status);
2011}
2012
2013float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2014{
2015    return uint64_to_float16_scalbn(a, scale, status);
2016}
2017
2018float16 uint64_to_float16(uint64_t a, float_status *status)
2019{
2020    return uint64_to_float16_scalbn(a, 0, status);
2021}
2022
2023float16 uint32_to_float16(uint32_t a, float_status *status)
2024{
2025    return uint64_to_float16_scalbn(a, 0, status);
2026}
2027
2028float16 uint16_to_float16(uint16_t a, float_status *status)
2029{
2030    return uint64_to_float16_scalbn(a, 0, status);
2031}
2032
2033float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2034{
2035    FloatParts pa = uint_to_float(a, scale, status);
2036    return float32_round_pack_canonical(pa, status);
2037}
2038
2039float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2040{
2041    return uint64_to_float32_scalbn(a, scale, status);
2042}
2043
2044float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2045{
2046    return uint64_to_float32_scalbn(a, scale, status);
2047}
2048
2049float32 uint64_to_float32(uint64_t a, float_status *status)
2050{
2051    return uint64_to_float32_scalbn(a, 0, status);
2052}
2053
2054float32 uint32_to_float32(uint32_t a, float_status *status)
2055{
2056    return uint64_to_float32_scalbn(a, 0, status);
2057}
2058
2059float32 uint16_to_float32(uint16_t a, float_status *status)
2060{
2061    return uint64_to_float32_scalbn(a, 0, status);
2062}
2063
2064float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2065{
2066    FloatParts pa = uint_to_float(a, scale, status);
2067    return float64_round_pack_canonical(pa, status);
2068}
2069
2070float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2071{
2072    return uint64_to_float64_scalbn(a, scale, status);
2073}
2074
2075float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2076{
2077    return uint64_to_float64_scalbn(a, scale, status);
2078}
2079
2080float64 uint64_to_float64(uint64_t a, float_status *status)
2081{
2082    return uint64_to_float64_scalbn(a, 0, status);
2083}
2084
2085float64 uint32_to_float64(uint32_t a, float_status *status)
2086{
2087    return uint64_to_float64_scalbn(a, 0, status);
2088}
2089
2090float64 uint16_to_float64(uint16_t a, float_status *status)
2091{
2092    return uint64_to_float64_scalbn(a, 0, status);
2093}
2094
2095/* Float Min/Max */
2096/* min() and max() functions. These can't be implemented as
2097 * 'compare and pick one input' because that would mishandle
2098 * NaNs and +0 vs -0.
2099 *
2100 * minnum() and maxnum() functions. These are similar to the min()
2101 * and max() functions but if one of the arguments is a QNaN and
2102 * the other is numerical then the numerical argument is returned.
2103 * SNaNs will get quietened before being returned.
2104 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2105 * and maxNum() operations. min() and max() are the typical min/max
2106 * semantics provided by many CPUs which predate that specification.
2107 *
2108 * minnummag() and maxnummag() functions correspond to minNumMag()
2109 * and minNumMag() from the IEEE-754 2008.
2110 */
2111static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2112                                bool ieee, bool ismag, float_status *s)
2113{
2114    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2115        if (ieee) {
2116            /* Takes two floating-point values `a' and `b', one of
2117             * which is a NaN, and returns the appropriate NaN
2118             * result. If either `a' or `b' is a signaling NaN,
2119             * the invalid exception is raised.
2120             */
2121            if (is_snan(a.cls) || is_snan(b.cls)) {
2122                return pick_nan(a, b, s);
2123            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2124                return b;
2125            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2126                return a;
2127            }
2128        }
2129        return pick_nan(a, b, s);
2130    } else {
2131        int a_exp, b_exp;
2132
2133        switch (a.cls) {
2134        case float_class_normal:
2135            a_exp = a.exp;
2136            break;
2137        case float_class_inf:
2138            a_exp = INT_MAX;
2139            break;
2140        case float_class_zero:
2141            a_exp = INT_MIN;
2142            break;
2143        default:
2144            g_assert_not_reached();
2145            break;
2146        }
2147        switch (b.cls) {
2148        case float_class_normal:
2149            b_exp = b.exp;
2150            break;
2151        case float_class_inf:
2152            b_exp = INT_MAX;
2153            break;
2154        case float_class_zero:
2155            b_exp = INT_MIN;
2156            break;
2157        default:
2158            g_assert_not_reached();
2159            break;
2160        }
2161
2162        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2163            bool a_less = a_exp < b_exp;
2164            if (a_exp == b_exp) {
2165                a_less = a.frac < b.frac;
2166            }
2167            return a_less ^ ismin ? b : a;
2168        }
2169
2170        if (a.sign == b.sign) {
2171            bool a_less = a_exp < b_exp;
2172            if (a_exp == b_exp) {
2173                a_less = a.frac < b.frac;
2174            }
2175            return a.sign ^ a_less ^ ismin ? b : a;
2176        } else {
2177            return a.sign ^ ismin ? b : a;
2178        }
2179    }
2180}
2181
2182#define MINMAX(sz, name, ismin, isiee, ismag)                           \
2183float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2184                                     float_status *s)                   \
2185{                                                                       \
2186    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2187    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2188    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2189                                                                        \
2190    return float ## sz ## _round_pack_canonical(pr, s);                 \
2191}
2192
2193MINMAX(16, min, true, false, false)
2194MINMAX(16, minnum, true, true, false)
2195MINMAX(16, minnummag, true, true, true)
2196MINMAX(16, max, false, false, false)
2197MINMAX(16, maxnum, false, true, false)
2198MINMAX(16, maxnummag, false, true, true)
2199
2200MINMAX(32, min, true, false, false)
2201MINMAX(32, minnum, true, true, false)
2202MINMAX(32, minnummag, true, true, true)
2203MINMAX(32, max, false, false, false)
2204MINMAX(32, maxnum, false, true, false)
2205MINMAX(32, maxnummag, false, true, true)
2206
2207MINMAX(64, min, true, false, false)
2208MINMAX(64, minnum, true, true, false)
2209MINMAX(64, minnummag, true, true, true)
2210MINMAX(64, max, false, false, false)
2211MINMAX(64, maxnum, false, true, false)
2212MINMAX(64, maxnummag, false, true, true)
2213
2214#undef MINMAX
2215
2216/* Floating point compare */
2217static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2218                          float_status *s)
2219{
2220    if (is_nan(a.cls) || is_nan(b.cls)) {
2221        if (!is_quiet ||
2222            a.cls == float_class_snan ||
2223            b.cls == float_class_snan) {
2224            s->float_exception_flags |= float_flag_invalid;
2225        }
2226        return float_relation_unordered;
2227    }
2228
2229    if (a.cls == float_class_zero) {
2230        if (b.cls == float_class_zero) {
2231            return float_relation_equal;
2232        }
2233        return b.sign ? float_relation_greater : float_relation_less;
2234    } else if (b.cls == float_class_zero) {
2235        return a.sign ? float_relation_less : float_relation_greater;
2236    }
2237
2238    /* The only really important thing about infinity is its sign. If
2239     * both are infinities the sign marks the smallest of the two.
2240     */
2241    if (a.cls == float_class_inf) {
2242        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2243            return float_relation_equal;
2244        }
2245        return a.sign ? float_relation_less : float_relation_greater;
2246    } else if (b.cls == float_class_inf) {
2247        return b.sign ? float_relation_greater : float_relation_less;
2248    }
2249
2250    if (a.sign != b.sign) {
2251        return a.sign ? float_relation_less : float_relation_greater;
2252    }
2253
2254    if (a.exp == b.exp) {
2255        if (a.frac == b.frac) {
2256            return float_relation_equal;
2257        }
2258        if (a.sign) {
2259            return a.frac > b.frac ?
2260                float_relation_less : float_relation_greater;
2261        } else {
2262            return a.frac > b.frac ?
2263                float_relation_greater : float_relation_less;
2264        }
2265    } else {
2266        if (a.sign) {
2267            return a.exp > b.exp ? float_relation_less : float_relation_greater;
2268        } else {
2269            return a.exp > b.exp ? float_relation_greater : float_relation_less;
2270        }
2271    }
2272}
2273
2274#define COMPARE(sz)                                                     \
2275int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2276                            float_status *s)                            \
2277{                                                                       \
2278    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2279    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2280    return compare_floats(pa, pb, false, s);                            \
2281}                                                                       \
2282int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2283                                  float_status *s)                      \
2284{                                                                       \
2285    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2286    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2287    return compare_floats(pa, pb, true, s);                             \
2288}
2289
2290COMPARE(16)
2291COMPARE(32)
2292COMPARE(64)
2293
2294#undef COMPARE
2295
2296/* Multiply A by 2 raised to the power N.  */
2297static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2298{
2299    if (unlikely(is_nan(a.cls))) {
2300        return return_nan(a, s);
2301    }
2302    if (a.cls == float_class_normal) {
2303        /* The largest float type (even though not supported by FloatParts)
2304         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2305         * still allows rounding to infinity, without allowing overflow
2306         * within the int32_t that backs FloatParts.exp.
2307         */
2308        n = MIN(MAX(n, -0x10000), 0x10000);
2309        a.exp += n;
2310    }
2311    return a;
2312}
2313
2314float16 float16_scalbn(float16 a, int n, float_status *status)
2315{
2316    FloatParts pa = float16_unpack_canonical(a, status);
2317    FloatParts pr = scalbn_decomposed(pa, n, status);
2318    return float16_round_pack_canonical(pr, status);
2319}
2320
2321float32 float32_scalbn(float32 a, int n, float_status *status)
2322{
2323    FloatParts pa = float32_unpack_canonical(a, status);
2324    FloatParts pr = scalbn_decomposed(pa, n, status);
2325    return float32_round_pack_canonical(pr, status);
2326}
2327
2328float64 float64_scalbn(float64 a, int n, float_status *status)
2329{
2330    FloatParts pa = float64_unpack_canonical(a, status);
2331    FloatParts pr = scalbn_decomposed(pa, n, status);
2332    return float64_round_pack_canonical(pr, status);
2333}
2334
2335/*
2336 * Square Root
2337 *
2338 * The old softfloat code did an approximation step before zeroing in
2339 * on the final result. However for simpleness we just compute the
2340 * square root by iterating down from the implicit bit to enough extra
2341 * bits to ensure we get a correctly rounded result.
2342 *
2343 * This does mean however the calculation is slower than before,
2344 * especially for 64 bit floats.
2345 */
2346
2347static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2348{
2349    uint64_t a_frac, r_frac, s_frac;
2350    int bit, last_bit;
2351
2352    if (is_nan(a.cls)) {
2353        return return_nan(a, s);
2354    }
2355    if (a.cls == float_class_zero) {
2356        return a;  /* sqrt(+-0) = +-0 */
2357    }
2358    if (a.sign) {
2359        s->float_exception_flags |= float_flag_invalid;
2360        return parts_default_nan(s);
2361    }
2362    if (a.cls == float_class_inf) {
2363        return a;  /* sqrt(+inf) = +inf */
2364    }
2365
2366    assert(a.cls == float_class_normal);
2367
2368    /* We need two overflow bits at the top. Adding room for that is a
2369     * right shift. If the exponent is odd, we can discard the low bit
2370     * by multiplying the fraction by 2; that's a left shift. Combine
2371     * those and we shift right if the exponent is even.
2372     */
2373    a_frac = a.frac;
2374    if (!(a.exp & 1)) {
2375        a_frac >>= 1;
2376    }
2377    a.exp >>= 1;
2378
2379    /* Bit-by-bit computation of sqrt.  */
2380    r_frac = 0;
2381    s_frac = 0;
2382
2383    /* Iterate from implicit bit down to the 3 extra bits to compute a
2384     * properly rounded result. Remember we've inserted one more bit
2385     * at the top, so these positions are one less.
2386     */
2387    bit = DECOMPOSED_BINARY_POINT - 1;
2388    last_bit = MAX(p->frac_shift - 4, 0);
2389    do {
2390        uint64_t q = 1ULL << bit;
2391        uint64_t t_frac = s_frac + q;
2392        if (t_frac <= a_frac) {
2393            s_frac = t_frac + q;
2394            a_frac -= t_frac;
2395            r_frac += q;
2396        }
2397        a_frac <<= 1;
2398    } while (--bit >= last_bit);
2399
2400    /* Undo the right shift done above. If there is any remaining
2401     * fraction, the result is inexact. Set the sticky bit.
2402     */
2403    a.frac = (r_frac << 1) + (a_frac != 0);
2404
2405    return a;
2406}
2407
2408float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2409{
2410    FloatParts pa = float16_unpack_canonical(a, status);
2411    FloatParts pr = sqrt_float(pa, status, &float16_params);
2412    return float16_round_pack_canonical(pr, status);
2413}
2414
2415float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2416{
2417    FloatParts pa = float32_unpack_canonical(a, status);
2418    FloatParts pr = sqrt_float(pa, status, &float32_params);
2419    return float32_round_pack_canonical(pr, status);
2420}
2421
2422float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2423{
2424    FloatParts pa = float64_unpack_canonical(a, status);
2425    FloatParts pr = sqrt_float(pa, status, &float64_params);
2426    return float64_round_pack_canonical(pr, status);
2427}
2428
2429/*----------------------------------------------------------------------------
2430| The pattern for a default generated NaN.
2431*----------------------------------------------------------------------------*/
2432
2433float16 float16_default_nan(float_status *status)
2434{
2435    FloatParts p = parts_default_nan(status);
2436    p.frac >>= float16_params.frac_shift;
2437    return float16_pack_raw(p);
2438}
2439
2440float32 float32_default_nan(float_status *status)
2441{
2442    FloatParts p = parts_default_nan(status);
2443    p.frac >>= float32_params.frac_shift;
2444    return float32_pack_raw(p);
2445}
2446
2447float64 float64_default_nan(float_status *status)
2448{
2449    FloatParts p = parts_default_nan(status);
2450    p.frac >>= float64_params.frac_shift;
2451    return float64_pack_raw(p);
2452}
2453
2454float128 float128_default_nan(float_status *status)
2455{
2456    FloatParts p = parts_default_nan(status);
2457    float128 r;
2458
2459    /* Extrapolate from the choices made by parts_default_nan to fill
2460     * in the quad-floating format.  If the low bit is set, assume we
2461     * want to set all non-snan bits.
2462     */
2463    r.low = -(p.frac & 1);
2464    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2465    r.high |= LIT64(0x7FFF000000000000);
2466    r.high |= (uint64_t)p.sign << 63;
2467
2468    return r;
2469}
2470
2471/*----------------------------------------------------------------------------
2472| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2473*----------------------------------------------------------------------------*/
2474
2475float16 float16_silence_nan(float16 a, float_status *status)
2476{
2477    FloatParts p = float16_unpack_raw(a);
2478    p.frac <<= float16_params.frac_shift;
2479    p = parts_silence_nan(p, status);
2480    p.frac >>= float16_params.frac_shift;
2481    return float16_pack_raw(p);
2482}
2483
2484float32 float32_silence_nan(float32 a, float_status *status)
2485{
2486    FloatParts p = float32_unpack_raw(a);
2487    p.frac <<= float32_params.frac_shift;
2488    p = parts_silence_nan(p, status);
2489    p.frac >>= float32_params.frac_shift;
2490    return float32_pack_raw(p);
2491}
2492
2493float64 float64_silence_nan(float64 a, float_status *status)
2494{
2495    FloatParts p = float64_unpack_raw(a);
2496    p.frac <<= float64_params.frac_shift;
2497    p = parts_silence_nan(p, status);
2498    p.frac >>= float64_params.frac_shift;
2499    return float64_pack_raw(p);
2500}
2501
2502/*----------------------------------------------------------------------------
2503| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2504| and 7, and returns the properly rounded 32-bit integer corresponding to the
2505| input.  If `zSign' is 1, the input is negated before being converted to an
2506| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2507| is simply rounded to an integer, with the inexact exception raised if the
2508| input cannot be represented exactly as an integer.  However, if the fixed-
2509| point input is too large, the invalid exception is raised and the largest
2510| positive or negative integer is returned.
2511*----------------------------------------------------------------------------*/
2512
2513static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2514{
2515    int8_t roundingMode;
2516    flag roundNearestEven;
2517    int8_t roundIncrement, roundBits;
2518    int32_t z;
2519
2520    roundingMode = status->float_rounding_mode;
2521    roundNearestEven = ( roundingMode == float_round_nearest_even );
2522    switch (roundingMode) {
2523    case float_round_nearest_even:
2524    case float_round_ties_away:
2525        roundIncrement = 0x40;
2526        break;
2527    case float_round_to_zero:
2528        roundIncrement = 0;
2529        break;
2530    case float_round_up:
2531        roundIncrement = zSign ? 0 : 0x7f;
2532        break;
2533    case float_round_down:
2534        roundIncrement = zSign ? 0x7f : 0;
2535        break;
2536    default:
2537        abort();
2538    }
2539    roundBits = absZ & 0x7F;
2540    absZ = ( absZ + roundIncrement )>>7;
2541    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2542    z = absZ;
2543    if ( zSign ) z = - z;
2544    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2545        float_raise(float_flag_invalid, status);
2546        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2547    }
2548    if (roundBits) {
2549        status->float_exception_flags |= float_flag_inexact;
2550    }
2551    return z;
2552
2553}
2554
2555/*----------------------------------------------------------------------------
2556| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2557| `absZ1', with binary point between bits 63 and 64 (between the input words),
2558| and returns the properly rounded 64-bit integer corresponding to the input.
2559| If `zSign' is 1, the input is negated before being converted to an integer.
2560| Ordinarily, the fixed-point input is simply rounded to an integer, with
2561| the inexact exception raised if the input cannot be represented exactly as
2562| an integer.  However, if the fixed-point input is too large, the invalid
2563| exception is raised and the largest positive or negative integer is
2564| returned.
2565*----------------------------------------------------------------------------*/
2566
2567static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2568                               float_status *status)
2569{
2570    int8_t roundingMode;
2571    flag roundNearestEven, increment;
2572    int64_t z;
2573
2574    roundingMode = status->float_rounding_mode;
2575    roundNearestEven = ( roundingMode == float_round_nearest_even );
2576    switch (roundingMode) {
2577    case float_round_nearest_even:
2578    case float_round_ties_away:
2579        increment = ((int64_t) absZ1 < 0);
2580        break;
2581    case float_round_to_zero:
2582        increment = 0;
2583        break;
2584    case float_round_up:
2585        increment = !zSign && absZ1;
2586        break;
2587    case float_round_down:
2588        increment = zSign && absZ1;
2589        break;
2590    default:
2591        abort();
2592    }
2593    if ( increment ) {
2594        ++absZ0;
2595        if ( absZ0 == 0 ) goto overflow;
2596        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2597    }
2598    z = absZ0;
2599    if ( zSign ) z = - z;
2600    if ( z && ( ( z < 0 ) ^ zSign ) ) {
2601 overflow:
2602        float_raise(float_flag_invalid, status);
2603        return
2604              zSign ? (int64_t) LIT64( 0x8000000000000000 )
2605            : LIT64( 0x7FFFFFFFFFFFFFFF );
2606    }
2607    if (absZ1) {
2608        status->float_exception_flags |= float_flag_inexact;
2609    }
2610    return z;
2611
2612}
2613
2614/*----------------------------------------------------------------------------
2615| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2616| `absZ1', with binary point between bits 63 and 64 (between the input words),
2617| and returns the properly rounded 64-bit unsigned integer corresponding to the
2618| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2619| with the inexact exception raised if the input cannot be represented exactly
2620| as an integer.  However, if the fixed-point input is too large, the invalid
2621| exception is raised and the largest unsigned integer is returned.
2622*----------------------------------------------------------------------------*/
2623
2624static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2625                                uint64_t absZ1, float_status *status)
2626{
2627    int8_t roundingMode;
2628    flag roundNearestEven, increment;
2629
2630    roundingMode = status->float_rounding_mode;
2631    roundNearestEven = (roundingMode == float_round_nearest_even);
2632    switch (roundingMode) {
2633    case float_round_nearest_even:
2634    case float_round_ties_away:
2635        increment = ((int64_t)absZ1 < 0);
2636        break;
2637    case float_round_to_zero:
2638        increment = 0;
2639        break;
2640    case float_round_up:
2641        increment = !zSign && absZ1;
2642        break;
2643    case float_round_down:
2644        increment = zSign && absZ1;
2645        break;
2646    default:
2647        abort();
2648    }
2649    if (increment) {
2650        ++absZ0;
2651        if (absZ0 == 0) {
2652            float_raise(float_flag_invalid, status);
2653            return LIT64(0xFFFFFFFFFFFFFFFF);
2654        }
2655        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2656    }
2657
2658    if (zSign && absZ0) {
2659        float_raise(float_flag_invalid, status);
2660        return 0;
2661    }
2662
2663    if (absZ1) {
2664        status->float_exception_flags |= float_flag_inexact;
2665    }
2666    return absZ0;
2667}
2668
2669/*----------------------------------------------------------------------------
2670| If `a' is denormal and we are in flush-to-zero mode then set the
2671| input-denormal exception and return zero. Otherwise just return the value.
2672*----------------------------------------------------------------------------*/
2673float32 float32_squash_input_denormal(float32 a, float_status *status)
2674{
2675    if (status->flush_inputs_to_zero) {
2676        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2677            float_raise(float_flag_input_denormal, status);
2678            return make_float32(float32_val(a) & 0x80000000);
2679        }
2680    }
2681    return a;
2682}
2683
2684/*----------------------------------------------------------------------------
2685| Normalizes the subnormal single-precision floating-point value represented
2686| by the denormalized significand `aSig'.  The normalized exponent and
2687| significand are stored at the locations pointed to by `zExpPtr' and
2688| `zSigPtr', respectively.
2689*----------------------------------------------------------------------------*/
2690
2691static void
2692 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2693{
2694    int8_t shiftCount;
2695
2696    shiftCount = clz32(aSig) - 8;
2697    *zSigPtr = aSig<<shiftCount;
2698    *zExpPtr = 1 - shiftCount;
2699
2700}
2701
2702/*----------------------------------------------------------------------------
2703| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2704| and significand `zSig', and returns the proper single-precision floating-
2705| point value corresponding to the abstract input.  Ordinarily, the abstract
2706| value is simply rounded and packed into the single-precision format, with
2707| the inexact exception raised if the abstract input cannot be represented
2708| exactly.  However, if the abstract value is too large, the overflow and
2709| inexact exceptions are raised and an infinity or maximal finite value is
2710| returned.  If the abstract value is too small, the input value is rounded to
2711| a subnormal number, and the underflow and inexact exceptions are raised if
2712| the abstract input cannot be represented exactly as a subnormal single-
2713| precision floating-point number.
2714|     The input significand `zSig' has its binary point between bits 30
2715| and 29, which is 7 bits to the left of the usual location.  This shifted
2716| significand must be normalized or smaller.  If `zSig' is not normalized,
2717| `zExp' must be 0; in that case, the result returned is a subnormal number,
2718| and it must not require rounding.  In the usual case that `zSig' is
2719| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2720| The handling of underflow and overflow follows the IEC/IEEE Standard for
2721| Binary Floating-Point Arithmetic.
2722*----------------------------------------------------------------------------*/
2723
2724static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2725                                   float_status *status)
2726{
2727    int8_t roundingMode;
2728    flag roundNearestEven;
2729    int8_t roundIncrement, roundBits;
2730    flag isTiny;
2731
2732    roundingMode = status->float_rounding_mode;
2733    roundNearestEven = ( roundingMode == float_round_nearest_even );
2734    switch (roundingMode) {
2735    case float_round_nearest_even:
2736    case float_round_ties_away:
2737        roundIncrement = 0x40;
2738        break;
2739    case float_round_to_zero:
2740        roundIncrement = 0;
2741        break;
2742    case float_round_up:
2743        roundIncrement = zSign ? 0 : 0x7f;
2744        break;
2745    case float_round_down:
2746        roundIncrement = zSign ? 0x7f : 0;
2747        break;
2748    default:
2749        abort();
2750        break;
2751    }
2752    roundBits = zSig & 0x7F;
2753    if ( 0xFD <= (uint16_t) zExp ) {
2754        if (    ( 0xFD < zExp )
2755             || (    ( zExp == 0xFD )
2756                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2757           ) {
2758            float_raise(float_flag_overflow | float_flag_inexact, status);
2759            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2760        }
2761        if ( zExp < 0 ) {
2762            if (status->flush_to_zero) {
2763                float_raise(float_flag_output_denormal, status);
2764                return packFloat32(zSign, 0, 0);
2765            }
2766            isTiny =
2767                (status->float_detect_tininess
2768                 == float_tininess_before_rounding)
2769                || ( zExp < -1 )
2770                || ( zSig + roundIncrement < 0x80000000 );
2771            shift32RightJamming( zSig, - zExp, &zSig );
2772            zExp = 0;
2773            roundBits = zSig & 0x7F;
2774            if (isTiny && roundBits) {
2775                float_raise(float_flag_underflow, status);
2776            }
2777        }
2778    }
2779    if (roundBits) {
2780        status->float_exception_flags |= float_flag_inexact;
2781    }
2782    zSig = ( zSig + roundIncrement )>>7;
2783    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2784    if ( zSig == 0 ) zExp = 0;
2785    return packFloat32( zSign, zExp, zSig );
2786
2787}
2788
2789/*----------------------------------------------------------------------------
2790| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2791| and significand `zSig', and returns the proper single-precision floating-
2792| point value corresponding to the abstract input.  This routine is just like
2793| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2794| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2795| floating-point exponent.
2796*----------------------------------------------------------------------------*/
2797
2798static float32
2799 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2800                              float_status *status)
2801{
2802    int8_t shiftCount;
2803
2804    shiftCount = clz32(zSig) - 1;
2805    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2806                               status);
2807
2808}
2809
2810/*----------------------------------------------------------------------------
2811| If `a' is denormal and we are in flush-to-zero mode then set the
2812| input-denormal exception and return zero. Otherwise just return the value.
2813*----------------------------------------------------------------------------*/
2814float64 float64_squash_input_denormal(float64 a, float_status *status)
2815{
2816    if (status->flush_inputs_to_zero) {
2817        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2818            float_raise(float_flag_input_denormal, status);
2819            return make_float64(float64_val(a) & (1ULL << 63));
2820        }
2821    }
2822    return a;
2823}
2824
2825/*----------------------------------------------------------------------------
2826| Normalizes the subnormal double-precision floating-point value represented
2827| by the denormalized significand `aSig'.  The normalized exponent and
2828| significand are stored at the locations pointed to by `zExpPtr' and
2829| `zSigPtr', respectively.
2830*----------------------------------------------------------------------------*/
2831
2832static void
2833 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2834{
2835    int8_t shiftCount;
2836
2837    shiftCount = clz64(aSig) - 11;
2838    *zSigPtr = aSig<<shiftCount;
2839    *zExpPtr = 1 - shiftCount;
2840
2841}
2842
2843/*----------------------------------------------------------------------------
2844| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2845| double-precision floating-point value, returning the result.  After being
2846| shifted into the proper positions, the three fields are simply added
2847| together to form the result.  This means that any integer portion of `zSig'
2848| will be added into the exponent.  Since a properly normalized significand
2849| will have an integer portion equal to 1, the `zExp' input should be 1 less
2850| than the desired result exponent whenever `zSig' is a complete, normalized
2851| significand.
2852*----------------------------------------------------------------------------*/
2853
2854static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2855{
2856
2857    return make_float64(
2858        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2859
2860}
2861
2862/*----------------------------------------------------------------------------
2863| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2864| and significand `zSig', and returns the proper double-precision floating-
2865| point value corresponding to the abstract input.  Ordinarily, the abstract
2866| value is simply rounded and packed into the double-precision format, with
2867| the inexact exception raised if the abstract input cannot be represented
2868| exactly.  However, if the abstract value is too large, the overflow and
2869| inexact exceptions are raised and an infinity or maximal finite value is
2870| returned.  If the abstract value is too small, the input value is rounded to
2871| a subnormal number, and the underflow and inexact exceptions are raised if
2872| the abstract input cannot be represented exactly as a subnormal double-
2873| precision floating-point number.
2874|     The input significand `zSig' has its binary point between bits 62
2875| and 61, which is 10 bits to the left of the usual location.  This shifted
2876| significand must be normalized or smaller.  If `zSig' is not normalized,
2877| `zExp' must be 0; in that case, the result returned is a subnormal number,
2878| and it must not require rounding.  In the usual case that `zSig' is
2879| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2880| The handling of underflow and overflow follows the IEC/IEEE Standard for
2881| Binary Floating-Point Arithmetic.
2882*----------------------------------------------------------------------------*/
2883
2884static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2885                                   float_status *status)
2886{
2887    int8_t roundingMode;
2888    flag roundNearestEven;
2889    int roundIncrement, roundBits;
2890    flag isTiny;
2891
2892    roundingMode = status->float_rounding_mode;
2893    roundNearestEven = ( roundingMode == float_round_nearest_even );
2894    switch (roundingMode) {
2895    case float_round_nearest_even:
2896    case float_round_ties_away:
2897        roundIncrement = 0x200;
2898        break;
2899    case float_round_to_zero:
2900        roundIncrement = 0;
2901        break;
2902    case float_round_up:
2903        roundIncrement = zSign ? 0 : 0x3ff;
2904        break;
2905    case float_round_down:
2906        roundIncrement = zSign ? 0x3ff : 0;
2907        break;
2908    case float_round_to_odd:
2909        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2910        break;
2911    default:
2912        abort();
2913    }
2914    roundBits = zSig & 0x3FF;
2915    if ( 0x7FD <= (uint16_t) zExp ) {
2916        if (    ( 0x7FD < zExp )
2917             || (    ( zExp == 0x7FD )
2918                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2919           ) {
2920            bool overflow_to_inf = roundingMode != float_round_to_odd &&
2921                                   roundIncrement != 0;
2922            float_raise(float_flag_overflow | float_flag_inexact, status);
2923            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2924        }
2925        if ( zExp < 0 ) {
2926            if (status->flush_to_zero) {
2927                float_raise(float_flag_output_denormal, status);
2928                return packFloat64(zSign, 0, 0);
2929            }
2930            isTiny =
2931                   (status->float_detect_tininess
2932                    == float_tininess_before_rounding)
2933                || ( zExp < -1 )
2934                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2935            shift64RightJamming( zSig, - zExp, &zSig );
2936            zExp = 0;
2937            roundBits = zSig & 0x3FF;
2938            if (isTiny && roundBits) {
2939                float_raise(float_flag_underflow, status);
2940            }
2941            if (roundingMode == float_round_to_odd) {
2942                /*
2943                 * For round-to-odd case, the roundIncrement depends on
2944                 * zSig which just changed.
2945                 */
2946                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2947            }
2948        }
2949    }
2950    if (roundBits) {
2951        status->float_exception_flags |= float_flag_inexact;
2952    }
2953    zSig = ( zSig + roundIncrement )>>10;
2954    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2955    if ( zSig == 0 ) zExp = 0;
2956    return packFloat64( zSign, zExp, zSig );
2957
2958}
2959
2960/*----------------------------------------------------------------------------
2961| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2962| and significand `zSig', and returns the proper double-precision floating-
2963| point value corresponding to the abstract input.  This routine is just like
2964| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2965| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2966| floating-point exponent.
2967*----------------------------------------------------------------------------*/
2968
2969static float64
2970 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2971                              float_status *status)
2972{
2973    int8_t shiftCount;
2974
2975    shiftCount = clz64(zSig) - 1;
2976    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2977                               status);
2978
2979}
2980
2981/*----------------------------------------------------------------------------
2982| Normalizes the subnormal extended double-precision floating-point value
2983| represented by the denormalized significand `aSig'.  The normalized exponent
2984| and significand are stored at the locations pointed to by `zExpPtr' and
2985| `zSigPtr', respectively.
2986*----------------------------------------------------------------------------*/
2987
2988void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2989                                uint64_t *zSigPtr)
2990{
2991    int8_t shiftCount;
2992
2993    shiftCount = clz64(aSig);
2994    *zSigPtr = aSig<<shiftCount;
2995    *zExpPtr = 1 - shiftCount;
2996}
2997
2998/*----------------------------------------------------------------------------
2999| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3000| and extended significand formed by the concatenation of `zSig0' and `zSig1',
3001| and returns the proper extended double-precision floating-point value
3002| corresponding to the abstract input.  Ordinarily, the abstract value is
3003| rounded and packed into the extended double-precision format, with the
3004| inexact exception raised if the abstract input cannot be represented
3005| exactly.  However, if the abstract value is too large, the overflow and
3006| inexact exceptions are raised and an infinity or maximal finite value is
3007| returned.  If the abstract value is too small, the input value is rounded to
3008| a subnormal number, and the underflow and inexact exceptions are raised if
3009| the abstract input cannot be represented exactly as a subnormal extended
3010| double-precision floating-point number.
3011|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3012| number of bits as single or double precision, respectively.  Otherwise, the
3013| result is rounded to the full precision of the extended double-precision
3014| format.
3015|     The input significand must be normalized or smaller.  If the input
3016| significand is not normalized, `zExp' must be 0; in that case, the result
3017| returned is a subnormal number, and it must not require rounding.  The
3018| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3019| Floating-Point Arithmetic.
3020*----------------------------------------------------------------------------*/
3021
3022floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3023                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3024                              float_status *status)
3025{
3026    int8_t roundingMode;
3027    flag roundNearestEven, increment, isTiny;
3028    int64_t roundIncrement, roundMask, roundBits;
3029
3030    roundingMode = status->float_rounding_mode;
3031    roundNearestEven = ( roundingMode == float_round_nearest_even );
3032    if ( roundingPrecision == 80 ) goto precision80;
3033    if ( roundingPrecision == 64 ) {
3034        roundIncrement = LIT64( 0x0000000000000400 );
3035        roundMask = LIT64( 0x00000000000007FF );
3036    }
3037    else if ( roundingPrecision == 32 ) {
3038        roundIncrement = LIT64( 0x0000008000000000 );
3039        roundMask = LIT64( 0x000000FFFFFFFFFF );
3040    }
3041    else {
3042        goto precision80;
3043    }
3044    zSig0 |= ( zSig1 != 0 );
3045    switch (roundingMode) {
3046    case float_round_nearest_even:
3047    case float_round_ties_away:
3048        break;
3049    case float_round_to_zero:
3050        roundIncrement = 0;
3051        break;
3052    case float_round_up:
3053        roundIncrement = zSign ? 0 : roundMask;
3054        break;
3055    case float_round_down:
3056        roundIncrement = zSign ? roundMask : 0;
3057        break;
3058    default:
3059        abort();
3060    }
3061    roundBits = zSig0 & roundMask;
3062    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3063        if (    ( 0x7FFE < zExp )
3064             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3065           ) {
3066            goto overflow;
3067        }
3068        if ( zExp <= 0 ) {
3069            if (status->flush_to_zero) {
3070                float_raise(float_flag_output_denormal, status);
3071                return packFloatx80(zSign, 0, 0);
3072            }
3073            isTiny =
3074                   (status->float_detect_tininess
3075                    == float_tininess_before_rounding)
3076                || ( zExp < 0 )
3077                || ( zSig0 <= zSig0 + roundIncrement );
3078            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3079            zExp = 0;
3080            roundBits = zSig0 & roundMask;
3081            if (isTiny && roundBits) {
3082                float_raise(float_flag_underflow, status);
3083            }
3084            if (roundBits) {
3085                status->float_exception_flags |= float_flag_inexact;
3086            }
3087            zSig0 += roundIncrement;
3088            if ( (int64_t) zSig0 < 0 ) zExp = 1;
3089            roundIncrement = roundMask + 1;
3090            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3091                roundMask |= roundIncrement;
3092            }
3093            zSig0 &= ~ roundMask;
3094            return packFloatx80( zSign, zExp, zSig0 );
3095        }
3096    }
3097    if (roundBits) {
3098        status->float_exception_flags |= float_flag_inexact;
3099    }
3100    zSig0 += roundIncrement;
3101    if ( zSig0 < roundIncrement ) {
3102        ++zExp;
3103        zSig0 = LIT64( 0x8000000000000000 );
3104    }
3105    roundIncrement = roundMask + 1;
3106    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3107        roundMask |= roundIncrement;
3108    }
3109    zSig0 &= ~ roundMask;
3110    if ( zSig0 == 0 ) zExp = 0;
3111    return packFloatx80( zSign, zExp, zSig0 );
3112 precision80:
3113    switch (roundingMode) {
3114    case float_round_nearest_even:
3115    case float_round_ties_away:
3116        increment = ((int64_t)zSig1 < 0);
3117        break;
3118    case float_round_to_zero:
3119        increment = 0;
3120        break;
3121    case float_round_up:
3122        increment = !zSign && zSig1;
3123        break;
3124    case float_round_down:
3125        increment = zSign && zSig1;
3126        break;
3127    default:
3128        abort();
3129    }
3130    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3131        if (    ( 0x7FFE < zExp )
3132             || (    ( zExp == 0x7FFE )
3133                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3134                  && increment
3135                )
3136           ) {
3137            roundMask = 0;
3138 overflow:
3139            float_raise(float_flag_overflow | float_flag_inexact, status);
3140            if (    ( roundingMode == float_round_to_zero )
3141                 || ( zSign && ( roundingMode == float_round_up ) )
3142                 || ( ! zSign && ( roundingMode == float_round_down ) )
3143               ) {
3144                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3145            }
3146            return packFloatx80(zSign,
3147                                floatx80_infinity_high,
3148                                floatx80_infinity_low);
3149        }
3150        if ( zExp <= 0 ) {
3151            isTiny =
3152                   (status->float_detect_tininess
3153                    == float_tininess_before_rounding)
3154                || ( zExp < 0 )
3155                || ! increment
3156                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3157            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3158            zExp = 0;
3159            if (isTiny && zSig1) {
3160                float_raise(float_flag_underflow, status);
3161            }
3162            if (zSig1) {
3163                status->float_exception_flags |= float_flag_inexact;
3164            }
3165            switch (roundingMode) {
3166            case float_round_nearest_even:
3167            case float_round_ties_away:
3168                increment = ((int64_t)zSig1 < 0);
3169                break;
3170            case float_round_to_zero:
3171                increment = 0;
3172                break;
3173            case float_round_up:
3174                increment = !zSign && zSig1;
3175                break;
3176            case float_round_down:
3177                increment = zSign && zSig1;
3178                break;
3179            default:
3180                abort();
3181            }
3182            if ( increment ) {
3183                ++zSig0;
3184                zSig0 &=
3185                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3186                if ( (int64_t) zSig0 < 0 ) zExp = 1;
3187            }
3188            return packFloatx80( zSign, zExp, zSig0 );
3189        }
3190    }
3191    if (zSig1) {
3192        status->float_exception_flags |= float_flag_inexact;
3193    }
3194    if ( increment ) {
3195        ++zSig0;
3196        if ( zSig0 == 0 ) {
3197            ++zExp;
3198            zSig0 = LIT64( 0x8000000000000000 );
3199        }
3200        else {
3201            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3202        }
3203    }
3204    else {
3205        if ( zSig0 == 0 ) zExp = 0;
3206    }
3207    return packFloatx80( zSign, zExp, zSig0 );
3208
3209}
3210
3211/*----------------------------------------------------------------------------
3212| Takes an abstract floating-point value having sign `zSign', exponent
3213| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3214| and returns the proper extended double-precision floating-point value
3215| corresponding to the abstract input.  This routine is just like
3216| `roundAndPackFloatx80' except that the input significand does not have to be
3217| normalized.
3218*----------------------------------------------------------------------------*/
3219
3220floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3221                                       flag zSign, int32_t zExp,
3222                                       uint64_t zSig0, uint64_t zSig1,
3223                                       float_status *status)
3224{
3225    int8_t shiftCount;
3226
3227    if ( zSig0 == 0 ) {
3228        zSig0 = zSig1;
3229        zSig1 = 0;
3230        zExp -= 64;
3231    }
3232    shiftCount = clz64(zSig0);
3233    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3234    zExp -= shiftCount;
3235    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3236                                zSig0, zSig1, status);
3237
3238}
3239
3240/*----------------------------------------------------------------------------
3241| Returns the least-significant 64 fraction bits of the quadruple-precision
3242| floating-point value `a'.
3243*----------------------------------------------------------------------------*/
3244
3245static inline uint64_t extractFloat128Frac1( float128 a )
3246{
3247
3248    return a.low;
3249
3250}
3251
3252/*----------------------------------------------------------------------------
3253| Returns the most-significant 48 fraction bits of the quadruple-precision
3254| floating-point value `a'.
3255*----------------------------------------------------------------------------*/
3256
3257static inline uint64_t extractFloat128Frac0( float128 a )
3258{
3259
3260    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3261
3262}
3263
3264/*----------------------------------------------------------------------------
3265| Returns the exponent bits of the quadruple-precision floating-point value
3266| `a'.
3267*----------------------------------------------------------------------------*/
3268
3269static inline int32_t extractFloat128Exp( float128 a )
3270{
3271
3272    return ( a.high>>48 ) & 0x7FFF;
3273
3274}
3275
3276/*----------------------------------------------------------------------------
3277| Returns the sign bit of the quadruple-precision floating-point value `a'.
3278*----------------------------------------------------------------------------*/
3279
3280static inline flag extractFloat128Sign( float128 a )
3281{
3282
3283    return a.high>>63;
3284
3285}
3286
3287/*----------------------------------------------------------------------------
3288| Normalizes the subnormal quadruple-precision floating-point value
3289| represented by the denormalized significand formed by the concatenation of
3290| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3291| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3292| significand are stored at the location pointed to by `zSig0Ptr', and the
3293| least significant 64 bits of the normalized significand are stored at the
3294| location pointed to by `zSig1Ptr'.
3295*----------------------------------------------------------------------------*/
3296
3297static void
3298 normalizeFloat128Subnormal(
3299     uint64_t aSig0,
3300     uint64_t aSig1,
3301     int32_t *zExpPtr,
3302     uint64_t *zSig0Ptr,
3303     uint64_t *zSig1Ptr
3304 )
3305{
3306    int8_t shiftCount;
3307
3308    if ( aSig0 == 0 ) {
3309        shiftCount = clz64(aSig1) - 15;
3310        if ( shiftCount < 0 ) {
3311            *zSig0Ptr = aSig1>>( - shiftCount );
3312            *zSig1Ptr = aSig1<<( shiftCount & 63 );
3313        }
3314        else {
3315            *zSig0Ptr = aSig1<<shiftCount;
3316            *zSig1Ptr = 0;
3317        }
3318        *zExpPtr = - shiftCount - 63;
3319    }
3320    else {
3321        shiftCount = clz64(aSig0) - 15;
3322        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
3323        *zExpPtr = 1 - shiftCount;
3324    }
3325
3326}
3327
3328/*----------------------------------------------------------------------------
3329| Packs the sign `zSign', the exponent `zExp', and the significand formed
3330| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3331| floating-point value, returning the result.  After being shifted into the
3332| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3333| added together to form the most significant 32 bits of the result.  This
3334| means that any integer portion of `zSig0' will be added into the exponent.
3335| Since a properly normalized significand will have an integer portion equal
3336| to 1, the `zExp' input should be 1 less than the desired result exponent
3337| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3338| significand.
3339*----------------------------------------------------------------------------*/
3340
3341static inline float128
3342 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3343{
3344    float128 z;
3345
3346    z.low = zSig1;
3347    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3348    return z;
3349
3350}
3351
3352/*----------------------------------------------------------------------------
3353| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3354| and extended significand formed by the concatenation of `zSig0', `zSig1',
3355| and `zSig2', and returns the proper quadruple-precision floating-point value
3356| corresponding to the abstract input.  Ordinarily, the abstract value is
3357| simply rounded and packed into the quadruple-precision format, with the
3358| inexact exception raised if the abstract input cannot be represented
3359| exactly.  However, if the abstract value is too large, the overflow and
3360| inexact exceptions are raised and an infinity or maximal finite value is
3361| returned.  If the abstract value is too small, the input value is rounded to
3362| a subnormal number, and the underflow and inexact exceptions are raised if
3363| the abstract input cannot be represented exactly as a subnormal quadruple-
3364| precision floating-point number.
3365|     The input significand must be normalized or smaller.  If the input
3366| significand is not normalized, `zExp' must be 0; in that case, the result
3367| returned is a subnormal number, and it must not require rounding.  In the
3368| usual case that the input significand is normalized, `zExp' must be 1 less
3369| than the ``true'' floating-point exponent.  The handling of underflow and
3370| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371*----------------------------------------------------------------------------*/
3372
3373static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3374                                     uint64_t zSig0, uint64_t zSig1,
3375                                     uint64_t zSig2, float_status *status)
3376{
3377    int8_t roundingMode;
3378    flag roundNearestEven, increment, isTiny;
3379
3380    roundingMode = status->float_rounding_mode;
3381    roundNearestEven = ( roundingMode == float_round_nearest_even );
3382    switch (roundingMode) {
3383    case float_round_nearest_even:
3384    case float_round_ties_away:
3385        increment = ((int64_t)zSig2 < 0);
3386        break;
3387    case float_round_to_zero:
3388        increment = 0;
3389        break;
3390    case float_round_up:
3391        increment = !zSign && zSig2;
3392        break;
3393    case float_round_down:
3394        increment = zSign && zSig2;
3395        break;
3396    case float_round_to_odd:
3397        increment = !(zSig1 & 0x1) && zSig2;
3398        break;
3399    default:
3400        abort();
3401    }
3402    if ( 0x7FFD <= (uint32_t) zExp ) {
3403        if (    ( 0x7FFD < zExp )
3404             || (    ( zExp == 0x7FFD )
3405                  && eq128(
3406                         LIT64( 0x0001FFFFFFFFFFFF ),
3407                         LIT64( 0xFFFFFFFFFFFFFFFF ),
3408                         zSig0,
3409                         zSig1
3410                     )
3411                  && increment
3412                )
3413           ) {
3414            float_raise(float_flag_overflow | float_flag_inexact, status);
3415            if (    ( roundingMode == float_round_to_zero )
3416                 || ( zSign && ( roundingMode == float_round_up ) )
3417                 || ( ! zSign && ( roundingMode == float_round_down ) )
3418                 || (roundingMode == float_round_to_odd)
3419               ) {
3420                return
3421                    packFloat128(
3422                        zSign,
3423                        0x7FFE,
3424                        LIT64( 0x0000FFFFFFFFFFFF ),
3425                        LIT64( 0xFFFFFFFFFFFFFFFF )
3426                    );
3427            }
3428            return packFloat128( zSign, 0x7FFF, 0, 0 );
3429        }
3430        if ( zExp < 0 ) {
3431            if (status->flush_to_zero) {
3432                float_raise(float_flag_output_denormal, status);
3433                return packFloat128(zSign, 0, 0, 0);
3434            }
3435            isTiny =
3436                   (status->float_detect_tininess
3437                    == float_tininess_before_rounding)
3438                || ( zExp < -1 )
3439                || ! increment
3440                || lt128(
3441                       zSig0,
3442                       zSig1,
3443                       LIT64( 0x0001FFFFFFFFFFFF ),
3444                       LIT64( 0xFFFFFFFFFFFFFFFF )
3445                   );
3446            shift128ExtraRightJamming(
3447                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3448            zExp = 0;
3449            if (isTiny && zSig2) {
3450                float_raise(float_flag_underflow, status);
3451            }
3452            switch (roundingMode) {
3453            case float_round_nearest_even:
3454            case float_round_ties_away:
3455                increment = ((int64_t)zSig2 < 0);
3456                break;
3457            case float_round_to_zero:
3458                increment = 0;
3459                break;
3460            case float_round_up:
3461                increment = !zSign && zSig2;
3462                break;
3463            case float_round_down:
3464                increment = zSign && zSig2;
3465                break;
3466            case float_round_to_odd:
3467                increment = !(zSig1 & 0x1) && zSig2;
3468                break;
3469            default:
3470                abort();
3471            }
3472        }
3473    }
3474    if (zSig2) {
3475        status->float_exception_flags |= float_flag_inexact;
3476    }
3477    if ( increment ) {
3478        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3479        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3480    }
3481    else {
3482        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3483    }
3484    return packFloat128( zSign, zExp, zSig0, zSig1 );
3485
3486}
3487
3488/*----------------------------------------------------------------------------
3489| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3490| and significand formed by the concatenation of `zSig0' and `zSig1', and
3491| returns the proper quadruple-precision floating-point value corresponding
3492| to the abstract input.  This routine is just like `roundAndPackFloat128'
3493| except that the input significand has fewer bits and does not have to be
3494| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3495| point exponent.
3496*----------------------------------------------------------------------------*/
3497
3498static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3499                                              uint64_t zSig0, uint64_t zSig1,
3500                                              float_status *status)
3501{
3502    int8_t shiftCount;
3503    uint64_t zSig2;
3504
3505    if ( zSig0 == 0 ) {
3506        zSig0 = zSig1;
3507        zSig1 = 0;
3508        zExp -= 64;
3509    }
3510    shiftCount = clz64(zSig0) - 15;
3511    if ( 0 <= shiftCount ) {
3512        zSig2 = 0;
3513        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3514    }
3515    else {
3516        shift128ExtraRightJamming(
3517            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3518    }
3519    zExp -= shiftCount;
3520    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3521
3522}
3523
3524
3525/*----------------------------------------------------------------------------
3526| Returns the result of converting the 32-bit two's complement integer `a'
3527| to the extended double-precision floating-point format.  The conversion
3528| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3529| Arithmetic.
3530*----------------------------------------------------------------------------*/
3531
3532floatx80 int32_to_floatx80(int32_t a, float_status *status)
3533{
3534    flag zSign;
3535    uint32_t absA;
3536    int8_t shiftCount;
3537    uint64_t zSig;
3538
3539    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3540    zSign = ( a < 0 );
3541    absA = zSign ? - a : a;
3542    shiftCount = clz32(absA) + 32;
3543    zSig = absA;
3544    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3545
3546}
3547
3548/*----------------------------------------------------------------------------
3549| Returns the result of converting the 32-bit two's complement integer `a' to
3550| the quadruple-precision floating-point format.  The conversion is performed
3551| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552*----------------------------------------------------------------------------*/
3553
3554float128 int32_to_float128(int32_t a, float_status *status)
3555{
3556    flag zSign;
3557    uint32_t absA;
3558    int8_t shiftCount;
3559    uint64_t zSig0;
3560
3561    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3562    zSign = ( a < 0 );
3563    absA = zSign ? - a : a;
3564    shiftCount = clz32(absA) + 17;
3565    zSig0 = absA;
3566    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3567
3568}
3569
3570/*----------------------------------------------------------------------------
3571| Returns the result of converting the 64-bit two's complement integer `a'
3572| to the extended double-precision floating-point format.  The conversion
3573| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3574| Arithmetic.
3575*----------------------------------------------------------------------------*/
3576
3577floatx80 int64_to_floatx80(int64_t a, float_status *status)
3578{
3579    flag zSign;
3580    uint64_t absA;
3581    int8_t shiftCount;
3582
3583    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3584    zSign = ( a < 0 );
3585    absA = zSign ? - a : a;
3586    shiftCount = clz64(absA);
3587    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3588
3589}
3590
3591/*----------------------------------------------------------------------------
3592| Returns the result of converting the 64-bit two's complement integer `a' to
3593| the quadruple-precision floating-point format.  The conversion is performed
3594| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3595*----------------------------------------------------------------------------*/
3596
3597float128 int64_to_float128(int64_t a, float_status *status)
3598{
3599    flag zSign;
3600    uint64_t absA;
3601    int8_t shiftCount;
3602    int32_t zExp;
3603    uint64_t zSig0, zSig1;
3604
3605    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3606    zSign = ( a < 0 );
3607    absA = zSign ? - a : a;
3608    shiftCount = clz64(absA) + 49;
3609    zExp = 0x406E - shiftCount;
3610    if ( 64 <= shiftCount ) {
3611        zSig1 = 0;
3612        zSig0 = absA;
3613        shiftCount -= 64;
3614    }
3615    else {
3616        zSig1 = absA;
3617        zSig0 = 0;
3618    }
3619    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3620    return packFloat128( zSign, zExp, zSig0, zSig1 );
3621
3622}
3623
3624/*----------------------------------------------------------------------------
3625| Returns the result of converting the 64-bit unsigned integer `a'
3626| to the quadruple-precision floating-point format.  The conversion is performed
3627| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628*----------------------------------------------------------------------------*/
3629
3630float128 uint64_to_float128(uint64_t a, float_status *status)
3631{
3632    if (a == 0) {
3633        return float128_zero;
3634    }
3635    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3636}
3637
3638/*----------------------------------------------------------------------------
3639| Returns the result of converting the single-precision floating-point value
3640| `a' to the extended double-precision floating-point format.  The conversion
3641| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3642| Arithmetic.
3643*----------------------------------------------------------------------------*/
3644
3645floatx80 float32_to_floatx80(float32 a, float_status *status)
3646{
3647    flag aSign;
3648    int aExp;
3649    uint32_t aSig;
3650
3651    a = float32_squash_input_denormal(a, status);
3652    aSig = extractFloat32Frac( a );
3653    aExp = extractFloat32Exp( a );
3654    aSign = extractFloat32Sign( a );
3655    if ( aExp == 0xFF ) {
3656        if (aSig) {
3657            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3658        }
3659        return packFloatx80(aSign,
3660                            floatx80_infinity_high,
3661                            floatx80_infinity_low);
3662    }
3663    if ( aExp == 0 ) {
3664        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3665        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3666    }
3667    aSig |= 0x00800000;
3668    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3669
3670}
3671
3672/*----------------------------------------------------------------------------
3673| Returns the result of converting the single-precision floating-point value
3674| `a' to the double-precision floating-point format.  The conversion is
3675| performed according to the IEC/IEEE Standard for Binary Floating-Point
3676| Arithmetic.
3677*----------------------------------------------------------------------------*/
3678
3679float128 float32_to_float128(float32 a, float_status *status)
3680{
3681    flag aSign;
3682    int aExp;
3683    uint32_t aSig;
3684
3685    a = float32_squash_input_denormal(a, status);
3686    aSig = extractFloat32Frac( a );
3687    aExp = extractFloat32Exp( a );
3688    aSign = extractFloat32Sign( a );
3689    if ( aExp == 0xFF ) {
3690        if (aSig) {
3691            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3692        }
3693        return packFloat128( aSign, 0x7FFF, 0, 0 );
3694    }
3695    if ( aExp == 0 ) {
3696        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3697        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3698        --aExp;
3699    }
3700    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3701
3702}
3703
3704/*----------------------------------------------------------------------------
3705| Returns the remainder of the single-precision floating-point value `a'
3706| with respect to the corresponding value `b'.  The operation is performed
3707| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708*----------------------------------------------------------------------------*/
3709
3710float32 float32_rem(float32 a, float32 b, float_status *status)
3711{
3712    flag aSign, zSign;
3713    int aExp, bExp, expDiff;
3714    uint32_t aSig, bSig;
3715    uint32_t q;
3716    uint64_t aSig64, bSig64, q64;
3717    uint32_t alternateASig;
3718    int32_t sigMean;
3719    a = float32_squash_input_denormal(a, status);
3720    b = float32_squash_input_denormal(b, status);
3721
3722    aSig = extractFloat32Frac( a );
3723    aExp = extractFloat32Exp( a );
3724    aSign = extractFloat32Sign( a );
3725    bSig = extractFloat32Frac( b );
3726    bExp = extractFloat32Exp( b );
3727    if ( aExp == 0xFF ) {
3728        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3729            return propagateFloat32NaN(a, b, status);
3730        }
3731        float_raise(float_flag_invalid, status);
3732        return float32_default_nan(status);
3733    }
3734    if ( bExp == 0xFF ) {
3735        if (bSig) {
3736            return propagateFloat32NaN(a, b, status);
3737        }
3738        return a;
3739    }
3740    if ( bExp == 0 ) {
3741        if ( bSig == 0 ) {
3742            float_raise(float_flag_invalid, status);
3743            return float32_default_nan(status);
3744        }
3745        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3746    }
3747    if ( aExp == 0 ) {
3748        if ( aSig == 0 ) return a;
3749        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3750    }
3751    expDiff = aExp - bExp;
3752    aSig |= 0x00800000;
3753    bSig |= 0x00800000;
3754    if ( expDiff < 32 ) {
3755        aSig <<= 8;
3756        bSig <<= 8;
3757        if ( expDiff < 0 ) {
3758            if ( expDiff < -1 ) return a;
3759            aSig >>= 1;
3760        }
3761        q = ( bSig <= aSig );
3762        if ( q ) aSig -= bSig;
3763        if ( 0 < expDiff ) {
3764            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3765            q >>= 32 - expDiff;
3766            bSig >>= 2;
3767            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3768        }
3769        else {
3770            aSig >>= 2;
3771            bSig >>= 2;
3772        }
3773    }
3774    else {
3775        if ( bSig <= aSig ) aSig -= bSig;
3776        aSig64 = ( (uint64_t) aSig )<<40;
3777        bSig64 = ( (uint64_t) bSig )<<40;
3778        expDiff -= 64;
3779        while ( 0 < expDiff ) {
3780            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3781            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3782            aSig64 = - ( ( bSig * q64 )<<38 );
3783            expDiff -= 62;
3784        }
3785        expDiff += 64;
3786        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3787        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3788        q = q64>>( 64 - expDiff );
3789        bSig <<= 6;
3790        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3791    }
3792    do {
3793        alternateASig = aSig;
3794        ++q;
3795        aSig -= bSig;
3796    } while ( 0 <= (int32_t) aSig );
3797    sigMean = aSig + alternateASig;
3798    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3799        aSig = alternateASig;
3800    }
3801    zSign = ( (int32_t) aSig < 0 );
3802    if ( zSign ) aSig = - aSig;
3803    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3804}
3805
3806
3807
3808/*----------------------------------------------------------------------------
3809| Returns the binary exponential of the single-precision floating-point value
3810| `a'. The operation is performed according to the IEC/IEEE Standard for
3811| Binary Floating-Point Arithmetic.
3812|
3813| Uses the following identities:
3814|
3815| 1. -------------------------------------------------------------------------
3816|      x    x*ln(2)
3817|     2  = e
3818|
3819| 2. -------------------------------------------------------------------------
3820|                      2     3     4     5           n
3821|      x        x     x     x     x     x           x
3822|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3823|               1!    2!    3!    4!    5!          n!
3824*----------------------------------------------------------------------------*/
3825
3826static const float64 float32_exp2_coefficients[15] =
3827{
3828    const_float64( 0x3ff0000000000000ll ), /*  1 */
3829    const_float64( 0x3fe0000000000000ll ), /*  2 */
3830    const_float64( 0x3fc5555555555555ll ), /*  3 */
3831    const_float64( 0x3fa5555555555555ll ), /*  4 */
3832    const_float64( 0x3f81111111111111ll ), /*  5 */
3833    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3834    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3835    const_float64( 0x3efa01a01a01a01all ), /*  8 */
3836    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3837    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3838    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3839    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3840    const_float64( 0x3de6124613a86d09ll ), /* 13 */
3841    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3842    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3843};
3844
3845float32 float32_exp2(float32 a, float_status *status)
3846{
3847    flag aSign;
3848    int aExp;
3849    uint32_t aSig;
3850    float64 r, x, xn;
3851    int i;
3852    a = float32_squash_input_denormal(a, status);
3853
3854    aSig = extractFloat32Frac( a );
3855    aExp = extractFloat32Exp( a );
3856    aSign = extractFloat32Sign( a );
3857
3858    if ( aExp == 0xFF) {
3859        if (aSig) {
3860            return propagateFloat32NaN(a, float32_zero, status);
3861        }
3862        return (aSign) ? float32_zero : a;
3863    }
3864    if (aExp == 0) {
3865        if (aSig == 0) return float32_one;
3866    }
3867
3868    float_raise(float_flag_inexact, status);
3869
3870    /* ******************************* */
3871    /* using float64 for approximation */
3872    /* ******************************* */
3873    x = float32_to_float64(a, status);
3874    x = float64_mul(x, float64_ln2, status);
3875
3876    xn = x;
3877    r = float64_one;
3878    for (i = 0 ; i < 15 ; i++) {
3879        float64 f;
3880
3881        f = float64_mul(xn, float32_exp2_coefficients[i], status);
3882        r = float64_add(r, f, status);
3883
3884        xn = float64_mul(xn, x, status);
3885    }
3886
3887    return float64_to_float32(r, status);
3888}
3889
3890/*----------------------------------------------------------------------------
3891| Returns the binary log of the single-precision floating-point value `a'.
3892| The operation is performed according to the IEC/IEEE Standard for Binary
3893| Floating-Point Arithmetic.
3894*----------------------------------------------------------------------------*/
3895float32 float32_log2(float32 a, float_status *status)
3896{
3897    flag aSign, zSign;
3898    int aExp;
3899    uint32_t aSig, zSig, i;
3900
3901    a = float32_squash_input_denormal(a, status);
3902    aSig = extractFloat32Frac( a );
3903    aExp = extractFloat32Exp( a );
3904    aSign = extractFloat32Sign( a );
3905
3906    if ( aExp == 0 ) {
3907        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3908        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3909    }
3910    if ( aSign ) {
3911        float_raise(float_flag_invalid, status);
3912        return float32_default_nan(status);
3913    }
3914    if ( aExp == 0xFF ) {
3915        if (aSig) {
3916            return propagateFloat32NaN(a, float32_zero, status);
3917        }
3918        return a;
3919    }
3920
3921    aExp -= 0x7F;
3922    aSig |= 0x00800000;
3923    zSign = aExp < 0;
3924    zSig = aExp << 23;
3925
3926    for (i = 1 << 22; i > 0; i >>= 1) {
3927        aSig = ( (uint64_t)aSig * aSig ) >> 23;
3928        if ( aSig & 0x01000000 ) {
3929            aSig >>= 1;
3930            zSig |= i;
3931        }
3932    }
3933
3934    if ( zSign )
3935        zSig = -zSig;
3936
3937    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3938}
3939
3940/*----------------------------------------------------------------------------
3941| Returns 1 if the single-precision floating-point value `a' is equal to
3942| the corresponding value `b', and 0 otherwise.  The invalid exception is
3943| raised if either operand is a NaN.  Otherwise, the comparison is performed
3944| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3945*----------------------------------------------------------------------------*/
3946
3947int float32_eq(float32 a, float32 b, float_status *status)
3948{
3949    uint32_t av, bv;
3950    a = float32_squash_input_denormal(a, status);
3951    b = float32_squash_input_denormal(b, status);
3952
3953    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3954         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3955       ) {
3956        float_raise(float_flag_invalid, status);
3957        return 0;
3958    }
3959    av = float32_val(a);
3960    bv = float32_val(b);
3961    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3962}
3963
3964/*----------------------------------------------------------------------------
3965| Returns 1 if the single-precision floating-point value `a' is less than
3966| or equal to the corresponding value `b', and 0 otherwise.  The invalid
3967| exception is raised if either operand is a NaN.  The comparison is performed
3968| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969*----------------------------------------------------------------------------*/
3970
3971int float32_le(float32 a, float32 b, float_status *status)
3972{
3973    flag aSign, bSign;
3974    uint32_t av, bv;
3975    a = float32_squash_input_denormal(a, status);
3976    b = float32_squash_input_denormal(b, status);
3977
3978    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3979         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3980       ) {
3981        float_raise(float_flag_invalid, status);
3982        return 0;
3983    }
3984    aSign = extractFloat32Sign( a );
3985    bSign = extractFloat32Sign( b );
3986    av = float32_val(a);
3987    bv = float32_val(b);
3988    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3989    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3990
3991}
3992
3993/*----------------------------------------------------------------------------
3994| Returns 1 if the single-precision floating-point value `a' is less than
3995| the corresponding value `b', and 0 otherwise.  The invalid exception is
3996| raised if either operand is a NaN.  The comparison is performed according
3997| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998*----------------------------------------------------------------------------*/
3999
4000int float32_lt(float32 a, float32 b, float_status *status)
4001{
4002    flag aSign, bSign;
4003    uint32_t av, bv;
4004    a = float32_squash_input_denormal(a, status);
4005    b = float32_squash_input_denormal(b, status);
4006
4007    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4008         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4009       ) {
4010        float_raise(float_flag_invalid, status);
4011        return 0;
4012    }
4013    aSign = extractFloat32Sign( a );
4014    bSign = extractFloat32Sign( b );
4015    av = float32_val(a);
4016    bv = float32_val(b);
4017    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4018    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4019
4020}
4021
4022/*----------------------------------------------------------------------------
4023| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4024| be compared, and 0 otherwise.  The invalid exception is raised if either
4025| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4026| Standard for Binary Floating-Point Arithmetic.
4027*----------------------------------------------------------------------------*/
4028
4029int float32_unordered(float32 a, float32 b, float_status *status)
4030{
4031    a = float32_squash_input_denormal(a, status);
4032    b = float32_squash_input_denormal(b, status);
4033
4034    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4035         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4036       ) {
4037        float_raise(float_flag_invalid, status);
4038        return 1;
4039    }
4040    return 0;
4041}
4042
4043/*----------------------------------------------------------------------------
4044| Returns 1 if the single-precision floating-point value `a' is equal to
4045| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4046| exception.  The comparison is performed according to the IEC/IEEE Standard
4047| for Binary Floating-Point Arithmetic.
4048*----------------------------------------------------------------------------*/
4049
4050int float32_eq_quiet(float32 a, float32 b, float_status *status)
4051{
4052    a = float32_squash_input_denormal(a, status);
4053    b = float32_squash_input_denormal(b, status);
4054
4055    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4056         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4057       ) {
4058        if (float32_is_signaling_nan(a, status)
4059         || float32_is_signaling_nan(b, status)) {
4060            float_raise(float_flag_invalid, status);
4061        }
4062        return 0;
4063    }
4064    return ( float32_val(a) == float32_val(b) ) ||
4065            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4066}
4067
4068/*----------------------------------------------------------------------------
4069| Returns 1 if the single-precision floating-point value `a' is less than or
4070| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4071| cause an exception.  Otherwise, the comparison is performed according to the
4072| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073*----------------------------------------------------------------------------*/
4074
4075int float32_le_quiet(float32 a, float32 b, float_status *status)
4076{
4077    flag aSign, bSign;
4078    uint32_t av, bv;
4079    a = float32_squash_input_denormal(a, status);
4080    b = float32_squash_input_denormal(b, status);
4081
4082    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4083         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4084       ) {
4085        if (float32_is_signaling_nan(a, status)
4086         || float32_is_signaling_nan(b, status)) {
4087            float_raise(float_flag_invalid, status);
4088        }
4089        return 0;
4090    }
4091    aSign = extractFloat32Sign( a );
4092    bSign = extractFloat32Sign( b );
4093    av = float32_val(a);
4094    bv = float32_val(b);
4095    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4096    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4097
4098}
4099
4100/*----------------------------------------------------------------------------
4101| Returns 1 if the single-precision floating-point value `a' is less than
4102| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4103| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4104| Standard for Binary Floating-Point Arithmetic.
4105*----------------------------------------------------------------------------*/
4106
4107int float32_lt_quiet(float32 a, float32 b, float_status *status)
4108{
4109    flag aSign, bSign;
4110    uint32_t av, bv;
4111    a = float32_squash_input_denormal(a, status);
4112    b = float32_squash_input_denormal(b, status);
4113
4114    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4115         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4116       ) {
4117        if (float32_is_signaling_nan(a, status)
4118         || float32_is_signaling_nan(b, status)) {
4119            float_raise(float_flag_invalid, status);
4120        }
4121        return 0;
4122    }
4123    aSign = extractFloat32Sign( a );
4124    bSign = extractFloat32Sign( b );
4125    av = float32_val(a);
4126    bv = float32_val(b);
4127    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4128    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4129
4130}
4131
4132/*----------------------------------------------------------------------------
4133| Returns 1 if the single-precision floating-point values `a' and `b' cannot
4134| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4135| comparison is performed according to the IEC/IEEE Standard for Binary
4136| Floating-Point Arithmetic.
4137*----------------------------------------------------------------------------*/
4138
4139int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4140{
4141    a = float32_squash_input_denormal(a, status);
4142    b = float32_squash_input_denormal(b, status);
4143
4144    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4145         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4146       ) {
4147        if (float32_is_signaling_nan(a, status)
4148         || float32_is_signaling_nan(b, status)) {
4149            float_raise(float_flag_invalid, status);
4150        }
4151        return 1;
4152    }
4153    return 0;
4154}
4155
4156/*----------------------------------------------------------------------------
4157| If `a' is denormal and we are in flush-to-zero mode then set the
4158| input-denormal exception and return zero. Otherwise just return the value.
4159*----------------------------------------------------------------------------*/
4160float16 float16_squash_input_denormal(float16 a, float_status *status)
4161{
4162    if (status->flush_inputs_to_zero) {
4163        if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4164            float_raise(float_flag_input_denormal, status);
4165            return make_float16(float16_val(a) & 0x8000);
4166        }
4167    }
4168    return a;
4169}
4170
4171/*----------------------------------------------------------------------------
4172| Returns the result of converting the double-precision floating-point value
4173| `a' to the extended double-precision floating-point format.  The conversion
4174| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4175| Arithmetic.
4176*----------------------------------------------------------------------------*/
4177
4178floatx80 float64_to_floatx80(float64 a, float_status *status)
4179{
4180    flag aSign;
4181    int aExp;
4182    uint64_t aSig;
4183
4184    a = float64_squash_input_denormal(a, status);
4185    aSig = extractFloat64Frac( a );
4186    aExp = extractFloat64Exp( a );
4187    aSign = extractFloat64Sign( a );
4188    if ( aExp == 0x7FF ) {
4189        if (aSig) {
4190            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4191        }
4192        return packFloatx80(aSign,
4193                            floatx80_infinity_high,
4194                            floatx80_infinity_low);
4195    }
4196    if ( aExp == 0 ) {
4197        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4198        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4199    }
4200    return
4201        packFloatx80(
4202            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4203
4204}
4205
4206/*----------------------------------------------------------------------------
4207| Returns the result of converting the double-precision floating-point value
4208| `a' to the quadruple-precision floating-point format.  The conversion is
4209| performed according to the IEC/IEEE Standard for Binary Floating-Point
4210| Arithmetic.
4211*----------------------------------------------------------------------------*/
4212
4213float128 float64_to_float128(float64 a, float_status *status)
4214{
4215    flag aSign;
4216    int aExp;
4217    uint64_t aSig, zSig0, zSig1;
4218
4219    a = float64_squash_input_denormal(a, status);
4220    aSig = extractFloat64Frac( a );
4221    aExp = extractFloat64Exp( a );
4222    aSign = extractFloat64Sign( a );
4223    if ( aExp == 0x7FF ) {
4224        if (aSig) {
4225            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4226        }
4227        return packFloat128( aSign, 0x7FFF, 0, 0 );
4228    }
4229    if ( aExp == 0 ) {
4230        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4231        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4232        --aExp;
4233    }
4234    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4235    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4236
4237}
4238
4239
4240/*----------------------------------------------------------------------------
4241| Returns the remainder of the double-precision floating-point value `a'
4242| with respect to the corresponding value `b'.  The operation is performed
4243| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4244*----------------------------------------------------------------------------*/
4245
4246float64 float64_rem(float64 a, float64 b, float_status *status)
4247{
4248    flag aSign, zSign;
4249    int aExp, bExp, expDiff;
4250    uint64_t aSig, bSig;
4251    uint64_t q, alternateASig;
4252    int64_t sigMean;
4253
4254    a = float64_squash_input_denormal(a, status);
4255    b = float64_squash_input_denormal(b, status);
4256    aSig = extractFloat64Frac( a );
4257    aExp = extractFloat64Exp( a );
4258    aSign = extractFloat64Sign( a );
4259    bSig = extractFloat64Frac( b );
4260    bExp = extractFloat64Exp( b );
4261    if ( aExp == 0x7FF ) {
4262        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4263            return propagateFloat64NaN(a, b, status);
4264        }
4265        float_raise(float_flag_invalid, status);
4266        return float64_default_nan(status);
4267    }
4268    if ( bExp == 0x7FF ) {
4269        if (bSig) {
4270            return propagateFloat64NaN(a, b, status);
4271        }
4272        return a;
4273    }
4274    if ( bExp == 0 ) {
4275        if ( bSig == 0 ) {
4276            float_raise(float_flag_invalid, status);
4277            return float64_default_nan(status);
4278        }
4279        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4280    }
4281    if ( aExp == 0 ) {
4282        if ( aSig == 0 ) return a;
4283        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4284    }
4285    expDiff = aExp - bExp;
4286    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4287    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4288    if ( expDiff < 0 ) {
4289        if ( expDiff < -1 ) return a;
4290        aSig >>= 1;
4291    }
4292    q = ( bSig <= aSig );
4293    if ( q ) aSig -= bSig;
4294    expDiff -= 64;
4295    while ( 0 < expDiff ) {
4296        q = estimateDiv128To64( aSig, 0, bSig );
4297        q = ( 2 < q ) ? q - 2 : 0;
4298        aSig = - ( ( bSig>>2 ) * q );
4299        expDiff -= 62;
4300    }
4301    expDiff += 64;
4302    if ( 0 < expDiff ) {
4303        q = estimateDiv128To64( aSig, 0, bSig );
4304        q = ( 2 < q ) ? q - 2 : 0;
4305        q >>= 64 - expDiff;
4306        bSig >>= 2;
4307        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4308    }
4309    else {
4310        aSig >>= 2;
4311        bSig >>= 2;
4312    }
4313    do {
4314        alternateASig = aSig;
4315        ++q;
4316        aSig -= bSig;
4317    } while ( 0 <= (int64_t) aSig );
4318    sigMean = aSig + alternateASig;
4319    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4320        aSig = alternateASig;
4321    }
4322    zSign = ( (int64_t) aSig < 0 );
4323    if ( zSign ) aSig = - aSig;
4324    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4325
4326}
4327
4328/*----------------------------------------------------------------------------
4329| Returns the binary log of the double-precision floating-point value `a'.
4330| The operation is performed according to the IEC/IEEE Standard for Binary
4331| Floating-Point Arithmetic.
4332*----------------------------------------------------------------------------*/
4333float64 float64_log2(float64 a, float_status *status)
4334{
4335    flag aSign, zSign;
4336    int aExp;
4337    uint64_t aSig, aSig0, aSig1, zSig, i;
4338    a = float64_squash_input_denormal(a, status);
4339
4340    aSig = extractFloat64Frac( a );
4341    aExp = extractFloat64Exp( a );
4342    aSign = extractFloat64Sign( a );
4343
4344    if ( aExp == 0 ) {
4345        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4346        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4347    }
4348    if ( aSign ) {
4349        float_raise(float_flag_invalid, status);
4350        return float64_default_nan(status);
4351    }
4352    if ( aExp == 0x7FF ) {
4353        if (aSig) {
4354            return propagateFloat64NaN(a, float64_zero, status);
4355        }
4356        return a;
4357    }
4358
4359    aExp -= 0x3FF;
4360    aSig |= LIT64( 0x0010000000000000 );
4361    zSign = aExp < 0;
4362    zSig = (uint64_t)aExp << 52;
4363    for (i = 1LL << 51; i > 0; i >>= 1) {
4364        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4365        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4366        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4367            aSig >>= 1;
4368            zSig |= i;
4369        }
4370    }
4371
4372    if ( zSign )
4373        zSig = -zSig;
4374    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4375}
4376
4377/*----------------------------------------------------------------------------
4378| Returns 1 if the double-precision floating-point value `a' is equal to the
4379| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4380| if either operand is a NaN.  Otherwise, the comparison is performed
4381| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4382*----------------------------------------------------------------------------*/
4383
4384int float64_eq(float64 a, float64 b, float_status *status)
4385{
4386    uint64_t av, bv;
4387    a = float64_squash_input_denormal(a, status);
4388    b = float64_squash_input_denormal(b, status);
4389
4390    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4391         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4392       ) {
4393        float_raise(float_flag_invalid, status);
4394        return 0;
4395    }
4396    av = float64_val(a);
4397    bv = float64_val(b);
4398    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4399
4400}
4401
4402/*----------------------------------------------------------------------------
4403| Returns 1 if the double-precision floating-point value `a' is less than or
4404| equal to the corresponding value `b', and 0 otherwise.  The invalid
4405| exception is raised if either operand is a NaN.  The comparison is performed
4406| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4407*----------------------------------------------------------------------------*/
4408
4409int float64_le(float64 a, float64 b, float_status *status)
4410{
4411    flag aSign, bSign;
4412    uint64_t av, bv;
4413    a = float64_squash_input_denormal(a, status);
4414    b = float64_squash_input_denormal(b, status);
4415
4416    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4417         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4418       ) {
4419        float_raise(float_flag_invalid, status);
4420        return 0;
4421    }
4422    aSign = extractFloat64Sign( a );
4423    bSign = extractFloat64Sign( b );
4424    av = float64_val(a);
4425    bv = float64_val(b);
4426    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4427    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4428
4429}
4430
4431/*----------------------------------------------------------------------------
4432| Returns 1 if the double-precision floating-point value `a' is less than
4433| the corresponding value `b', and 0 otherwise.  The invalid exception is
4434| raised if either operand is a NaN.  The comparison is performed according
4435| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4436*----------------------------------------------------------------------------*/
4437
4438int float64_lt(float64 a, float64 b, float_status *status)
4439{
4440    flag aSign, bSign;
4441    uint64_t av, bv;
4442
4443    a = float64_squash_input_denormal(a, status);
4444    b = float64_squash_input_denormal(b, status);
4445    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4446         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4447       ) {
4448        float_raise(float_flag_invalid, status);
4449        return 0;
4450    }
4451    aSign = extractFloat64Sign( a );
4452    bSign = extractFloat64Sign( b );
4453    av = float64_val(a);
4454    bv = float64_val(b);
4455    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4456    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4457
4458}
4459
4460/*----------------------------------------------------------------------------
4461| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4462| be compared, and 0 otherwise.  The invalid exception is raised if either
4463| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4464| Standard for Binary Floating-Point Arithmetic.
4465*----------------------------------------------------------------------------*/
4466
4467int float64_unordered(float64 a, float64 b, float_status *status)
4468{
4469    a = float64_squash_input_denormal(a, status);
4470    b = float64_squash_input_denormal(b, status);
4471
4472    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4474       ) {
4475        float_raise(float_flag_invalid, status);
4476        return 1;
4477    }
4478    return 0;
4479}
4480
4481/*----------------------------------------------------------------------------
4482| Returns 1 if the double-precision floating-point value `a' is equal to the
4483| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4484| exception.The comparison is performed according to the IEC/IEEE Standard
4485| for Binary Floating-Point Arithmetic.
4486*----------------------------------------------------------------------------*/
4487
4488int float64_eq_quiet(float64 a, float64 b, float_status *status)
4489{
4490    uint64_t av, bv;
4491    a = float64_squash_input_denormal(a, status);
4492    b = float64_squash_input_denormal(b, status);
4493
4494    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4495         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4496       ) {
4497        if (float64_is_signaling_nan(a, status)
4498         || float64_is_signaling_nan(b, status)) {
4499            float_raise(float_flag_invalid, status);
4500        }
4501        return 0;
4502    }
4503    av = float64_val(a);
4504    bv = float64_val(b);
4505    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4506
4507}
4508
4509/*----------------------------------------------------------------------------
4510| Returns 1 if the double-precision floating-point value `a' is less than or
4511| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4512| cause an exception.  Otherwise, the comparison is performed according to the
4513| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4514*----------------------------------------------------------------------------*/
4515
4516int float64_le_quiet(float64 a, float64 b, float_status *status)
4517{
4518    flag aSign, bSign;
4519    uint64_t av, bv;
4520    a = float64_squash_input_denormal(a, status);
4521    b = float64_squash_input_denormal(b, status);
4522
4523    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4524         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4525       ) {
4526        if (float64_is_signaling_nan(a, status)
4527         || float64_is_signaling_nan(b, status)) {
4528            float_raise(float_flag_invalid, status);
4529        }
4530        return 0;
4531    }
4532    aSign = extractFloat64Sign( a );
4533    bSign = extractFloat64Sign( b );
4534    av = float64_val(a);
4535    bv = float64_val(b);
4536    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4537    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4538
4539}
4540
4541/*----------------------------------------------------------------------------
4542| Returns 1 if the double-precision floating-point value `a' is less than
4543| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4544| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4545| Standard for Binary Floating-Point Arithmetic.
4546*----------------------------------------------------------------------------*/
4547
4548int float64_lt_quiet(float64 a, float64 b, float_status *status)
4549{
4550    flag aSign, bSign;
4551    uint64_t av, bv;
4552    a = float64_squash_input_denormal(a, status);
4553    b = float64_squash_input_denormal(b, status);
4554
4555    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4556         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4557       ) {
4558        if (float64_is_signaling_nan(a, status)
4559         || float64_is_signaling_nan(b, status)) {
4560            float_raise(float_flag_invalid, status);
4561        }
4562        return 0;
4563    }
4564    aSign = extractFloat64Sign( a );
4565    bSign = extractFloat64Sign( b );
4566    av = float64_val(a);
4567    bv = float64_val(b);
4568    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4569    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4570
4571}
4572
4573/*----------------------------------------------------------------------------
4574| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4575| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4576| comparison is performed according to the IEC/IEEE Standard for Binary
4577| Floating-Point Arithmetic.
4578*----------------------------------------------------------------------------*/
4579
4580int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4581{
4582    a = float64_squash_input_denormal(a, status);
4583    b = float64_squash_input_denormal(b, status);
4584
4585    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4586         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4587       ) {
4588        if (float64_is_signaling_nan(a, status)
4589         || float64_is_signaling_nan(b, status)) {
4590            float_raise(float_flag_invalid, status);
4591        }
4592        return 1;
4593    }
4594    return 0;
4595}
4596
4597/*----------------------------------------------------------------------------
4598| Returns the result of converting the extended double-precision floating-
4599| point value `a' to the 32-bit two's complement integer format.  The
4600| conversion is performed according to the IEC/IEEE Standard for Binary
4601| Floating-Point Arithmetic---which means in particular that the conversion
4602| is rounded according to the current rounding mode.  If `a' is a NaN, the
4603| largest positive integer is returned.  Otherwise, if the conversion
4604| overflows, the largest integer with the same sign as `a' is returned.
4605*----------------------------------------------------------------------------*/
4606
4607int32_t floatx80_to_int32(floatx80 a, float_status *status)
4608{
4609    flag aSign;
4610    int32_t aExp, shiftCount;
4611    uint64_t aSig;
4612
4613    if (floatx80_invalid_encoding(a)) {
4614        float_raise(float_flag_invalid, status);
4615        return 1 << 31;
4616    }
4617    aSig = extractFloatx80Frac( a );
4618    aExp = extractFloatx80Exp( a );
4619    aSign = extractFloatx80Sign( a );
4620    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4621    shiftCount = 0x4037 - aExp;
4622    if ( shiftCount <= 0 ) shiftCount = 1;
4623    shift64RightJamming( aSig, shiftCount, &aSig );
4624    return roundAndPackInt32(aSign, aSig, status);
4625
4626}
4627
4628/*----------------------------------------------------------------------------
4629| Returns the result of converting the extended double-precision floating-
4630| point value `a' to the 32-bit two's complement integer format.  The
4631| conversion is performed according to the IEC/IEEE Standard for Binary
4632| Floating-Point Arithmetic, except that the conversion is always rounded
4633| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4634| Otherwise, if the conversion overflows, the largest integer with the same
4635| sign as `a' is returned.
4636*----------------------------------------------------------------------------*/
4637
4638int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4639{
4640    flag aSign;
4641    int32_t aExp, shiftCount;
4642    uint64_t aSig, savedASig;
4643    int32_t z;
4644
4645    if (floatx80_invalid_encoding(a)) {
4646        float_raise(float_flag_invalid, status);
4647        return 1 << 31;
4648    }
4649    aSig = extractFloatx80Frac( a );
4650    aExp = extractFloatx80Exp( a );
4651    aSign = extractFloatx80Sign( a );
4652    if ( 0x401E < aExp ) {
4653        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4654        goto invalid;
4655    }
4656    else if ( aExp < 0x3FFF ) {
4657        if (aExp || aSig) {
4658            status->float_exception_flags |= float_flag_inexact;
4659        }
4660        return 0;
4661    }
4662    shiftCount = 0x403E - aExp;
4663    savedASig = aSig;
4664    aSig >>= shiftCount;
4665    z = aSig;
4666    if ( aSign ) z = - z;
4667    if ( ( z < 0 ) ^ aSign ) {
4668 invalid:
4669        float_raise(float_flag_invalid, status);
4670        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4671    }
4672    if ( ( aSig<<shiftCount ) != savedASig ) {
4673        status->float_exception_flags |= float_flag_inexact;
4674    }
4675    return z;
4676
4677}
4678
4679/*----------------------------------------------------------------------------
4680| Returns the result of converting the extended double-precision floating-
4681| point value `a' to the 64-bit two's complement integer format.  The
4682| conversion is performed according to the IEC/IEEE Standard for Binary
4683| Floating-Point Arithmetic---which means in particular that the conversion
4684| is rounded according to the current rounding mode.  If `a' is a NaN,
4685| the largest positive integer is returned.  Otherwise, if the conversion
4686| overflows, the largest integer with the same sign as `a' is returned.
4687*----------------------------------------------------------------------------*/
4688
4689int64_t floatx80_to_int64(floatx80 a, float_status *status)
4690{
4691    flag aSign;
4692    int32_t aExp, shiftCount;
4693    uint64_t aSig, aSigExtra;
4694
4695    if (floatx80_invalid_encoding(a)) {
4696        float_raise(float_flag_invalid, status);
4697        return 1ULL << 63;
4698    }
4699    aSig = extractFloatx80Frac( a );
4700    aExp = extractFloatx80Exp( a );
4701    aSign = extractFloatx80Sign( a );
4702    shiftCount = 0x403E - aExp;
4703    if ( shiftCount <= 0 ) {
4704        if ( shiftCount ) {
4705            float_raise(float_flag_invalid, status);
4706            if (!aSign || floatx80_is_any_nan(a)) {
4707                return LIT64( 0x7FFFFFFFFFFFFFFF );
4708            }
4709            return (int64_t) LIT64( 0x8000000000000000 );
4710        }
4711        aSigExtra = 0;
4712    }
4713    else {
4714        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4715    }
4716    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4717
4718}
4719
4720/*----------------------------------------------------------------------------
4721| Returns the result of converting the extended double-precision floating-
4722| point value `a' to the 64-bit two's complement integer format.  The
4723| conversion is performed according to the IEC/IEEE Standard for Binary
4724| Floating-Point Arithmetic, except that the conversion is always rounded
4725| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4726| Otherwise, if the conversion overflows, the largest integer with the same
4727| sign as `a' is returned.
4728*----------------------------------------------------------------------------*/
4729
4730int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4731{
4732    flag aSign;
4733    int32_t aExp, shiftCount;
4734    uint64_t aSig;
4735    int64_t z;
4736
4737    if (floatx80_invalid_encoding(a)) {
4738        float_raise(float_flag_invalid, status);
4739        return 1ULL << 63;
4740    }
4741    aSig = extractFloatx80Frac( a );
4742    aExp = extractFloatx80Exp( a );
4743    aSign = extractFloatx80Sign( a );
4744    shiftCount = aExp - 0x403E;
4745    if ( 0 <= shiftCount ) {
4746        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4747        if ( ( a.high != 0xC03E ) || aSig ) {
4748            float_raise(float_flag_invalid, status);
4749            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4750                return LIT64( 0x7FFFFFFFFFFFFFFF );
4751            }
4752        }
4753        return (int64_t) LIT64( 0x8000000000000000 );
4754    }
4755    else if ( aExp < 0x3FFF ) {
4756        if (aExp | aSig) {
4757            status->float_exception_flags |= float_flag_inexact;
4758        }
4759        return 0;
4760    }
4761    z = aSig>>( - shiftCount );
4762    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4763        status->float_exception_flags |= float_flag_inexact;
4764    }
4765    if ( aSign ) z = - z;
4766    return z;
4767
4768}
4769
4770/*----------------------------------------------------------------------------
4771| Returns the result of converting the extended double-precision floating-
4772| point value `a' to the single-precision floating-point format.  The
4773| conversion is performed according to the IEC/IEEE Standard for Binary
4774| Floating-Point Arithmetic.
4775*----------------------------------------------------------------------------*/
4776
4777float32 floatx80_to_float32(floatx80 a, float_status *status)
4778{
4779    flag aSign;
4780    int32_t aExp;
4781    uint64_t aSig;
4782
4783    if (floatx80_invalid_encoding(a)) {
4784        float_raise(float_flag_invalid, status);
4785        return float32_default_nan(status);
4786    }
4787    aSig = extractFloatx80Frac( a );
4788    aExp = extractFloatx80Exp( a );
4789    aSign = extractFloatx80Sign( a );
4790    if ( aExp == 0x7FFF ) {
4791        if ( (uint64_t) ( aSig<<1 ) ) {
4792            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4793        }
4794        return packFloat32( aSign, 0xFF, 0 );
4795    }
4796    shift64RightJamming( aSig, 33, &aSig );
4797    if ( aExp || aSig ) aExp -= 0x3F81;
4798    return roundAndPackFloat32(aSign, aExp, aSig, status);
4799
4800}
4801
4802/*----------------------------------------------------------------------------
4803| Returns the result of converting the extended double-precision floating-
4804| point value `a' to the double-precision floating-point format.  The
4805| conversion is performed according to the IEC/IEEE Standard for Binary
4806| Floating-Point Arithmetic.
4807*----------------------------------------------------------------------------*/
4808
4809float64 floatx80_to_float64(floatx80 a, float_status *status)
4810{
4811    flag aSign;
4812    int32_t aExp;
4813    uint64_t aSig, zSig;
4814
4815    if (floatx80_invalid_encoding(a)) {
4816        float_raise(float_flag_invalid, status);
4817        return float64_default_nan(status);
4818    }
4819    aSig = extractFloatx80Frac( a );
4820    aExp = extractFloatx80Exp( a );
4821    aSign = extractFloatx80Sign( a );
4822    if ( aExp == 0x7FFF ) {
4823        if ( (uint64_t) ( aSig<<1 ) ) {
4824            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4825        }
4826        return packFloat64( aSign, 0x7FF, 0 );
4827    }
4828    shift64RightJamming( aSig, 1, &zSig );
4829    if ( aExp || aSig ) aExp -= 0x3C01;
4830    return roundAndPackFloat64(aSign, aExp, zSig, status);
4831
4832}
4833
4834/*----------------------------------------------------------------------------
4835| Returns the result of converting the extended double-precision floating-
4836| point value `a' to the quadruple-precision floating-point format.  The
4837| conversion is performed according to the IEC/IEEE Standard for Binary
4838| Floating-Point Arithmetic.
4839*----------------------------------------------------------------------------*/
4840
4841float128 floatx80_to_float128(floatx80 a, float_status *status)
4842{
4843    flag aSign;
4844    int aExp;
4845    uint64_t aSig, zSig0, zSig1;
4846
4847    if (floatx80_invalid_encoding(a)) {
4848        float_raise(float_flag_invalid, status);
4849        return float128_default_nan(status);
4850    }
4851    aSig = extractFloatx80Frac( a );
4852    aExp = extractFloatx80Exp( a );
4853    aSign = extractFloatx80Sign( a );
4854    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4855        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4856    }
4857    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4858    return packFloat128( aSign, aExp, zSig0, zSig1 );
4859
4860}
4861
4862/*----------------------------------------------------------------------------
4863| Rounds the extended double-precision floating-point value `a'
4864| to the precision provided by floatx80_rounding_precision and returns the
4865| result as an extended double-precision floating-point value.
4866| The operation is performed according to the IEC/IEEE Standard for Binary
4867| Floating-Point Arithmetic.
4868*----------------------------------------------------------------------------*/
4869
4870floatx80 floatx80_round(floatx80 a, float_status *status)
4871{
4872    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4873                                extractFloatx80Sign(a),
4874                                extractFloatx80Exp(a),
4875                                extractFloatx80Frac(a), 0, status);
4876}
4877
4878/*----------------------------------------------------------------------------
4879| Rounds the extended double-precision floating-point value `a' to an integer,
4880| and returns the result as an extended quadruple-precision floating-point
4881| value.  The operation is performed according to the IEC/IEEE Standard for
4882| Binary Floating-Point Arithmetic.
4883*----------------------------------------------------------------------------*/
4884
4885floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4886{
4887    flag aSign;
4888    int32_t aExp;
4889    uint64_t lastBitMask, roundBitsMask;
4890    floatx80 z;
4891
4892    if (floatx80_invalid_encoding(a)) {
4893        float_raise(float_flag_invalid, status);
4894        return floatx80_default_nan(status);
4895    }
4896    aExp = extractFloatx80Exp( a );
4897    if ( 0x403E <= aExp ) {
4898        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4899            return propagateFloatx80NaN(a, a, status);
4900        }
4901        return a;
4902    }
4903    if ( aExp < 0x3FFF ) {
4904        if (    ( aExp == 0 )
4905             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4906            return a;
4907        }
4908        status->float_exception_flags |= float_flag_inexact;
4909        aSign = extractFloatx80Sign( a );
4910        switch (status->float_rounding_mode) {
4911         case float_round_nearest_even:
4912            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4913               ) {
4914                return
4915                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4916            }
4917            break;
4918        case float_round_ties_away:
4919            if (aExp == 0x3FFE) {
4920                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4921            }
4922            break;
4923         case float_round_down:
4924            return
4925                  aSign ?
4926                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4927                : packFloatx80( 0, 0, 0 );
4928         case float_round_up:
4929            return
4930                  aSign ? packFloatx80( 1, 0, 0 )
4931                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4932        }
4933        return packFloatx80( aSign, 0, 0 );
4934    }
4935    lastBitMask = 1;
4936    lastBitMask <<= 0x403E - aExp;
4937    roundBitsMask = lastBitMask - 1;
4938    z = a;
4939    switch (status->float_rounding_mode) {
4940    case float_round_nearest_even:
4941        z.low += lastBitMask>>1;
4942        if ((z.low & roundBitsMask) == 0) {
4943            z.low &= ~lastBitMask;
4944        }
4945        break;
4946    case float_round_ties_away:
4947        z.low += lastBitMask >> 1;
4948        break;
4949    case float_round_to_zero:
4950        break;
4951    case float_round_up:
4952        if (!extractFloatx80Sign(z)) {
4953            z.low += roundBitsMask;
4954        }
4955        break;
4956    case float_round_down:
4957        if (extractFloatx80Sign(z)) {
4958            z.low += roundBitsMask;
4959        }
4960        break;
4961    default:
4962        abort();
4963    }
4964    z.low &= ~ roundBitsMask;
4965    if ( z.low == 0 ) {
4966        ++z.high;
4967        z.low = LIT64( 0x8000000000000000 );
4968    }
4969    if (z.low != a.low) {
4970        status->float_exception_flags |= float_flag_inexact;
4971    }
4972    return z;
4973
4974}
4975
4976/*----------------------------------------------------------------------------
4977| Returns the result of adding the absolute values of the extended double-
4978| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4979| negated before being returned.  `zSign' is ignored if the result is a NaN.
4980| The addition is performed according to the IEC/IEEE Standard for Binary
4981| Floating-Point Arithmetic.
4982*----------------------------------------------------------------------------*/
4983
4984static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4985                                float_status *status)
4986{
4987    int32_t aExp, bExp, zExp;
4988    uint64_t aSig, bSig, zSig0, zSig1;
4989    int32_t expDiff;
4990
4991    aSig = extractFloatx80Frac( a );
4992    aExp = extractFloatx80Exp( a );
4993    bSig = extractFloatx80Frac( b );
4994    bExp = extractFloatx80Exp( b );
4995    expDiff = aExp - bExp;
4996    if ( 0 < expDiff ) {
4997        if ( aExp == 0x7FFF ) {
4998            if ((uint64_t)(aSig << 1)) {
4999                return propagateFloatx80NaN(a, b, status);
5000            }
5001            return a;
5002        }
5003        if ( bExp == 0 ) --expDiff;
5004        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5005        zExp = aExp;
5006    }
5007    else if ( expDiff < 0 ) {
5008        if ( bExp == 0x7FFF ) {
5009            if ((uint64_t)(bSig << 1)) {
5010                return propagateFloatx80NaN(a, b, status);
5011            }
5012            return packFloatx80(zSign,
5013                                floatx80_infinity_high,
5014                                floatx80_infinity_low);
5015        }
5016        if ( aExp == 0 ) ++expDiff;
5017        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5018        zExp = bExp;
5019    }
5020    else {
5021        if ( aExp == 0x7FFF ) {
5022            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5023                return propagateFloatx80NaN(a, b, status);
5024            }
5025            return a;
5026        }
5027        zSig1 = 0;
5028        zSig0 = aSig + bSig;
5029        if ( aExp == 0 ) {
5030            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5031            goto roundAndPack;
5032        }
5033        zExp = aExp;
5034        goto shiftRight1;
5035    }
5036    zSig0 = aSig + bSig;
5037    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5038 shiftRight1:
5039    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5040    zSig0 |= LIT64( 0x8000000000000000 );
5041    ++zExp;
5042 roundAndPack:
5043    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5044                                zSign, zExp, zSig0, zSig1, status);
5045}
5046
5047/*----------------------------------------------------------------------------
5048| Returns the result of subtracting the absolute values of the extended
5049| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5050| difference is negated before being returned.  `zSign' is ignored if the
5051| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5052| Standard for Binary Floating-Point Arithmetic.
5053*----------------------------------------------------------------------------*/
5054
5055static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5056                                float_status *status)
5057{
5058    int32_t aExp, bExp, zExp;
5059    uint64_t aSig, bSig, zSig0, zSig1;
5060    int32_t expDiff;
5061
5062    aSig = extractFloatx80Frac( a );
5063    aExp = extractFloatx80Exp( a );
5064    bSig = extractFloatx80Frac( b );
5065    bExp = extractFloatx80Exp( b );
5066    expDiff = aExp - bExp;
5067    if ( 0 < expDiff ) goto aExpBigger;
5068    if ( expDiff < 0 ) goto bExpBigger;
5069    if ( aExp == 0x7FFF ) {
5070        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5071            return propagateFloatx80NaN(a, b, status);
5072        }
5073        float_raise(float_flag_invalid, status);
5074        return floatx80_default_nan(status);
5075    }
5076    if ( aExp == 0 ) {
5077        aExp = 1;
5078        bExp = 1;
5079    }
5080    zSig1 = 0;
5081    if ( bSig < aSig ) goto aBigger;
5082    if ( aSig < bSig ) goto bBigger;
5083    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5084 bExpBigger:
5085    if ( bExp == 0x7FFF ) {
5086        if ((uint64_t)(bSig << 1)) {
5087            return propagateFloatx80NaN(a, b, status);
5088        }
5089        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5090                            floatx80_infinity_low);
5091    }
5092    if ( aExp == 0 ) ++expDiff;
5093    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5094 bBigger:
5095    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5096    zExp = bExp;
5097    zSign ^= 1;
5098    goto normalizeRoundAndPack;
5099 aExpBigger:
5100    if ( aExp == 0x7FFF ) {
5101        if ((uint64_t)(aSig << 1)) {
5102            return propagateFloatx80NaN(a, b, status);
5103        }
5104        return a;
5105    }
5106    if ( bExp == 0 ) --expDiff;
5107    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5108 aBigger:
5109    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5110    zExp = aExp;
5111 normalizeRoundAndPack:
5112    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5113                                         zSign, zExp, zSig0, zSig1, status);
5114}
5115
5116/*----------------------------------------------------------------------------
5117| Returns the result of adding the extended double-precision floating-point
5118| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5119| Standard for Binary Floating-Point Arithmetic.
5120*----------------------------------------------------------------------------*/
5121
5122floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5123{
5124    flag aSign, bSign;
5125
5126    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5127        float_raise(float_flag_invalid, status);
5128        return floatx80_default_nan(status);
5129    }
5130    aSign = extractFloatx80Sign( a );
5131    bSign = extractFloatx80Sign( b );
5132    if ( aSign == bSign ) {
5133        return addFloatx80Sigs(a, b, aSign, status);
5134    }
5135    else {
5136        return subFloatx80Sigs(a, b, aSign, status);
5137    }
5138
5139}
5140
5141/*----------------------------------------------------------------------------
5142| Returns the result of subtracting the extended double-precision floating-
5143| point values `a' and `b'.  The operation is performed according to the
5144| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5145*----------------------------------------------------------------------------*/
5146
5147floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5148{
5149    flag aSign, bSign;
5150
5151    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5152        float_raise(float_flag_invalid, status);
5153        return floatx80_default_nan(status);
5154    }
5155    aSign = extractFloatx80Sign( a );
5156    bSign = extractFloatx80Sign( b );
5157    if ( aSign == bSign ) {
5158        return subFloatx80Sigs(a, b, aSign, status);
5159    }
5160    else {
5161        return addFloatx80Sigs(a, b, aSign, status);
5162    }
5163
5164}
5165
5166/*----------------------------------------------------------------------------
5167| Returns the result of multiplying the extended double-precision floating-
5168| point values `a' and `b'.  The operation is performed according to the
5169| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5170*----------------------------------------------------------------------------*/
5171
5172floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5173{
5174    flag aSign, bSign, zSign;
5175    int32_t aExp, bExp, zExp;
5176    uint64_t aSig, bSig, zSig0, zSig1;
5177
5178    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5179        float_raise(float_flag_invalid, status);
5180        return floatx80_default_nan(status);
5181    }
5182    aSig = extractFloatx80Frac( a );
5183    aExp = extractFloatx80Exp( a );
5184    aSign = extractFloatx80Sign( a );
5185    bSig = extractFloatx80Frac( b );
5186    bExp = extractFloatx80Exp( b );
5187    bSign = extractFloatx80Sign( b );
5188    zSign = aSign ^ bSign;
5189    if ( aExp == 0x7FFF ) {
5190        if (    (uint64_t) ( aSig<<1 )
5191             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5192            return propagateFloatx80NaN(a, b, status);
5193        }
5194        if ( ( bExp | bSig ) == 0 ) goto invalid;
5195        return packFloatx80(zSign, floatx80_infinity_high,
5196                                   floatx80_infinity_low);
5197    }
5198    if ( bExp == 0x7FFF ) {
5199        if ((uint64_t)(bSig << 1)) {
5200            return propagateFloatx80NaN(a, b, status);
5201        }
5202        if ( ( aExp | aSig ) == 0 ) {
5203 invalid:
5204            float_raise(float_flag_invalid, status);
5205            return floatx80_default_nan(status);
5206        }
5207        return packFloatx80(zSign, floatx80_infinity_high,
5208                                   floatx80_infinity_low);
5209    }
5210    if ( aExp == 0 ) {
5211        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5213    }
5214    if ( bExp == 0 ) {
5215        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5216        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5217    }
5218    zExp = aExp + bExp - 0x3FFE;
5219    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5220    if ( 0 < (int64_t) zSig0 ) {
5221        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5222        --zExp;
5223    }
5224    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225                                zSign, zExp, zSig0, zSig1, status);
5226}
5227
5228/*----------------------------------------------------------------------------
5229| Returns the result of dividing the extended double-precision floating-point
5230| value `a' by the corresponding value `b'.  The operation is performed
5231| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232*----------------------------------------------------------------------------*/
5233
5234floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5235{
5236    flag aSign, bSign, zSign;
5237    int32_t aExp, bExp, zExp;
5238    uint64_t aSig, bSig, zSig0, zSig1;
5239    uint64_t rem0, rem1, rem2, term0, term1, term2;
5240
5241    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242        float_raise(float_flag_invalid, status);
5243        return floatx80_default_nan(status);
5244    }
5245    aSig = extractFloatx80Frac( a );
5246    aExp = extractFloatx80Exp( a );
5247    aSign = extractFloatx80Sign( a );
5248    bSig = extractFloatx80Frac( b );
5249    bExp = extractFloatx80Exp( b );
5250    bSign = extractFloatx80Sign( b );
5251    zSign = aSign ^ bSign;
5252    if ( aExp == 0x7FFF ) {
5253        if ((uint64_t)(aSig << 1)) {
5254            return propagateFloatx80NaN(a, b, status);
5255        }
5256        if ( bExp == 0x7FFF ) {
5257            if ((uint64_t)(bSig << 1)) {
5258                return propagateFloatx80NaN(a, b, status);
5259            }
5260            goto invalid;
5261        }
5262        return packFloatx80(zSign, floatx80_infinity_high,
5263                                   floatx80_infinity_low);
5264    }
5265    if ( bExp == 0x7FFF ) {
5266        if ((uint64_t)(bSig << 1)) {
5267            return propagateFloatx80NaN(a, b, status);
5268        }
5269        return packFloatx80( zSign, 0, 0 );
5270    }
5271    if ( bExp == 0 ) {
5272        if ( bSig == 0 ) {
5273            if ( ( aExp | aSig ) == 0 ) {
5274 invalid:
5275                float_raise(float_flag_invalid, status);
5276                return floatx80_default_nan(status);
5277            }
5278            float_raise(float_flag_divbyzero, status);
5279            return packFloatx80(zSign, floatx80_infinity_high,
5280                                       floatx80_infinity_low);
5281        }
5282        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5283    }
5284    if ( aExp == 0 ) {
5285        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5286        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5287    }
5288    zExp = aExp - bExp + 0x3FFE;
5289    rem1 = 0;
5290    if ( bSig <= aSig ) {
5291        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5292        ++zExp;
5293    }
5294    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5295    mul64To128( bSig, zSig0, &term0, &term1 );
5296    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5297    while ( (int64_t) rem0 < 0 ) {
5298        --zSig0;
5299        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5300    }
5301    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5302    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5303        mul64To128( bSig, zSig1, &term1, &term2 );
5304        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5305        while ( (int64_t) rem1 < 0 ) {
5306            --zSig1;
5307            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5308        }
5309        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5310    }
5311    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5312                                zSign, zExp, zSig0, zSig1, status);
5313}
5314
5315/*----------------------------------------------------------------------------
5316| Returns the remainder of the extended double-precision floating-point value
5317| `a' with respect to the corresponding value `b'.  The operation is performed
5318| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319*----------------------------------------------------------------------------*/
5320
5321floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5322{
5323    flag aSign, zSign;
5324    int32_t aExp, bExp, expDiff;
5325    uint64_t aSig0, aSig1, bSig;
5326    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5327
5328    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5329        float_raise(float_flag_invalid, status);
5330        return floatx80_default_nan(status);
5331    }
5332    aSig0 = extractFloatx80Frac( a );
5333    aExp = extractFloatx80Exp( a );
5334    aSign = extractFloatx80Sign( a );
5335    bSig = extractFloatx80Frac( b );
5336    bExp = extractFloatx80Exp( b );
5337    if ( aExp == 0x7FFF ) {
5338        if (    (uint64_t) ( aSig0<<1 )
5339             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5340            return propagateFloatx80NaN(a, b, status);
5341        }
5342        goto invalid;
5343    }
5344    if ( bExp == 0x7FFF ) {
5345        if ((uint64_t)(bSig << 1)) {
5346            return propagateFloatx80NaN(a, b, status);
5347        }
5348        return a;
5349    }
5350    if ( bExp == 0 ) {
5351        if ( bSig == 0 ) {
5352 invalid:
5353            float_raise(float_flag_invalid, status);
5354            return floatx80_default_nan(status);
5355        }
5356        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5357    }
5358    if ( aExp == 0 ) {
5359        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5360        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5361    }
5362    bSig |= LIT64( 0x8000000000000000 );
5363    zSign = aSign;
5364    expDiff = aExp - bExp;
5365    aSig1 = 0;
5366    if ( expDiff < 0 ) {
5367        if ( expDiff < -1 ) return a;
5368        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5369        expDiff = 0;
5370    }
5371    q = ( bSig <= aSig0 );
5372    if ( q ) aSig0 -= bSig;
5373    expDiff -= 64;
5374    while ( 0 < expDiff ) {
5375        q = estimateDiv128To64( aSig0, aSig1, bSig );
5376        q = ( 2 < q ) ? q - 2 : 0;
5377        mul64To128( bSig, q, &term0, &term1 );
5378        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5379        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5380        expDiff -= 62;
5381    }
5382    expDiff += 64;
5383    if ( 0 < expDiff ) {
5384        q = estimateDiv128To64( aSig0, aSig1, bSig );
5385        q = ( 2 < q ) ? q - 2 : 0;
5386        q >>= 64 - expDiff;
5387        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5388        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5389        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5390        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5391            ++q;
5392            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5393        }
5394    }
5395    else {
5396        term1 = 0;
5397        term0 = bSig;
5398    }
5399    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5400    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5401         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5402              && ( q & 1 ) )
5403       ) {
5404        aSig0 = alternateASig0;
5405        aSig1 = alternateASig1;
5406        zSign = ! zSign;
5407    }
5408    return
5409        normalizeRoundAndPackFloatx80(
5410            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5411
5412}
5413
5414/*----------------------------------------------------------------------------
5415| Returns the square root of the extended double-precision floating-point
5416| value `a'.  The operation is performed according to the IEC/IEEE Standard
5417| for Binary Floating-Point Arithmetic.
5418*----------------------------------------------------------------------------*/
5419
5420floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5421{
5422    flag aSign;
5423    int32_t aExp, zExp;
5424    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5425    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5426
5427    if (floatx80_invalid_encoding(a)) {
5428        float_raise(float_flag_invalid, status);
5429        return floatx80_default_nan(status);
5430    }
5431    aSig0 = extractFloatx80Frac( a );
5432    aExp = extractFloatx80Exp( a );
5433    aSign = extractFloatx80Sign( a );
5434    if ( aExp == 0x7FFF ) {
5435        if ((uint64_t)(aSig0 << 1)) {
5436            return propagateFloatx80NaN(a, a, status);
5437        }
5438        if ( ! aSign ) return a;
5439        goto invalid;
5440    }
5441    if ( aSign ) {
5442        if ( ( aExp | aSig0 ) == 0 ) return a;
5443 invalid:
5444        float_raise(float_flag_invalid, status);
5445        return floatx80_default_nan(status);
5446    }
5447    if ( aExp == 0 ) {
5448        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5449        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5450    }
5451    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5452    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5453    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5454    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5455    doubleZSig0 = zSig0<<1;
5456    mul64To128( zSig0, zSig0, &term0, &term1 );
5457    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5458    while ( (int64_t) rem0 < 0 ) {
5459        --zSig0;
5460        doubleZSig0 -= 2;
5461        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5462    }
5463    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5464    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5465        if ( zSig1 == 0 ) zSig1 = 1;
5466        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5467        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5468        mul64To128( zSig1, zSig1, &term2, &term3 );
5469        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5470        while ( (int64_t) rem1 < 0 ) {
5471            --zSig1;
5472            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5473            term3 |= 1;
5474            term2 |= doubleZSig0;
5475            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5476        }
5477        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5478    }
5479    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5480    zSig0 |= doubleZSig0;
5481    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5482                                0, zExp, zSig0, zSig1, status);
5483}
5484
5485/*----------------------------------------------------------------------------
5486| Returns 1 if the extended double-precision floating-point value `a' is equal
5487| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5488| raised if either operand is a NaN.  Otherwise, the comparison is performed
5489| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5490*----------------------------------------------------------------------------*/
5491
5492int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5493{
5494
5495    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5496        || (extractFloatx80Exp(a) == 0x7FFF
5497            && (uint64_t) (extractFloatx80Frac(a) << 1))
5498        || (extractFloatx80Exp(b) == 0x7FFF
5499            && (uint64_t) (extractFloatx80Frac(b) << 1))
5500       ) {
5501        float_raise(float_flag_invalid, status);
5502        return 0;
5503    }
5504    return
5505           ( a.low == b.low )
5506        && (    ( a.high == b.high )
5507             || (    ( a.low == 0 )
5508                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5509           );
5510
5511}
5512
5513/*----------------------------------------------------------------------------
5514| Returns 1 if the extended double-precision floating-point value `a' is
5515| less than or equal to the corresponding value `b', and 0 otherwise.  The
5516| invalid exception is raised if either operand is a NaN.  The comparison is
5517| performed according to the IEC/IEEE Standard for Binary Floating-Point
5518| Arithmetic.
5519*----------------------------------------------------------------------------*/
5520
5521int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5522{
5523    flag aSign, bSign;
5524
5525    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5526        || (extractFloatx80Exp(a) == 0x7FFF
5527            && (uint64_t) (extractFloatx80Frac(a) << 1))
5528        || (extractFloatx80Exp(b) == 0x7FFF
5529            && (uint64_t) (extractFloatx80Frac(b) << 1))
5530       ) {
5531        float_raise(float_flag_invalid, status);
5532        return 0;
5533    }
5534    aSign = extractFloatx80Sign( a );
5535    bSign = extractFloatx80Sign( b );
5536    if ( aSign != bSign ) {
5537        return
5538               aSign
5539            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5540                 == 0 );
5541    }
5542    return
5543          aSign ? le128( b.high, b.low, a.high, a.low )
5544        : le128( a.high, a.low, b.high, b.low );
5545
5546}
5547
5548/*----------------------------------------------------------------------------
5549| Returns 1 if the extended double-precision floating-point value `a' is
5550| less than the corresponding value `b', and 0 otherwise.  The invalid
5551| exception is raised if either operand is a NaN.  The comparison is performed
5552| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5553*----------------------------------------------------------------------------*/
5554
5555int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5556{
5557    flag aSign, bSign;
5558
5559    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5560        || (extractFloatx80Exp(a) == 0x7FFF
5561            && (uint64_t) (extractFloatx80Frac(a) << 1))
5562        || (extractFloatx80Exp(b) == 0x7FFF
5563            && (uint64_t) (extractFloatx80Frac(b) << 1))
5564       ) {
5565        float_raise(float_flag_invalid, status);
5566        return 0;
5567    }
5568    aSign = extractFloatx80Sign( a );
5569    bSign = extractFloatx80Sign( b );
5570    if ( aSign != bSign ) {
5571        return
5572               aSign
5573            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574                 != 0 );
5575    }
5576    return
5577          aSign ? lt128( b.high, b.low, a.high, a.low )
5578        : lt128( a.high, a.low, b.high, b.low );
5579
5580}
5581
5582/*----------------------------------------------------------------------------
5583| Returns 1 if the extended double-precision floating-point values `a' and `b'
5584| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5585| either operand is a NaN.   The comparison is performed according to the
5586| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5587*----------------------------------------------------------------------------*/
5588int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5589{
5590    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5591        || (extractFloatx80Exp(a) == 0x7FFF
5592            && (uint64_t) (extractFloatx80Frac(a) << 1))
5593        || (extractFloatx80Exp(b) == 0x7FFF
5594            && (uint64_t) (extractFloatx80Frac(b) << 1))
5595       ) {
5596        float_raise(float_flag_invalid, status);
5597        return 1;
5598    }
5599    return 0;
5600}
5601
5602/*----------------------------------------------------------------------------
5603| Returns 1 if the extended double-precision floating-point value `a' is
5604| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5605| cause an exception.  The comparison is performed according to the IEC/IEEE
5606| Standard for Binary Floating-Point Arithmetic.
5607*----------------------------------------------------------------------------*/
5608
5609int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5610{
5611
5612    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5613        float_raise(float_flag_invalid, status);
5614        return 0;
5615    }
5616    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5617              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5618         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5619              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5620       ) {
5621        if (floatx80_is_signaling_nan(a, status)
5622         || floatx80_is_signaling_nan(b, status)) {
5623            float_raise(float_flag_invalid, status);
5624        }
5625        return 0;
5626    }
5627    return
5628           ( a.low == b.low )
5629        && (    ( a.high == b.high )
5630             || (    ( a.low == 0 )
5631                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5632           );
5633
5634}
5635
5636/*----------------------------------------------------------------------------
5637| Returns 1 if the extended double-precision floating-point value `a' is less
5638| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5639| do not cause an exception.  Otherwise, the comparison is performed according
5640| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5641*----------------------------------------------------------------------------*/
5642
5643int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5644{
5645    flag aSign, bSign;
5646
5647    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5648        float_raise(float_flag_invalid, status);
5649        return 0;
5650    }
5651    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5652              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5653         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5654              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5655       ) {
5656        if (floatx80_is_signaling_nan(a, status)
5657         || floatx80_is_signaling_nan(b, status)) {
5658            float_raise(float_flag_invalid, status);
5659        }
5660        return 0;
5661    }
5662    aSign = extractFloatx80Sign( a );
5663    bSign = extractFloatx80Sign( b );
5664    if ( aSign != bSign ) {
5665        return
5666               aSign
5667            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5668                 == 0 );
5669    }
5670    return
5671          aSign ? le128( b.high, b.low, a.high, a.low )
5672        : le128( a.high, a.low, b.high, b.low );
5673
5674}
5675
5676/*----------------------------------------------------------------------------
5677| Returns 1 if the extended double-precision floating-point value `a' is less
5678| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5679| an exception.  Otherwise, the comparison is performed according to the
5680| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5681*----------------------------------------------------------------------------*/
5682
5683int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5684{
5685    flag aSign, bSign;
5686
5687    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5688        float_raise(float_flag_invalid, status);
5689        return 0;
5690    }
5691    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5692              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5693         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5694              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5695       ) {
5696        if (floatx80_is_signaling_nan(a, status)
5697         || floatx80_is_signaling_nan(b, status)) {
5698            float_raise(float_flag_invalid, status);
5699        }
5700        return 0;
5701    }
5702    aSign = extractFloatx80Sign( a );
5703    bSign = extractFloatx80Sign( b );
5704    if ( aSign != bSign ) {
5705        return
5706               aSign
5707            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5708                 != 0 );
5709    }
5710    return
5711          aSign ? lt128( b.high, b.low, a.high, a.low )
5712        : lt128( a.high, a.low, b.high, b.low );
5713
5714}
5715
5716/*----------------------------------------------------------------------------
5717| Returns 1 if the extended double-precision floating-point values `a' and `b'
5718| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5719| The comparison is performed according to the IEC/IEEE Standard for Binary
5720| Floating-Point Arithmetic.
5721*----------------------------------------------------------------------------*/
5722int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5723{
5724    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5725        float_raise(float_flag_invalid, status);
5726        return 1;
5727    }
5728    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5729              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5730         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5731              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5732       ) {
5733        if (floatx80_is_signaling_nan(a, status)
5734         || floatx80_is_signaling_nan(b, status)) {
5735            float_raise(float_flag_invalid, status);
5736        }
5737        return 1;
5738    }
5739    return 0;
5740}
5741
5742/*----------------------------------------------------------------------------
5743| Returns the result of converting the quadruple-precision floating-point
5744| value `a' to the 32-bit two's complement integer format.  The conversion
5745| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5746| Arithmetic---which means in particular that the conversion is rounded
5747| according to the current rounding mode.  If `a' is a NaN, the largest
5748| positive integer is returned.  Otherwise, if the conversion overflows, the
5749| largest integer with the same sign as `a' is returned.
5750*----------------------------------------------------------------------------*/
5751
5752int32_t float128_to_int32(float128 a, float_status *status)
5753{
5754    flag aSign;
5755    int32_t aExp, shiftCount;
5756    uint64_t aSig0, aSig1;
5757
5758    aSig1 = extractFloat128Frac1( a );
5759    aSig0 = extractFloat128Frac0( a );
5760    aExp = extractFloat128Exp( a );
5761    aSign = extractFloat128Sign( a );
5762    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5763    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5764    aSig0 |= ( aSig1 != 0 );
5765    shiftCount = 0x4028 - aExp;
5766    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5767    return roundAndPackInt32(aSign, aSig0, status);
5768
5769}
5770
5771/*----------------------------------------------------------------------------
5772| Returns the result of converting the quadruple-precision floating-point
5773| value `a' to the 32-bit two's complement integer format.  The conversion
5774| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5775| Arithmetic, except that the conversion is always rounded toward zero.  If
5776| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5777| conversion overflows, the largest integer with the same sign as `a' is
5778| returned.
5779*----------------------------------------------------------------------------*/
5780
5781int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5782{
5783    flag aSign;
5784    int32_t aExp, shiftCount;
5785    uint64_t aSig0, aSig1, savedASig;
5786    int32_t z;
5787
5788    aSig1 = extractFloat128Frac1( a );
5789    aSig0 = extractFloat128Frac0( a );
5790    aExp = extractFloat128Exp( a );
5791    aSign = extractFloat128Sign( a );
5792    aSig0 |= ( aSig1 != 0 );
5793    if ( 0x401E < aExp ) {
5794        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5795        goto invalid;
5796    }
5797    else if ( aExp < 0x3FFF ) {
5798        if (aExp || aSig0) {
5799            status->float_exception_flags |= float_flag_inexact;
5800        }
5801        return 0;
5802    }
5803    aSig0 |= LIT64( 0x0001000000000000 );
5804    shiftCount = 0x402F - aExp;
5805    savedASig = aSig0;
5806    aSig0 >>= shiftCount;
5807    z = aSig0;
5808    if ( aSign ) z = - z;
5809    if ( ( z < 0 ) ^ aSign ) {
5810 invalid:
5811        float_raise(float_flag_invalid, status);
5812        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5813    }
5814    if ( ( aSig0<<shiftCount ) != savedASig ) {
5815        status->float_exception_flags |= float_flag_inexact;
5816    }
5817    return z;
5818
5819}
5820
5821/*----------------------------------------------------------------------------
5822| Returns the result of converting the quadruple-precision floating-point
5823| value `a' to the 64-bit two's complement integer format.  The conversion
5824| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5825| Arithmetic---which means in particular that the conversion is rounded
5826| according to the current rounding mode.  If `a' is a NaN, the largest
5827| positive integer is returned.  Otherwise, if the conversion overflows, the
5828| largest integer with the same sign as `a' is returned.
5829*----------------------------------------------------------------------------*/
5830
5831int64_t float128_to_int64(float128 a, float_status *status)
5832{
5833    flag aSign;
5834    int32_t aExp, shiftCount;
5835    uint64_t aSig0, aSig1;
5836
5837    aSig1 = extractFloat128Frac1( a );
5838    aSig0 = extractFloat128Frac0( a );
5839    aExp = extractFloat128Exp( a );
5840    aSign = extractFloat128Sign( a );
5841    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5842    shiftCount = 0x402F - aExp;
5843    if ( shiftCount <= 0 ) {
5844        if ( 0x403E < aExp ) {
5845            float_raise(float_flag_invalid, status);
5846            if (    ! aSign
5847                 || (    ( aExp == 0x7FFF )
5848                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5849                    )
5850               ) {
5851                return LIT64( 0x7FFFFFFFFFFFFFFF );
5852            }
5853            return (int64_t) LIT64( 0x8000000000000000 );
5854        }
5855        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5856    }
5857    else {
5858        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5859    }
5860    return roundAndPackInt64(aSign, aSig0, aSig1, status);
5861
5862}
5863
5864/*----------------------------------------------------------------------------
5865| Returns the result of converting the quadruple-precision floating-point
5866| value `a' to the 64-bit two's complement integer format.  The conversion
5867| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5868| Arithmetic, except that the conversion is always rounded toward zero.
5869| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5870| the conversion overflows, the largest integer with the same sign as `a' is
5871| returned.
5872*----------------------------------------------------------------------------*/
5873
5874int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5875{
5876    flag aSign;
5877    int32_t aExp, shiftCount;
5878    uint64_t aSig0, aSig1;
5879    int64_t z;
5880
5881    aSig1 = extractFloat128Frac1( a );
5882    aSig0 = extractFloat128Frac0( a );
5883    aExp = extractFloat128Exp( a );
5884    aSign = extractFloat128Sign( a );
5885    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5886    shiftCount = aExp - 0x402F;
5887    if ( 0 < shiftCount ) {
5888        if ( 0x403E <= aExp ) {
5889            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5890            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5891                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5892                if (aSig1) {
5893                    status->float_exception_flags |= float_flag_inexact;
5894                }
5895            }
5896            else {
5897                float_raise(float_flag_invalid, status);
5898                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5899                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5900                }
5901            }
5902            return (int64_t) LIT64( 0x8000000000000000 );
5903        }
5904        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5905        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5906            status->float_exception_flags |= float_flag_inexact;
5907        }
5908    }
5909    else {
5910        if ( aExp < 0x3FFF ) {
5911            if ( aExp | aSig0 | aSig1 ) {
5912                status->float_exception_flags |= float_flag_inexact;
5913            }
5914            return 0;
5915        }
5916        z = aSig0>>( - shiftCount );
5917        if (    aSig1
5918             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5919            status->float_exception_flags |= float_flag_inexact;
5920        }
5921    }
5922    if ( aSign ) z = - z;
5923    return z;
5924
5925}
5926
5927/*----------------------------------------------------------------------------
5928| Returns the result of converting the quadruple-precision floating-point value
5929| `a' to the 64-bit unsigned integer format.  The conversion is
5930| performed according to the IEC/IEEE Standard for Binary Floating-Point
5931| Arithmetic---which means in particular that the conversion is rounded
5932| according to the current rounding mode.  If `a' is a NaN, the largest
5933| positive integer is returned.  If the conversion overflows, the
5934| largest unsigned integer is returned.  If 'a' is negative, the value is
5935| rounded and zero is returned; negative values that do not round to zero
5936| will raise the inexact exception.
5937*----------------------------------------------------------------------------*/
5938
5939uint64_t float128_to_uint64(float128 a, float_status *status)
5940{
5941    flag aSign;
5942    int aExp;
5943    int shiftCount;
5944    uint64_t aSig0, aSig1;
5945
5946    aSig0 = extractFloat128Frac0(a);
5947    aSig1 = extractFloat128Frac1(a);
5948    aExp = extractFloat128Exp(a);
5949    aSign = extractFloat128Sign(a);
5950    if (aSign && (aExp > 0x3FFE)) {
5951        float_raise(float_flag_invalid, status);
5952        if (float128_is_any_nan(a)) {
5953            return LIT64(0xFFFFFFFFFFFFFFFF);
5954        } else {
5955            return 0;
5956        }
5957    }
5958    if (aExp) {
5959        aSig0 |= LIT64(0x0001000000000000);
5960    }
5961    shiftCount = 0x402F - aExp;
5962    if (shiftCount <= 0) {
5963        if (0x403E < aExp) {
5964            float_raise(float_flag_invalid, status);
5965            return LIT64(0xFFFFFFFFFFFFFFFF);
5966        }
5967        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5968    } else {
5969        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5970    }
5971    return roundAndPackUint64(aSign, aSig0, aSig1, status);
5972}
5973
5974uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5975{
5976    uint64_t v;
5977    signed char current_rounding_mode = status->float_rounding_mode;
5978
5979    set_float_rounding_mode(float_round_to_zero, status);
5980    v = float128_to_uint64(a, status);
5981    set_float_rounding_mode(current_rounding_mode, status);
5982
5983    return v;
5984}
5985
5986/*----------------------------------------------------------------------------
5987| Returns the result of converting the quadruple-precision floating-point
5988| value `a' to the 32-bit unsigned integer format.  The conversion
5989| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5990| Arithmetic except that the conversion is always rounded toward zero.
5991| If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5992| if the conversion overflows, the largest unsigned integer is returned.
5993| If 'a' is negative, the value is rounded and zero is returned; negative
5994| values that do not round to zero will raise the inexact exception.
5995*----------------------------------------------------------------------------*/
5996
5997uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5998{
5999    uint64_t v;
6000    uint32_t res;
6001    int old_exc_flags = get_float_exception_flags(status);
6002
6003    v = float128_to_uint64_round_to_zero(a, status);
6004    if (v > 0xffffffff) {
6005        res = 0xffffffff;
6006    } else {
6007        return v;
6008    }
6009    set_float_exception_flags(old_exc_flags, status);
6010    float_raise(float_flag_invalid, status);
6011    return res;
6012}
6013
6014/*----------------------------------------------------------------------------
6015| Returns the result of converting the quadruple-precision floating-point
6016| value `a' to the single-precision floating-point format.  The conversion
6017| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6018| Arithmetic.
6019*----------------------------------------------------------------------------*/
6020
6021float32 float128_to_float32(float128 a, float_status *status)
6022{
6023    flag aSign;
6024    int32_t aExp;
6025    uint64_t aSig0, aSig1;
6026    uint32_t zSig;
6027
6028    aSig1 = extractFloat128Frac1( a );
6029    aSig0 = extractFloat128Frac0( a );
6030    aExp = extractFloat128Exp( a );
6031    aSign = extractFloat128Sign( a );
6032    if ( aExp == 0x7FFF ) {
6033        if ( aSig0 | aSig1 ) {
6034            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6035        }
6036        return packFloat32( aSign, 0xFF, 0 );
6037    }
6038    aSig0 |= ( aSig1 != 0 );
6039    shift64RightJamming( aSig0, 18, &aSig0 );
6040    zSig = aSig0;
6041    if ( aExp || zSig ) {
6042        zSig |= 0x40000000;
6043        aExp -= 0x3F81;
6044    }
6045    return roundAndPackFloat32(aSign, aExp, zSig, status);
6046
6047}
6048
6049/*----------------------------------------------------------------------------
6050| Returns the result of converting the quadruple-precision floating-point
6051| value `a' to the double-precision floating-point format.  The conversion
6052| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6053| Arithmetic.
6054*----------------------------------------------------------------------------*/
6055
6056float64 float128_to_float64(float128 a, float_status *status)
6057{
6058    flag aSign;
6059    int32_t aExp;
6060    uint64_t aSig0, aSig1;
6061
6062    aSig1 = extractFloat128Frac1( a );
6063    aSig0 = extractFloat128Frac0( a );
6064    aExp = extractFloat128Exp( a );
6065    aSign = extractFloat128Sign( a );
6066    if ( aExp == 0x7FFF ) {
6067        if ( aSig0 | aSig1 ) {
6068            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6069        }
6070        return packFloat64( aSign, 0x7FF, 0 );
6071    }
6072    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6073    aSig0 |= ( aSig1 != 0 );
6074    if ( aExp || aSig0 ) {
6075        aSig0 |= LIT64( 0x4000000000000000 );
6076        aExp -= 0x3C01;
6077    }
6078    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6079
6080}
6081
6082/*----------------------------------------------------------------------------
6083| Returns the result of converting the quadruple-precision floating-point
6084| value `a' to the extended double-precision floating-point format.  The
6085| conversion is performed according to the IEC/IEEE Standard for Binary
6086| Floating-Point Arithmetic.
6087*----------------------------------------------------------------------------*/
6088
6089floatx80 float128_to_floatx80(float128 a, float_status *status)
6090{
6091    flag aSign;
6092    int32_t aExp;
6093    uint64_t aSig0, aSig1;
6094
6095    aSig1 = extractFloat128Frac1( a );
6096    aSig0 = extractFloat128Frac0( a );
6097    aExp = extractFloat128Exp( a );
6098    aSign = extractFloat128Sign( a );
6099    if ( aExp == 0x7FFF ) {
6100        if ( aSig0 | aSig1 ) {
6101            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6102        }
6103        return packFloatx80(aSign, floatx80_infinity_high,
6104                                   floatx80_infinity_low);
6105    }
6106    if ( aExp == 0 ) {
6107        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6108        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6109    }
6110    else {
6111        aSig0 |= LIT64( 0x0001000000000000 );
6112    }
6113    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6114    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6115
6116}
6117
6118/*----------------------------------------------------------------------------
6119| Rounds the quadruple-precision floating-point value `a' to an integer, and
6120| returns the result as a quadruple-precision floating-point value.  The
6121| operation is performed according to the IEC/IEEE Standard for Binary
6122| Floating-Point Arithmetic.
6123*----------------------------------------------------------------------------*/
6124
6125float128 float128_round_to_int(float128 a, float_status *status)
6126{
6127    flag aSign;
6128    int32_t aExp;
6129    uint64_t lastBitMask, roundBitsMask;
6130    float128 z;
6131
6132    aExp = extractFloat128Exp( a );
6133    if ( 0x402F <= aExp ) {
6134        if ( 0x406F <= aExp ) {
6135            if (    ( aExp == 0x7FFF )
6136                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6137               ) {
6138                return propagateFloat128NaN(a, a, status);
6139            }
6140            return a;
6141        }
6142        lastBitMask = 1;
6143        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6144        roundBitsMask = lastBitMask - 1;
6145        z = a;
6146        switch (status->float_rounding_mode) {
6147        case float_round_nearest_even:
6148            if ( lastBitMask ) {
6149                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6150                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6151            }
6152            else {
6153                if ( (int64_t) z.low < 0 ) {
6154                    ++z.high;
6155                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6156                }
6157            }
6158            break;
6159        case float_round_ties_away:
6160            if (lastBitMask) {
6161                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6162            } else {
6163                if ((int64_t) z.low < 0) {
6164                    ++z.high;
6165                }
6166            }
6167            break;
6168        case float_round_to_zero:
6169            break;
6170        case float_round_up:
6171            if (!extractFloat128Sign(z)) {
6172                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6173            }
6174            break;
6175        case float_round_down:
6176            if (extractFloat128Sign(z)) {
6177                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6178            }
6179            break;
6180        default:
6181            abort();
6182        }
6183        z.low &= ~ roundBitsMask;
6184    }
6185    else {
6186        if ( aExp < 0x3FFF ) {
6187            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6188            status->float_exception_flags |= float_flag_inexact;
6189            aSign = extractFloat128Sign( a );
6190            switch (status->float_rounding_mode) {
6191             case float_round_nearest_even:
6192                if (    ( aExp == 0x3FFE )
6193                     && (   extractFloat128Frac0( a )
6194                          | extractFloat128Frac1( a ) )
6195                   ) {
6196                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6197                }
6198                break;
6199            case float_round_ties_away:
6200                if (aExp == 0x3FFE) {
6201                    return packFloat128(aSign, 0x3FFF, 0, 0);
6202                }
6203                break;
6204             case float_round_down:
6205                return
6206                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6207                    : packFloat128( 0, 0, 0, 0 );
6208             case float_round_up:
6209                return
6210                      aSign ? packFloat128( 1, 0, 0, 0 )
6211                    : packFloat128( 0, 0x3FFF, 0, 0 );
6212            }
6213            return packFloat128( aSign, 0, 0, 0 );
6214        }
6215        lastBitMask = 1;
6216        lastBitMask <<= 0x402F - aExp;
6217        roundBitsMask = lastBitMask - 1;
6218        z.low = 0;
6219        z.high = a.high;
6220        switch (status->float_rounding_mode) {
6221        case float_round_nearest_even:
6222            z.high += lastBitMask>>1;
6223            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6224                z.high &= ~ lastBitMask;
6225            }
6226            break;
6227        case float_round_ties_away:
6228            z.high += lastBitMask>>1;
6229            break;
6230        case float_round_to_zero:
6231            break;
6232        case float_round_up:
6233            if (!extractFloat128Sign(z)) {
6234                z.high |= ( a.low != 0 );
6235                z.high += roundBitsMask;
6236            }
6237            break;
6238        case float_round_down:
6239            if (extractFloat128Sign(z)) {
6240                z.high |= (a.low != 0);
6241                z.high += roundBitsMask;
6242            }
6243            break;
6244        default:
6245            abort();
6246        }
6247        z.high &= ~ roundBitsMask;
6248    }
6249    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6250        status->float_exception_flags |= float_flag_inexact;
6251    }
6252    return z;
6253
6254}
6255
6256/*----------------------------------------------------------------------------
6257| Returns the result of adding the absolute values of the quadruple-precision
6258| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6259| before being returned.  `zSign' is ignored if the result is a NaN.
6260| The addition is performed according to the IEC/IEEE Standard for Binary
6261| Floating-Point Arithmetic.
6262*----------------------------------------------------------------------------*/
6263
6264static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6265                                float_status *status)
6266{
6267    int32_t aExp, bExp, zExp;
6268    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6269    int32_t expDiff;
6270
6271    aSig1 = extractFloat128Frac1( a );
6272    aSig0 = extractFloat128Frac0( a );
6273    aExp = extractFloat128Exp( a );
6274    bSig1 = extractFloat128Frac1( b );
6275    bSig0 = extractFloat128Frac0( b );
6276    bExp = extractFloat128Exp( b );
6277    expDiff = aExp - bExp;
6278    if ( 0 < expDiff ) {
6279        if ( aExp == 0x7FFF ) {
6280            if (aSig0 | aSig1) {
6281                return propagateFloat128NaN(a, b, status);
6282            }
6283            return a;
6284        }
6285        if ( bExp == 0 ) {
6286            --expDiff;
6287        }
6288        else {
6289            bSig0 |= LIT64( 0x0001000000000000 );
6290        }
6291        shift128ExtraRightJamming(
6292            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6293        zExp = aExp;
6294    }
6295    else if ( expDiff < 0 ) {
6296        if ( bExp == 0x7FFF ) {
6297            if (bSig0 | bSig1) {
6298                return propagateFloat128NaN(a, b, status);
6299            }
6300            return packFloat128( zSign, 0x7FFF, 0, 0 );
6301        }
6302        if ( aExp == 0 ) {
6303            ++expDiff;
6304        }
6305        else {
6306            aSig0 |= LIT64( 0x0001000000000000 );
6307        }
6308        shift128ExtraRightJamming(
6309            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6310        zExp = bExp;
6311    }
6312    else {
6313        if ( aExp == 0x7FFF ) {
6314            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6315                return propagateFloat128NaN(a, b, status);
6316            }
6317            return a;
6318        }
6319        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6320        if ( aExp == 0 ) {
6321            if (status->flush_to_zero) {
6322                if (zSig0 | zSig1) {
6323                    float_raise(float_flag_output_denormal, status);
6324                }
6325                return packFloat128(zSign, 0, 0, 0);
6326            }
6327            return packFloat128( zSign, 0, zSig0, zSig1 );
6328        }
6329        zSig2 = 0;
6330        zSig0 |= LIT64( 0x0002000000000000 );
6331        zExp = aExp;
6332        goto shiftRight1;
6333    }
6334    aSig0 |= LIT64( 0x0001000000000000 );
6335    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6336    --zExp;
6337    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6338    ++zExp;
6339 shiftRight1:
6340    shift128ExtraRightJamming(
6341        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6342 roundAndPack:
6343    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6344
6345}
6346
6347/*----------------------------------------------------------------------------
6348| Returns the result of subtracting the absolute values of the quadruple-
6349| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6350| difference is negated before being returned.  `zSign' is ignored if the
6351| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6352| Standard for Binary Floating-Point Arithmetic.
6353*----------------------------------------------------------------------------*/
6354
6355static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6356                                float_status *status)
6357{
6358    int32_t aExp, bExp, zExp;
6359    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6360    int32_t expDiff;
6361
6362    aSig1 = extractFloat128Frac1( a );
6363    aSig0 = extractFloat128Frac0( a );
6364    aExp = extractFloat128Exp( a );
6365    bSig1 = extractFloat128Frac1( b );
6366    bSig0 = extractFloat128Frac0( b );
6367    bExp = extractFloat128Exp( b );
6368    expDiff = aExp - bExp;
6369    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6370    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6371    if ( 0 < expDiff ) goto aExpBigger;
6372    if ( expDiff < 0 ) goto bExpBigger;
6373    if ( aExp == 0x7FFF ) {
6374        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6375            return propagateFloat128NaN(a, b, status);
6376        }
6377        float_raise(float_flag_invalid, status);
6378        return float128_default_nan(status);
6379    }
6380    if ( aExp == 0 ) {
6381        aExp = 1;
6382        bExp = 1;
6383    }
6384    if ( bSig0 < aSig0 ) goto aBigger;
6385    if ( aSig0 < bSig0 ) goto bBigger;
6386    if ( bSig1 < aSig1 ) goto aBigger;
6387    if ( aSig1 < bSig1 ) goto bBigger;
6388    return packFloat128(status->float_rounding_mode == float_round_down,
6389                        0, 0, 0);
6390 bExpBigger:
6391    if ( bExp == 0x7FFF ) {
6392        if (bSig0 | bSig1) {
6393            return propagateFloat128NaN(a, b, status);
6394        }
6395        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6396    }
6397    if ( aExp == 0 ) {
6398        ++expDiff;
6399    }
6400    else {
6401        aSig0 |= LIT64( 0x4000000000000000 );
6402    }
6403    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6404    bSig0 |= LIT64( 0x4000000000000000 );
6405 bBigger:
6406    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6407    zExp = bExp;
6408    zSign ^= 1;
6409    goto normalizeRoundAndPack;
6410 aExpBigger:
6411    if ( aExp == 0x7FFF ) {
6412        if (aSig0 | aSig1) {
6413            return propagateFloat128NaN(a, b, status);
6414        }
6415        return a;
6416    }
6417    if ( bExp == 0 ) {
6418        --expDiff;
6419    }
6420    else {
6421        bSig0 |= LIT64( 0x4000000000000000 );
6422    }
6423    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6424    aSig0 |= LIT64( 0x4000000000000000 );
6425 aBigger:
6426    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6427    zExp = aExp;
6428 normalizeRoundAndPack:
6429    --zExp;
6430    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6431                                         status);
6432
6433}
6434
6435/*----------------------------------------------------------------------------
6436| Returns the result of adding the quadruple-precision floating-point values
6437| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6438| for Binary Floating-Point Arithmetic.
6439*----------------------------------------------------------------------------*/
6440
6441float128 float128_add(float128 a, float128 b, float_status *status)
6442{
6443    flag aSign, bSign;
6444
6445    aSign = extractFloat128Sign( a );
6446    bSign = extractFloat128Sign( b );
6447    if ( aSign == bSign ) {
6448        return addFloat128Sigs(a, b, aSign, status);
6449    }
6450    else {
6451        return subFloat128Sigs(a, b, aSign, status);
6452    }
6453
6454}
6455
6456/*----------------------------------------------------------------------------
6457| Returns the result of subtracting the quadruple-precision floating-point
6458| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6459| Standard for Binary Floating-Point Arithmetic.
6460*----------------------------------------------------------------------------*/
6461
6462float128 float128_sub(float128 a, float128 b, float_status *status)
6463{
6464    flag aSign, bSign;
6465
6466    aSign = extractFloat128Sign( a );
6467    bSign = extractFloat128Sign( b );
6468    if ( aSign == bSign ) {
6469        return subFloat128Sigs(a, b, aSign, status);
6470    }
6471    else {
6472        return addFloat128Sigs(a, b, aSign, status);
6473    }
6474
6475}
6476
6477/*----------------------------------------------------------------------------
6478| Returns the result of multiplying the quadruple-precision floating-point
6479| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6480| Standard for Binary Floating-Point Arithmetic.
6481*----------------------------------------------------------------------------*/
6482
6483float128 float128_mul(float128 a, float128 b, float_status *status)
6484{
6485    flag aSign, bSign, zSign;
6486    int32_t aExp, bExp, zExp;
6487    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6488
6489    aSig1 = extractFloat128Frac1( a );
6490    aSig0 = extractFloat128Frac0( a );
6491    aExp = extractFloat128Exp( a );
6492    aSign = extractFloat128Sign( a );
6493    bSig1 = extractFloat128Frac1( b );
6494    bSig0 = extractFloat128Frac0( b );
6495    bExp = extractFloat128Exp( b );
6496    bSign = extractFloat128Sign( b );
6497    zSign = aSign ^ bSign;
6498    if ( aExp == 0x7FFF ) {
6499        if (    ( aSig0 | aSig1 )
6500             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6501            return propagateFloat128NaN(a, b, status);
6502        }
6503        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6504        return packFloat128( zSign, 0x7FFF, 0, 0 );
6505    }
6506    if ( bExp == 0x7FFF ) {
6507        if (bSig0 | bSig1) {
6508            return propagateFloat128NaN(a, b, status);
6509        }
6510        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6511 invalid:
6512            float_raise(float_flag_invalid, status);
6513            return float128_default_nan(status);
6514        }
6515        return packFloat128( zSign, 0x7FFF, 0, 0 );
6516    }
6517    if ( aExp == 0 ) {
6518        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6519        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6520    }
6521    if ( bExp == 0 ) {
6522        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6523        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6524    }
6525    zExp = aExp + bExp - 0x4000;
6526    aSig0 |= LIT64( 0x0001000000000000 );
6527    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6528    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6529    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6530    zSig2 |= ( zSig3 != 0 );
6531    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6532        shift128ExtraRightJamming(
6533            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6534        ++zExp;
6535    }
6536    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6537
6538}
6539
6540/*----------------------------------------------------------------------------
6541| Returns the result of dividing the quadruple-precision floating-point value
6542| `a' by the corresponding value `b'.  The operation is performed according to
6543| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6544*----------------------------------------------------------------------------*/
6545
6546float128 float128_div(float128 a, float128 b, float_status *status)
6547{
6548    flag aSign, bSign, zSign;
6549    int32_t aExp, bExp, zExp;
6550    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6551    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6552
6553    aSig1 = extractFloat128Frac1( a );
6554    aSig0 = extractFloat128Frac0( a );
6555    aExp = extractFloat128Exp( a );
6556    aSign = extractFloat128Sign( a );
6557    bSig1 = extractFloat128Frac1( b );
6558    bSig0 = extractFloat128Frac0( b );
6559    bExp = extractFloat128Exp( b );
6560    bSign = extractFloat128Sign( b );
6561    zSign = aSign ^ bSign;
6562    if ( aExp == 0x7FFF ) {
6563        if (aSig0 | aSig1) {
6564            return propagateFloat128NaN(a, b, status);
6565        }
6566        if ( bExp == 0x7FFF ) {
6567            if (bSig0 | bSig1) {
6568                return propagateFloat128NaN(a, b, status);
6569            }
6570            goto invalid;
6571        }
6572        return packFloat128( zSign, 0x7FFF, 0, 0 );
6573    }
6574    if ( bExp == 0x7FFF ) {
6575        if (bSig0 | bSig1) {
6576            return propagateFloat128NaN(a, b, status);
6577        }
6578        return packFloat128( zSign, 0, 0, 0 );
6579    }
6580    if ( bExp == 0 ) {
6581        if ( ( bSig0 | bSig1 ) == 0 ) {
6582            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6583 invalid:
6584                float_raise(float_flag_invalid, status);
6585                return float128_default_nan(status);
6586            }
6587            float_raise(float_flag_divbyzero, status);
6588            return packFloat128( zSign, 0x7FFF, 0, 0 );
6589        }
6590        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6591    }
6592    if ( aExp == 0 ) {
6593        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6594        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6595    }
6596    zExp = aExp - bExp + 0x3FFD;
6597    shortShift128Left(
6598        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6599    shortShift128Left(
6600        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6601    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6602        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6603        ++zExp;
6604    }
6605    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6606    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6607    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6608    while ( (int64_t) rem0 < 0 ) {
6609        --zSig0;
6610        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6611    }
6612    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6613    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6614        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6615        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6616        while ( (int64_t) rem1 < 0 ) {
6617            --zSig1;
6618            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6619        }
6620        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6621    }
6622    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6623    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6624
6625}
6626
6627/*----------------------------------------------------------------------------
6628| Returns the remainder of the quadruple-precision floating-point value `a'
6629| with respect to the corresponding value `b'.  The operation is performed
6630| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6631*----------------------------------------------------------------------------*/
6632
6633float128 float128_rem(float128 a, float128 b, float_status *status)
6634{
6635    flag aSign, zSign;
6636    int32_t aExp, bExp, expDiff;
6637    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6638    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6639    int64_t sigMean0;
6640
6641    aSig1 = extractFloat128Frac1( a );
6642    aSig0 = extractFloat128Frac0( a );
6643    aExp = extractFloat128Exp( a );
6644    aSign = extractFloat128Sign( a );
6645    bSig1 = extractFloat128Frac1( b );
6646    bSig0 = extractFloat128Frac0( b );
6647    bExp = extractFloat128Exp( b );
6648    if ( aExp == 0x7FFF ) {
6649        if (    ( aSig0 | aSig1 )
6650             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6651            return propagateFloat128NaN(a, b, status);
6652        }
6653        goto invalid;
6654    }
6655    if ( bExp == 0x7FFF ) {
6656        if (bSig0 | bSig1) {
6657            return propagateFloat128NaN(a, b, status);
6658        }
6659        return a;
6660    }
6661    if ( bExp == 0 ) {
6662        if ( ( bSig0 | bSig1 ) == 0 ) {
6663 invalid:
6664            float_raise(float_flag_invalid, status);
6665            return float128_default_nan(status);
6666        }
6667        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6668    }
6669    if ( aExp == 0 ) {
6670        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6671        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6672    }
6673    expDiff = aExp - bExp;
6674    if ( expDiff < -1 ) return a;
6675    shortShift128Left(
6676        aSig0 | LIT64( 0x0001000000000000 ),
6677        aSig1,
6678        15 - ( expDiff < 0 ),
6679        &aSig0,
6680        &aSig1
6681    );
6682    shortShift128Left(
6683        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6684    q = le128( bSig0, bSig1, aSig0, aSig1 );
6685    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6686    expDiff -= 64;
6687    while ( 0 < expDiff ) {
6688        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6689        q = ( 4 < q ) ? q - 4 : 0;
6690        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6691        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6692        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6693        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6694        expDiff -= 61;
6695    }
6696    if ( -64 < expDiff ) {
6697        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6698        q = ( 4 < q ) ? q - 4 : 0;
6699        q >>= - expDiff;
6700        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6701        expDiff += 52;
6702        if ( expDiff < 0 ) {
6703            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6704        }
6705        else {
6706            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6707        }
6708        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6709        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6710    }
6711    else {
6712        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6713        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6714    }
6715    do {
6716        alternateASig0 = aSig0;
6717        alternateASig1 = aSig1;
6718        ++q;
6719        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6720    } while ( 0 <= (int64_t) aSig0 );
6721    add128(
6722        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6723    if (    ( sigMean0 < 0 )
6724         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6725        aSig0 = alternateASig0;
6726        aSig1 = alternateASig1;
6727    }
6728    zSign = ( (int64_t) aSig0 < 0 );
6729    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6730    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6731                                         status);
6732}
6733
6734/*----------------------------------------------------------------------------
6735| Returns the square root of the quadruple-precision floating-point value `a'.
6736| The operation is performed according to the IEC/IEEE Standard for Binary
6737| Floating-Point Arithmetic.
6738*----------------------------------------------------------------------------*/
6739
6740float128 float128_sqrt(float128 a, float_status *status)
6741{
6742    flag aSign;
6743    int32_t aExp, zExp;
6744    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6745    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6746
6747    aSig1 = extractFloat128Frac1( a );
6748    aSig0 = extractFloat128Frac0( a );
6749    aExp = extractFloat128Exp( a );
6750    aSign = extractFloat128Sign( a );
6751    if ( aExp == 0x7FFF ) {
6752        if (aSig0 | aSig1) {
6753            return propagateFloat128NaN(a, a, status);
6754        }
6755        if ( ! aSign ) return a;
6756        goto invalid;
6757    }
6758    if ( aSign ) {
6759        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6760 invalid:
6761        float_raise(float_flag_invalid, status);
6762        return float128_default_nan(status);
6763    }
6764    if ( aExp == 0 ) {
6765        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6766        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6767    }
6768    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6769    aSig0 |= LIT64( 0x0001000000000000 );
6770    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6771    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6772    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6773    doubleZSig0 = zSig0<<1;
6774    mul64To128( zSig0, zSig0, &term0, &term1 );
6775    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6776    while ( (int64_t) rem0 < 0 ) {
6777        --zSig0;
6778        doubleZSig0 -= 2;
6779        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6780    }
6781    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6782    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6783        if ( zSig1 == 0 ) zSig1 = 1;
6784        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6785        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6786        mul64To128( zSig1, zSig1, &term2, &term3 );
6787        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6788        while ( (int64_t) rem1 < 0 ) {
6789            --zSig1;
6790            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6791            term3 |= 1;
6792            term2 |= doubleZSig0;
6793            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6794        }
6795        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6796    }
6797    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6798    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6799
6800}
6801
6802/*----------------------------------------------------------------------------
6803| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6804| the corresponding value `b', and 0 otherwise.  The invalid exception is
6805| raised if either operand is a NaN.  Otherwise, the comparison is performed
6806| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6807*----------------------------------------------------------------------------*/
6808
6809int float128_eq(float128 a, float128 b, float_status *status)
6810{
6811
6812    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6813              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6814         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6815              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6816       ) {
6817        float_raise(float_flag_invalid, status);
6818        return 0;
6819    }
6820    return
6821           ( a.low == b.low )
6822        && (    ( a.high == b.high )
6823             || (    ( a.low == 0 )
6824                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6825           );
6826
6827}
6828
6829/*----------------------------------------------------------------------------
6830| Returns 1 if the quadruple-precision floating-point value `a' is less than
6831| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6832| exception is raised if either operand is a NaN.  The comparison is performed
6833| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6834*----------------------------------------------------------------------------*/
6835
6836int float128_le(float128 a, float128 b, float_status *status)
6837{
6838    flag aSign, bSign;
6839
6840    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6841              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6842         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6843              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6844       ) {
6845        float_raise(float_flag_invalid, status);
6846        return 0;
6847    }
6848    aSign = extractFloat128Sign( a );
6849    bSign = extractFloat128Sign( b );
6850    if ( aSign != bSign ) {
6851        return
6852               aSign
6853            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6854                 == 0 );
6855    }
6856    return
6857          aSign ? le128( b.high, b.low, a.high, a.low )
6858        : le128( a.high, a.low, b.high, b.low );
6859
6860}
6861
6862/*----------------------------------------------------------------------------
6863| Returns 1 if the quadruple-precision floating-point value `a' is less than
6864| the corresponding value `b', and 0 otherwise.  The invalid exception is
6865| raised if either operand is a NaN.  The comparison is performed according
6866| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6867*----------------------------------------------------------------------------*/
6868
6869int float128_lt(float128 a, float128 b, float_status *status)
6870{
6871    flag aSign, bSign;
6872
6873    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6874              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6875         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6876              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6877       ) {
6878        float_raise(float_flag_invalid, status);
6879        return 0;
6880    }
6881    aSign = extractFloat128Sign( a );
6882    bSign = extractFloat128Sign( b );
6883    if ( aSign != bSign ) {
6884        return
6885               aSign
6886            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6887                 != 0 );
6888    }
6889    return
6890          aSign ? lt128( b.high, b.low, a.high, a.low )
6891        : lt128( a.high, a.low, b.high, b.low );
6892
6893}
6894
6895/*----------------------------------------------------------------------------
6896| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6897| be compared, and 0 otherwise.  The invalid exception is raised if either
6898| operand is a NaN. The comparison is performed according to the IEC/IEEE
6899| Standard for Binary Floating-Point Arithmetic.
6900*----------------------------------------------------------------------------*/
6901
6902int float128_unordered(float128 a, float128 b, float_status *status)
6903{
6904    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6905              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6907              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6908       ) {
6909        float_raise(float_flag_invalid, status);
6910        return 1;
6911    }
6912    return 0;
6913}
6914
6915/*----------------------------------------------------------------------------
6916| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6917| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6918| exception.  The comparison is performed according to the IEC/IEEE Standard
6919| for Binary Floating-Point Arithmetic.
6920*----------------------------------------------------------------------------*/
6921
6922int float128_eq_quiet(float128 a, float128 b, float_status *status)
6923{
6924
6925    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6926              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6927         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6928              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6929       ) {
6930        if (float128_is_signaling_nan(a, status)
6931         || float128_is_signaling_nan(b, status)) {
6932            float_raise(float_flag_invalid, status);
6933        }
6934        return 0;
6935    }
6936    return
6937           ( a.low == b.low )
6938        && (    ( a.high == b.high )
6939             || (    ( a.low == 0 )
6940                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6941           );
6942
6943}
6944
6945/*----------------------------------------------------------------------------
6946| Returns 1 if the quadruple-precision floating-point value `a' is less than
6947| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6948| cause an exception.  Otherwise, the comparison is performed according to the
6949| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6950*----------------------------------------------------------------------------*/
6951
6952int float128_le_quiet(float128 a, float128 b, float_status *status)
6953{
6954    flag aSign, bSign;
6955
6956    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6957              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6959              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6960       ) {
6961        if (float128_is_signaling_nan(a, status)
6962         || float128_is_signaling_nan(b, status)) {
6963            float_raise(float_flag_invalid, status);
6964        }
6965        return 0;
6966    }
6967    aSign = extractFloat128Sign( a );
6968    bSign = extractFloat128Sign( b );
6969    if ( aSign != bSign ) {
6970        return
6971               aSign
6972            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6973                 == 0 );
6974    }
6975    return
6976          aSign ? le128( b.high, b.low, a.high, a.low )
6977        : le128( a.high, a.low, b.high, b.low );
6978
6979}
6980
6981/*----------------------------------------------------------------------------
6982| Returns 1 if the quadruple-precision floating-point value `a' is less than
6983| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6984| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6985| Standard for Binary Floating-Point Arithmetic.
6986*----------------------------------------------------------------------------*/
6987
6988int float128_lt_quiet(float128 a, float128 b, float_status *status)
6989{
6990    flag aSign, bSign;
6991
6992    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6993              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6994         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6995              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6996       ) {
6997        if (float128_is_signaling_nan(a, status)
6998         || float128_is_signaling_nan(b, status)) {
6999            float_raise(float_flag_invalid, status);
7000        }
7001        return 0;
7002    }
7003    aSign = extractFloat128Sign( a );
7004    bSign = extractFloat128Sign( b );
7005    if ( aSign != bSign ) {
7006        return
7007               aSign
7008            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7009                 != 0 );
7010    }
7011    return
7012          aSign ? lt128( b.high, b.low, a.high, a.low )
7013        : lt128( a.high, a.low, b.high, b.low );
7014
7015}
7016
7017/*----------------------------------------------------------------------------
7018| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7019| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7020| comparison is performed according to the IEC/IEEE Standard for Binary
7021| Floating-Point Arithmetic.
7022*----------------------------------------------------------------------------*/
7023
7024int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7025{
7026    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7027              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7028         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7029              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7030       ) {
7031        if (float128_is_signaling_nan(a, status)
7032         || float128_is_signaling_nan(b, status)) {
7033            float_raise(float_flag_invalid, status);
7034        }
7035        return 1;
7036    }
7037    return 0;
7038}
7039
7040static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7041                                            int is_quiet, float_status *status)
7042{
7043    flag aSign, bSign;
7044
7045    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7046        float_raise(float_flag_invalid, status);
7047        return float_relation_unordered;
7048    }
7049    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7050          ( extractFloatx80Frac( a )<<1 ) ) ||
7051        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7052          ( extractFloatx80Frac( b )<<1 ) )) {
7053        if (!is_quiet ||
7054            floatx80_is_signaling_nan(a, status) ||
7055            floatx80_is_signaling_nan(b, status)) {
7056            float_raise(float_flag_invalid, status);
7057        }
7058        return float_relation_unordered;
7059    }
7060    aSign = extractFloatx80Sign( a );
7061    bSign = extractFloatx80Sign( b );
7062    if ( aSign != bSign ) {
7063
7064        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7065             ( ( a.low | b.low ) == 0 ) ) {
7066            /* zero case */
7067            return float_relation_equal;
7068        } else {
7069            return 1 - (2 * aSign);
7070        }
7071    } else {
7072        if (a.low == b.low && a.high == b.high) {
7073            return float_relation_equal;
7074        } else {
7075            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7076        }
7077    }
7078}
7079
7080int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7081{
7082    return floatx80_compare_internal(a, b, 0, status);
7083}
7084
7085int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7086{
7087    return floatx80_compare_internal(a, b, 1, status);
7088}
7089
7090static inline int float128_compare_internal(float128 a, float128 b,
7091                                            int is_quiet, float_status *status)
7092{
7093    flag aSign, bSign;
7094
7095    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7096          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7097        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7098          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7099        if (!is_quiet ||
7100            float128_is_signaling_nan(a, status) ||
7101            float128_is_signaling_nan(b, status)) {
7102            float_raise(float_flag_invalid, status);
7103        }
7104        return float_relation_unordered;
7105    }
7106    aSign = extractFloat128Sign( a );
7107    bSign = extractFloat128Sign( b );
7108    if ( aSign != bSign ) {
7109        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7110            /* zero case */
7111            return float_relation_equal;
7112        } else {
7113            return 1 - (2 * aSign);
7114        }
7115    } else {
7116        if (a.low == b.low && a.high == b.high) {
7117            return float_relation_equal;
7118        } else {
7119            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7120        }
7121    }
7122}
7123
7124int float128_compare(float128 a, float128 b, float_status *status)
7125{
7126    return float128_compare_internal(a, b, 0, status);
7127}
7128
7129int float128_compare_quiet(float128 a, float128 b, float_status *status)
7130{
7131    return float128_compare_internal(a, b, 1, status);
7132}
7133
7134floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7135{
7136    flag aSign;
7137    int32_t aExp;
7138    uint64_t aSig;
7139
7140    if (floatx80_invalid_encoding(a)) {
7141        float_raise(float_flag_invalid, status);
7142        return floatx80_default_nan(status);
7143    }
7144    aSig = extractFloatx80Frac( a );
7145    aExp = extractFloatx80Exp( a );
7146    aSign = extractFloatx80Sign( a );
7147
7148    if ( aExp == 0x7FFF ) {
7149        if ( aSig<<1 ) {
7150            return propagateFloatx80NaN(a, a, status);
7151        }
7152        return a;
7153    }
7154
7155    if (aExp == 0) {
7156        if (aSig == 0) {
7157            return a;
7158        }
7159        aExp++;
7160    }
7161
7162    if (n > 0x10000) {
7163        n = 0x10000;
7164    } else if (n < -0x10000) {
7165        n = -0x10000;
7166    }
7167
7168    aExp += n;
7169    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7170                                         aSign, aExp, aSig, 0, status);
7171}
7172
7173float128 float128_scalbn(float128 a, int n, float_status *status)
7174{
7175    flag aSign;
7176    int32_t aExp;
7177    uint64_t aSig0, aSig1;
7178
7179    aSig1 = extractFloat128Frac1( a );
7180    aSig0 = extractFloat128Frac0( a );
7181    aExp = extractFloat128Exp( a );
7182    aSign = extractFloat128Sign( a );
7183    if ( aExp == 0x7FFF ) {
7184        if ( aSig0 | aSig1 ) {
7185            return propagateFloat128NaN(a, a, status);
7186        }
7187        return a;
7188    }
7189    if (aExp != 0) {
7190        aSig0 |= LIT64( 0x0001000000000000 );
7191    } else if (aSig0 == 0 && aSig1 == 0) {
7192        return a;
7193    } else {
7194        aExp++;
7195    }
7196
7197    if (n > 0x10000) {
7198        n = 0x10000;
7199    } else if (n < -0x10000) {
7200        n = -0x10000;
7201    }
7202
7203    aExp += n - 1;
7204    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7205                                         , status);
7206
7207}
7208