qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86#include "qemu/bitops.h"
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "fpu/softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Returns the fraction bits of the half-precision floating-point value `a'.
 100*----------------------------------------------------------------------------*/
 101
 102static inline uint32_t extractFloat16Frac(float16 a)
 103{
 104    return float16_val(a) & 0x3ff;
 105}
 106
 107/*----------------------------------------------------------------------------
 108| Returns the exponent bits of the half-precision floating-point value `a'.
 109*----------------------------------------------------------------------------*/
 110
 111static inline int extractFloat16Exp(float16 a)
 112{
 113    return (float16_val(a) >> 10) & 0x1f;
 114}
 115
 116/*----------------------------------------------------------------------------
 117| Returns the fraction bits of the single-precision floating-point value `a'.
 118*----------------------------------------------------------------------------*/
 119
 120static inline uint32_t extractFloat32Frac(float32 a)
 121{
 122    return float32_val(a) & 0x007FFFFF;
 123}
 124
 125/*----------------------------------------------------------------------------
 126| Returns the exponent bits of the single-precision floating-point value `a'.
 127*----------------------------------------------------------------------------*/
 128
 129static inline int extractFloat32Exp(float32 a)
 130{
 131    return (float32_val(a) >> 23) & 0xFF;
 132}
 133
 134/*----------------------------------------------------------------------------
 135| Returns the sign bit of the single-precision floating-point value `a'.
 136*----------------------------------------------------------------------------*/
 137
 138static inline flag extractFloat32Sign(float32 a)
 139{
 140    return float32_val(a) >> 31;
 141}
 142
 143/*----------------------------------------------------------------------------
 144| Returns the fraction bits of the double-precision floating-point value `a'.
 145*----------------------------------------------------------------------------*/
 146
 147static inline uint64_t extractFloat64Frac(float64 a)
 148{
 149    return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 150}
 151
 152/*----------------------------------------------------------------------------
 153| Returns the exponent bits of the double-precision floating-point value `a'.
 154*----------------------------------------------------------------------------*/
 155
 156static inline int extractFloat64Exp(float64 a)
 157{
 158    return (float64_val(a) >> 52) & 0x7FF;
 159}
 160
 161/*----------------------------------------------------------------------------
 162| Returns the sign bit of the double-precision floating-point value `a'.
 163*----------------------------------------------------------------------------*/
 164
 165static inline flag extractFloat64Sign(float64 a)
 166{
 167    return float64_val(a) >> 63;
 168}
 169
 170/*
 171 * Classify a floating point number. Everything above float_class_qnan
 172 * is a NaN so cls >= float_class_qnan is any NaN.
 173 */
 174
 175typedef enum __attribute__ ((__packed__)) {
 176    float_class_unclassified,
 177    float_class_zero,
 178    float_class_normal,
 179    float_class_inf,
 180    float_class_qnan,  /* all NaNs from here */
 181    float_class_snan,
 182} FloatClass;
 183
 184/* Simple helpers for checking if, or what kind of, NaN we have */
 185static inline __attribute__((unused)) bool is_nan(FloatClass c)
 186{
 187    return unlikely(c >= float_class_qnan);
 188}
 189
 190static inline __attribute__((unused)) bool is_snan(FloatClass c)
 191{
 192    return c == float_class_snan;
 193}
 194
 195static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 196{
 197    return c == float_class_qnan;
 198}
 199
 200/*
 201 * Structure holding all of the decomposed parts of a float. The
 202 * exponent is unbiased and the fraction is normalized. All
 203 * calculations are done with a 64 bit fraction and then rounded as
 204 * appropriate for the final format.
 205 *
 206 * Thanks to the packed FloatClass a decent compiler should be able to
 207 * fit the whole structure into registers and avoid using the stack
 208 * for parameter passing.
 209 */
 210
 211typedef struct {
 212    uint64_t frac;
 213    int32_t  exp;
 214    FloatClass cls;
 215    bool sign;
 216} FloatParts;
 217
 218#define DECOMPOSED_BINARY_POINT    (64 - 2)
 219#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 220#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 221
 222/* Structure holding all of the relevant parameters for a format.
 223 *   exp_size: the size of the exponent field
 224 *   exp_bias: the offset applied to the exponent field
 225 *   exp_max: the maximum normalised exponent
 226 *   frac_size: the size of the fraction field
 227 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 228 * The following are computed based the size of fraction
 229 *   frac_lsb: least significant bit of fraction
 230 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 231 *   round_mask/roundeven_mask: masks used for rounding
 232 * The following optional modifiers are available:
 233 *   arm_althp: handle ARM Alternative Half Precision
 234 */
 235typedef struct {
 236    int exp_size;
 237    int exp_bias;
 238    int exp_max;
 239    int frac_size;
 240    int frac_shift;
 241    uint64_t frac_lsb;
 242    uint64_t frac_lsbm1;
 243    uint64_t round_mask;
 244    uint64_t roundeven_mask;
 245    bool arm_althp;
 246} FloatFmt;
 247
 248/* Expand fields based on the size of exponent and fraction */
 249#define FLOAT_PARAMS(E, F)                                           \
 250    .exp_size       = E,                                             \
 251    .exp_bias       = ((1 << E) - 1) >> 1,                           \
 252    .exp_max        = (1 << E) - 1,                                  \
 253    .frac_size      = F,                                             \
 254    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 255    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 256    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 257    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 258    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 259
 260static const FloatFmt float16_params = {
 261    FLOAT_PARAMS(5, 10)
 262};
 263
 264static const FloatFmt float16_params_ahp = {
 265    FLOAT_PARAMS(5, 10),
 266    .arm_althp = true
 267};
 268
 269static const FloatFmt float32_params = {
 270    FLOAT_PARAMS(8, 23)
 271};
 272
 273static const FloatFmt float64_params = {
 274    FLOAT_PARAMS(11, 52)
 275};
 276
 277/* Unpack a float to parts, but do not canonicalize.  */
 278static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 279{
 280    const int sign_pos = fmt.frac_size + fmt.exp_size;
 281
 282    return (FloatParts) {
 283        .cls = float_class_unclassified,
 284        .sign = extract64(raw, sign_pos, 1),
 285        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 286        .frac = extract64(raw, 0, fmt.frac_size),
 287    };
 288}
 289
 290static inline FloatParts float16_unpack_raw(float16 f)
 291{
 292    return unpack_raw(float16_params, f);
 293}
 294
 295static inline FloatParts float32_unpack_raw(float32 f)
 296{
 297    return unpack_raw(float32_params, f);
 298}
 299
 300static inline FloatParts float64_unpack_raw(float64 f)
 301{
 302    return unpack_raw(float64_params, f);
 303}
 304
 305/* Pack a float from parts, but do not canonicalize.  */
 306static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 307{
 308    const int sign_pos = fmt.frac_size + fmt.exp_size;
 309    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 310    return deposit64(ret, sign_pos, 1, p.sign);
 311}
 312
 313static inline float16 float16_pack_raw(FloatParts p)
 314{
 315    return make_float16(pack_raw(float16_params, p));
 316}
 317
 318static inline float32 float32_pack_raw(FloatParts p)
 319{
 320    return make_float32(pack_raw(float32_params, p));
 321}
 322
 323static inline float64 float64_pack_raw(FloatParts p)
 324{
 325    return make_float64(pack_raw(float64_params, p));
 326}
 327
 328/*----------------------------------------------------------------------------
 329| Functions and definitions to determine:  (1) whether tininess for underflow
 330| is detected before or after rounding by default, (2) what (if anything)
 331| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 332| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 333| are propagated from function inputs to output.  These details are target-
 334| specific.
 335*----------------------------------------------------------------------------*/
 336#include "softfloat-specialize.h"
 337
 338/* Canonicalize EXP and FRAC, setting CLS.  */
 339static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 340                               float_status *status)
 341{
 342    if (part.exp == parm->exp_max && !parm->arm_althp) {
 343        if (part.frac == 0) {
 344            part.cls = float_class_inf;
 345        } else {
 346            part.frac <<= parm->frac_shift;
 347            part.cls = (parts_is_snan_frac(part.frac, status)
 348                        ? float_class_snan : float_class_qnan);
 349        }
 350    } else if (part.exp == 0) {
 351        if (likely(part.frac == 0)) {
 352            part.cls = float_class_zero;
 353        } else if (status->flush_inputs_to_zero) {
 354            float_raise(float_flag_input_denormal, status);
 355            part.cls = float_class_zero;
 356            part.frac = 0;
 357        } else {
 358            int shift = clz64(part.frac) - 1;
 359            part.cls = float_class_normal;
 360            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 361            part.frac <<= shift;
 362        }
 363    } else {
 364        part.cls = float_class_normal;
 365        part.exp -= parm->exp_bias;
 366        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 367    }
 368    return part;
 369}
 370
 371/* Round and uncanonicalize a floating-point number by parts. There
 372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 373 * fraction; these bits will be removed. The exponent will be biased
 374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 375 */
 376
 377static FloatParts round_canonical(FloatParts p, float_status *s,
 378                                  const FloatFmt *parm)
 379{
 380    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 381    const uint64_t round_mask = parm->round_mask;
 382    const uint64_t roundeven_mask = parm->roundeven_mask;
 383    const int exp_max = parm->exp_max;
 384    const int frac_shift = parm->frac_shift;
 385    uint64_t frac, inc;
 386    int exp, flags = 0;
 387    bool overflow_norm;
 388
 389    frac = p.frac;
 390    exp = p.exp;
 391
 392    switch (p.cls) {
 393    case float_class_normal:
 394        switch (s->float_rounding_mode) {
 395        case float_round_nearest_even:
 396            overflow_norm = false;
 397            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 398            break;
 399        case float_round_ties_away:
 400            overflow_norm = false;
 401            inc = frac_lsbm1;
 402            break;
 403        case float_round_to_zero:
 404            overflow_norm = true;
 405            inc = 0;
 406            break;
 407        case float_round_up:
 408            inc = p.sign ? 0 : round_mask;
 409            overflow_norm = p.sign;
 410            break;
 411        case float_round_down:
 412            inc = p.sign ? round_mask : 0;
 413            overflow_norm = !p.sign;
 414            break;
 415        default:
 416            g_assert_not_reached();
 417        }
 418
 419        exp += parm->exp_bias;
 420        if (likely(exp > 0)) {
 421            if (frac & round_mask) {
 422                flags |= float_flag_inexact;
 423                frac += inc;
 424                if (frac & DECOMPOSED_OVERFLOW_BIT) {
 425                    frac >>= 1;
 426                    exp++;
 427                }
 428            }
 429            frac >>= frac_shift;
 430
 431            if (parm->arm_althp) {
 432                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 433                if (unlikely(exp > exp_max)) {
 434                    /* Overflow.  Return the maximum normal.  */
 435                    flags = float_flag_invalid;
 436                    exp = exp_max;
 437                    frac = -1;
 438                }
 439            } else if (unlikely(exp >= exp_max)) {
 440                flags |= float_flag_overflow | float_flag_inexact;
 441                if (overflow_norm) {
 442                    exp = exp_max - 1;
 443                    frac = -1;
 444                } else {
 445                    p.cls = float_class_inf;
 446                    goto do_inf;
 447                }
 448            }
 449        } else if (s->flush_to_zero) {
 450            flags |= float_flag_output_denormal;
 451            p.cls = float_class_zero;
 452            goto do_zero;
 453        } else {
 454            bool is_tiny = (s->float_detect_tininess
 455                            == float_tininess_before_rounding)
 456                        || (exp < 0)
 457                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 458
 459            shift64RightJamming(frac, 1 - exp, &frac);
 460            if (frac & round_mask) {
 461                /* Need to recompute round-to-even.  */
 462                if (s->float_rounding_mode == float_round_nearest_even) {
 463                    inc = ((frac & roundeven_mask) != frac_lsbm1
 464                           ? frac_lsbm1 : 0);
 465                }
 466                flags |= float_flag_inexact;
 467                frac += inc;
 468            }
 469
 470            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 471            frac >>= frac_shift;
 472
 473            if (is_tiny && (flags & float_flag_inexact)) {
 474                flags |= float_flag_underflow;
 475            }
 476            if (exp == 0 && frac == 0) {
 477                p.cls = float_class_zero;
 478            }
 479        }
 480        break;
 481
 482    case float_class_zero:
 483    do_zero:
 484        exp = 0;
 485        frac = 0;
 486        break;
 487
 488    case float_class_inf:
 489    do_inf:
 490        assert(!parm->arm_althp);
 491        exp = exp_max;
 492        frac = 0;
 493        break;
 494
 495    case float_class_qnan:
 496    case float_class_snan:
 497        assert(!parm->arm_althp);
 498        exp = exp_max;
 499        frac >>= parm->frac_shift;
 500        break;
 501
 502    default:
 503        g_assert_not_reached();
 504    }
 505
 506    float_raise(flags, s);
 507    p.exp = exp;
 508    p.frac = frac;
 509    return p;
 510}
 511
 512/* Explicit FloatFmt version */
 513static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 514                                            const FloatFmt *params)
 515{
 516    return canonicalize(float16_unpack_raw(f), params, s);
 517}
 518
 519static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 520{
 521    return float16a_unpack_canonical(f, s, &float16_params);
 522}
 523
 524static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 525                                             const FloatFmt *params)
 526{
 527    return float16_pack_raw(round_canonical(p, s, params));
 528}
 529
 530static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 531{
 532    return float16a_round_pack_canonical(p, s, &float16_params);
 533}
 534
 535static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 536{
 537    return canonicalize(float32_unpack_raw(f), &float32_params, s);
 538}
 539
 540static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 541{
 542    return float32_pack_raw(round_canonical(p, s, &float32_params));
 543}
 544
 545static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 546{
 547    return canonicalize(float64_unpack_raw(f), &float64_params, s);
 548}
 549
 550static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 551{
 552    return float64_pack_raw(round_canonical(p, s, &float64_params));
 553}
 554
 555static FloatParts return_nan(FloatParts a, float_status *s)
 556{
 557    switch (a.cls) {
 558    case float_class_snan:
 559        s->float_exception_flags |= float_flag_invalid;
 560        a = parts_silence_nan(a, s);
 561        /* fall through */
 562    case float_class_qnan:
 563        if (s->default_nan_mode) {
 564            return parts_default_nan(s);
 565        }
 566        break;
 567
 568    default:
 569        g_assert_not_reached();
 570    }
 571    return a;
 572}
 573
 574static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 575{
 576    if (is_snan(a.cls) || is_snan(b.cls)) {
 577        s->float_exception_flags |= float_flag_invalid;
 578    }
 579
 580    if (s->default_nan_mode) {
 581        return parts_default_nan(s);
 582    } else {
 583        if (pickNaN(a.cls, b.cls,
 584                    a.frac > b.frac ||
 585                    (a.frac == b.frac && a.sign < b.sign))) {
 586            a = b;
 587        }
 588        if (is_snan(a.cls)) {
 589            return parts_silence_nan(a, s);
 590        }
 591    }
 592    return a;
 593}
 594
 595static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 596                                  bool inf_zero, float_status *s)
 597{
 598    int which;
 599
 600    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 601        s->float_exception_flags |= float_flag_invalid;
 602    }
 603
 604    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 605
 606    if (s->default_nan_mode) {
 607        /* Note that this check is after pickNaNMulAdd so that function
 608         * has an opportunity to set the Invalid flag.
 609         */
 610        which = 3;
 611    }
 612
 613    switch (which) {
 614    case 0:
 615        break;
 616    case 1:
 617        a = b;
 618        break;
 619    case 2:
 620        a = c;
 621        break;
 622    case 3:
 623        return parts_default_nan(s);
 624    default:
 625        g_assert_not_reached();
 626    }
 627
 628    if (is_snan(a.cls)) {
 629        return parts_silence_nan(a, s);
 630    }
 631    return a;
 632}
 633
 634/*
 635 * Returns the result of adding or subtracting the values of the
 636 * floating-point values `a' and `b'. The operation is performed
 637 * according to the IEC/IEEE Standard for Binary Floating-Point
 638 * Arithmetic.
 639 */
 640
 641static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 642                                float_status *s)
 643{
 644    bool a_sign = a.sign;
 645    bool b_sign = b.sign ^ subtract;
 646
 647    if (a_sign != b_sign) {
 648        /* Subtraction */
 649
 650        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 651            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 652                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 653                a.frac = a.frac - b.frac;
 654            } else {
 655                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 656                a.frac = b.frac - a.frac;
 657                a.exp = b.exp;
 658                a_sign ^= 1;
 659            }
 660
 661            if (a.frac == 0) {
 662                a.cls = float_class_zero;
 663                a.sign = s->float_rounding_mode == float_round_down;
 664            } else {
 665                int shift = clz64(a.frac) - 1;
 666                a.frac = a.frac << shift;
 667                a.exp = a.exp - shift;
 668                a.sign = a_sign;
 669            }
 670            return a;
 671        }
 672        if (is_nan(a.cls) || is_nan(b.cls)) {
 673            return pick_nan(a, b, s);
 674        }
 675        if (a.cls == float_class_inf) {
 676            if (b.cls == float_class_inf) {
 677                float_raise(float_flag_invalid, s);
 678                return parts_default_nan(s);
 679            }
 680            return a;
 681        }
 682        if (a.cls == float_class_zero && b.cls == float_class_zero) {
 683            a.sign = s->float_rounding_mode == float_round_down;
 684            return a;
 685        }
 686        if (a.cls == float_class_zero || b.cls == float_class_inf) {
 687            b.sign = a_sign ^ 1;
 688            return b;
 689        }
 690        if (b.cls == float_class_zero) {
 691            return a;
 692        }
 693    } else {
 694        /* Addition */
 695        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 696            if (a.exp > b.exp) {
 697                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 698            } else if (a.exp < b.exp) {
 699                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 700                a.exp = b.exp;
 701            }
 702            a.frac += b.frac;
 703            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 704                a.frac >>= 1;
 705                a.exp += 1;
 706            }
 707            return a;
 708        }
 709        if (is_nan(a.cls) || is_nan(b.cls)) {
 710            return pick_nan(a, b, s);
 711        }
 712        if (a.cls == float_class_inf || b.cls == float_class_zero) {
 713            return a;
 714        }
 715        if (b.cls == float_class_inf || a.cls == float_class_zero) {
 716            b.sign = b_sign;
 717            return b;
 718        }
 719    }
 720    g_assert_not_reached();
 721}
 722
 723/*
 724 * Returns the result of adding or subtracting the floating-point
 725 * values `a' and `b'. The operation is performed according to the
 726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 727 */
 728
 729float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 730                                              float_status *status)
 731{
 732    FloatParts pa = float16_unpack_canonical(a, status);
 733    FloatParts pb = float16_unpack_canonical(b, status);
 734    FloatParts pr = addsub_floats(pa, pb, false, status);
 735
 736    return float16_round_pack_canonical(pr, status);
 737}
 738
 739float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 740                                             float_status *status)
 741{
 742    FloatParts pa = float32_unpack_canonical(a, status);
 743    FloatParts pb = float32_unpack_canonical(b, status);
 744    FloatParts pr = addsub_floats(pa, pb, false, status);
 745
 746    return float32_round_pack_canonical(pr, status);
 747}
 748
 749float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 750                                             float_status *status)
 751{
 752    FloatParts pa = float64_unpack_canonical(a, status);
 753    FloatParts pb = float64_unpack_canonical(b, status);
 754    FloatParts pr = addsub_floats(pa, pb, false, status);
 755
 756    return float64_round_pack_canonical(pr, status);
 757}
 758
 759float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 760                                             float_status *status)
 761{
 762    FloatParts pa = float16_unpack_canonical(a, status);
 763    FloatParts pb = float16_unpack_canonical(b, status);
 764    FloatParts pr = addsub_floats(pa, pb, true, status);
 765
 766    return float16_round_pack_canonical(pr, status);
 767}
 768
 769float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 770                                             float_status *status)
 771{
 772    FloatParts pa = float32_unpack_canonical(a, status);
 773    FloatParts pb = float32_unpack_canonical(b, status);
 774    FloatParts pr = addsub_floats(pa, pb, true, status);
 775
 776    return float32_round_pack_canonical(pr, status);
 777}
 778
 779float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 780                                             float_status *status)
 781{
 782    FloatParts pa = float64_unpack_canonical(a, status);
 783    FloatParts pb = float64_unpack_canonical(b, status);
 784    FloatParts pr = addsub_floats(pa, pb, true, status);
 785
 786    return float64_round_pack_canonical(pr, status);
 787}
 788
 789/*
 790 * Returns the result of multiplying the floating-point values `a' and
 791 * `b'. The operation is performed according to the IEC/IEEE Standard
 792 * for Binary Floating-Point Arithmetic.
 793 */
 794
 795static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 796{
 797    bool sign = a.sign ^ b.sign;
 798
 799    if (a.cls == float_class_normal && b.cls == float_class_normal) {
 800        uint64_t hi, lo;
 801        int exp = a.exp + b.exp;
 802
 803        mul64To128(a.frac, b.frac, &hi, &lo);
 804        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 805        if (lo & DECOMPOSED_OVERFLOW_BIT) {
 806            shift64RightJamming(lo, 1, &lo);
 807            exp += 1;
 808        }
 809
 810        /* Re-use a */
 811        a.exp = exp;
 812        a.sign = sign;
 813        a.frac = lo;
 814        return a;
 815    }
 816    /* handle all the NaN cases */
 817    if (is_nan(a.cls) || is_nan(b.cls)) {
 818        return pick_nan(a, b, s);
 819    }
 820    /* Inf * Zero == NaN */
 821    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 822        (a.cls == float_class_zero && b.cls == float_class_inf)) {
 823        s->float_exception_flags |= float_flag_invalid;
 824        return parts_default_nan(s);
 825    }
 826    /* Multiply by 0 or Inf */
 827    if (a.cls == float_class_inf || a.cls == float_class_zero) {
 828        a.sign = sign;
 829        return a;
 830    }
 831    if (b.cls == float_class_inf || b.cls == float_class_zero) {
 832        b.sign = sign;
 833        return b;
 834    }
 835    g_assert_not_reached();
 836}
 837
 838float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 839                                             float_status *status)
 840{
 841    FloatParts pa = float16_unpack_canonical(a, status);
 842    FloatParts pb = float16_unpack_canonical(b, status);
 843    FloatParts pr = mul_floats(pa, pb, status);
 844
 845    return float16_round_pack_canonical(pr, status);
 846}
 847
 848float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 849                                             float_status *status)
 850{
 851    FloatParts pa = float32_unpack_canonical(a, status);
 852    FloatParts pb = float32_unpack_canonical(b, status);
 853    FloatParts pr = mul_floats(pa, pb, status);
 854
 855    return float32_round_pack_canonical(pr, status);
 856}
 857
 858float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 859                                             float_status *status)
 860{
 861    FloatParts pa = float64_unpack_canonical(a, status);
 862    FloatParts pb = float64_unpack_canonical(b, status);
 863    FloatParts pr = mul_floats(pa, pb, status);
 864
 865    return float64_round_pack_canonical(pr, status);
 866}
 867
 868/*
 869 * Returns the result of multiplying the floating-point values `a' and
 870 * `b' then adding 'c', with no intermediate rounding step after the
 871 * multiplication. The operation is performed according to the
 872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 873 * The flags argument allows the caller to select negation of the
 874 * addend, the intermediate product, or the final result. (The
 875 * difference between this and having the caller do a separate
 876 * negation is that negating externally will flip the sign bit on
 877 * NaNs.)
 878 */
 879
 880static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 881                                int flags, float_status *s)
 882{
 883    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 884                    ((1 << float_class_inf) | (1 << float_class_zero));
 885    bool p_sign;
 886    bool sign_flip = flags & float_muladd_negate_result;
 887    FloatClass p_class;
 888    uint64_t hi, lo;
 889    int p_exp;
 890
 891    /* It is implementation-defined whether the cases of (0,inf,qnan)
 892     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 893     * they return if they do), so we have to hand this information
 894     * off to the target-specific pick-a-NaN routine.
 895     */
 896    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 897        return pick_nan_muladd(a, b, c, inf_zero, s);
 898    }
 899
 900    if (inf_zero) {
 901        s->float_exception_flags |= float_flag_invalid;
 902        return parts_default_nan(s);
 903    }
 904
 905    if (flags & float_muladd_negate_c) {
 906        c.sign ^= 1;
 907    }
 908
 909    p_sign = a.sign ^ b.sign;
 910
 911    if (flags & float_muladd_negate_product) {
 912        p_sign ^= 1;
 913    }
 914
 915    if (a.cls == float_class_inf || b.cls == float_class_inf) {
 916        p_class = float_class_inf;
 917    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 918        p_class = float_class_zero;
 919    } else {
 920        p_class = float_class_normal;
 921    }
 922
 923    if (c.cls == float_class_inf) {
 924        if (p_class == float_class_inf && p_sign != c.sign) {
 925            s->float_exception_flags |= float_flag_invalid;
 926            return parts_default_nan(s);
 927        } else {
 928            a.cls = float_class_inf;
 929            a.sign = c.sign ^ sign_flip;
 930            return a;
 931        }
 932    }
 933
 934    if (p_class == float_class_inf) {
 935        a.cls = float_class_inf;
 936        a.sign = p_sign ^ sign_flip;
 937        return a;
 938    }
 939
 940    if (p_class == float_class_zero) {
 941        if (c.cls == float_class_zero) {
 942            if (p_sign != c.sign) {
 943                p_sign = s->float_rounding_mode == float_round_down;
 944            }
 945            c.sign = p_sign;
 946        } else if (flags & float_muladd_halve_result) {
 947            c.exp -= 1;
 948        }
 949        c.sign ^= sign_flip;
 950        return c;
 951    }
 952
 953    /* a & b should be normals now... */
 954    assert(a.cls == float_class_normal &&
 955           b.cls == float_class_normal);
 956
 957    p_exp = a.exp + b.exp;
 958
 959    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 960     * result.
 961     */
 962    mul64To128(a.frac, b.frac, &hi, &lo);
 963    /* binary point now at bit 124 */
 964
 965    /* check for overflow */
 966    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 967        shift128RightJamming(hi, lo, 1, &hi, &lo);
 968        p_exp += 1;
 969    }
 970
 971    /* + add/sub */
 972    if (c.cls == float_class_zero) {
 973        /* move binary point back to 62 */
 974        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 975    } else {
 976        int exp_diff = p_exp - c.exp;
 977        if (p_sign == c.sign) {
 978            /* Addition */
 979            if (exp_diff <= 0) {
 980                shift128RightJamming(hi, lo,
 981                                     DECOMPOSED_BINARY_POINT - exp_diff,
 982                                     &hi, &lo);
 983                lo += c.frac;
 984                p_exp = c.exp;
 985            } else {
 986                uint64_t c_hi, c_lo;
 987                /* shift c to the same binary point as the product (124) */
 988                c_hi = c.frac >> 2;
 989                c_lo = 0;
 990                shift128RightJamming(c_hi, c_lo,
 991                                     exp_diff,
 992                                     &c_hi, &c_lo);
 993                add128(hi, lo, c_hi, c_lo, &hi, &lo);
 994                /* move binary point back to 62 */
 995                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 996            }
 997
 998            if (lo & DECOMPOSED_OVERFLOW_BIT) {
 999                shift64RightJamming(lo, 1, &lo);
1000                p_exp += 1;
1001            }
1002
1003        } else {
1004            /* Subtraction */
1005            uint64_t c_hi, c_lo;
1006            /* make C binary point match product at bit 124 */
1007            c_hi = c.frac >> 2;
1008            c_lo = 0;
1009
1010            if (exp_diff <= 0) {
1011                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012                if (exp_diff == 0
1013                    &&
1014                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016                } else {
1017                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018                    p_sign ^= 1;
1019                    p_exp = c.exp;
1020                }
1021            } else {
1022                shift128RightJamming(c_hi, c_lo,
1023                                     exp_diff,
1024                                     &c_hi, &c_lo);
1025                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026            }
1027
1028            if (hi == 0 && lo == 0) {
1029                a.cls = float_class_zero;
1030                a.sign = s->float_rounding_mode == float_round_down;
1031                a.sign ^= sign_flip;
1032                return a;
1033            } else {
1034                int shift;
1035                if (hi != 0) {
1036                    shift = clz64(hi);
1037                } else {
1038                    shift = clz64(lo) + 64;
1039                }
1040                /* Normalizing to a binary point of 124 is the
1041                   correct adjust for the exponent.  However since we're
1042                   shifting, we might as well put the binary point back
1043                   at 62 where we really want it.  Therefore shift as
1044                   if we're leaving 1 bit at the top of the word, but
1045                   adjust the exponent as if we're leaving 3 bits.  */
1046                shift -= 1;
1047                if (shift >= 64) {
1048                    lo = lo << (shift - 64);
1049                } else {
1050                    hi = (hi << shift) | (lo >> (64 - shift));
1051                    lo = hi | ((lo << shift) != 0);
1052                }
1053                p_exp -= shift - 2;
1054            }
1055        }
1056    }
1057
1058    if (flags & float_muladd_halve_result) {
1059        p_exp -= 1;
1060    }
1061
1062    /* finally prepare our result */
1063    a.cls = float_class_normal;
1064    a.sign = p_sign ^ sign_flip;
1065    a.exp = p_exp;
1066    a.frac = lo;
1067
1068    return a;
1069}
1070
1071float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072                                                int flags, float_status *status)
1073{
1074    FloatParts pa = float16_unpack_canonical(a, status);
1075    FloatParts pb = float16_unpack_canonical(b, status);
1076    FloatParts pc = float16_unpack_canonical(c, status);
1077    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1078
1079    return float16_round_pack_canonical(pr, status);
1080}
1081
1082float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083                                                int flags, float_status *status)
1084{
1085    FloatParts pa = float32_unpack_canonical(a, status);
1086    FloatParts pb = float32_unpack_canonical(b, status);
1087    FloatParts pc = float32_unpack_canonical(c, status);
1088    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1089
1090    return float32_round_pack_canonical(pr, status);
1091}
1092
1093float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094                                                int flags, float_status *status)
1095{
1096    FloatParts pa = float64_unpack_canonical(a, status);
1097    FloatParts pb = float64_unpack_canonical(b, status);
1098    FloatParts pc = float64_unpack_canonical(c, status);
1099    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1100
1101    return float64_round_pack_canonical(pr, status);
1102}
1103
1104/*
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108 */
1109
1110static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1111{
1112    bool sign = a.sign ^ b.sign;
1113
1114    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115        uint64_t temp_lo, temp_hi;
1116        int exp = a.exp - b.exp;
1117        if (a.frac < b.frac) {
1118            exp -= 1;
1119            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120                              &temp_hi, &temp_lo);
1121        } else {
1122            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123                              &temp_hi, &temp_lo);
1124        }
1125        /* LSB of quot is set if inexact which roundandpack will use
1126         * to set flags. Yet again we re-use a for the result */
1127        a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128        a.sign = sign;
1129        a.exp = exp;
1130        return a;
1131    }
1132    /* handle all the NaN cases */
1133    if (is_nan(a.cls) || is_nan(b.cls)) {
1134        return pick_nan(a, b, s);
1135    }
1136    /* 0/0 or Inf/Inf */
1137    if (a.cls == b.cls
1138        &&
1139        (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140        s->float_exception_flags |= float_flag_invalid;
1141        return parts_default_nan(s);
1142    }
1143    /* Inf / x or 0 / x */
1144    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145        a.sign = sign;
1146        return a;
1147    }
1148    /* Div 0 => Inf */
1149    if (b.cls == float_class_zero) {
1150        s->float_exception_flags |= float_flag_divbyzero;
1151        a.cls = float_class_inf;
1152        a.sign = sign;
1153        return a;
1154    }
1155    /* Div by Inf */
1156    if (b.cls == float_class_inf) {
1157        a.cls = float_class_zero;
1158        a.sign = sign;
1159        return a;
1160    }
1161    g_assert_not_reached();
1162}
1163
1164float16 float16_div(float16 a, float16 b, float_status *status)
1165{
1166    FloatParts pa = float16_unpack_canonical(a, status);
1167    FloatParts pb = float16_unpack_canonical(b, status);
1168    FloatParts pr = div_floats(pa, pb, status);
1169
1170    return float16_round_pack_canonical(pr, status);
1171}
1172
1173float32 float32_div(float32 a, float32 b, float_status *status)
1174{
1175    FloatParts pa = float32_unpack_canonical(a, status);
1176    FloatParts pb = float32_unpack_canonical(b, status);
1177    FloatParts pr = div_floats(pa, pb, status);
1178
1179    return float32_round_pack_canonical(pr, status);
1180}
1181
1182float64 float64_div(float64 a, float64 b, float_status *status)
1183{
1184    FloatParts pa = float64_unpack_canonical(a, status);
1185    FloatParts pb = float64_unpack_canonical(b, status);
1186    FloatParts pr = div_floats(pa, pb, status);
1187
1188    return float64_round_pack_canonical(pr, status);
1189}
1190
1191/*
1192 * Float to Float conversions
1193 *
1194 * Returns the result of converting one float format to another. The
1195 * conversion is performed according to the IEC/IEEE Standard for
1196 * Binary Floating-Point Arithmetic.
1197 *
1198 * The float_to_float helper only needs to take care of raising
1199 * invalid exceptions and handling the conversion on NaNs.
1200 */
1201
1202static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203                                 float_status *s)
1204{
1205    if (dstf->arm_althp) {
1206        switch (a.cls) {
1207        case float_class_qnan:
1208        case float_class_snan:
1209            /* There is no NaN in the destination format.  Raise Invalid
1210             * and return a zero with the sign of the input NaN.
1211             */
1212            s->float_exception_flags |= float_flag_invalid;
1213            a.cls = float_class_zero;
1214            a.frac = 0;
1215            a.exp = 0;
1216            break;
1217
1218        case float_class_inf:
1219            /* There is no Inf in the destination format.  Raise Invalid
1220             * and return the maximum normal with the correct sign.
1221             */
1222            s->float_exception_flags |= float_flag_invalid;
1223            a.cls = float_class_normal;
1224            a.exp = dstf->exp_max;
1225            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226            break;
1227
1228        default:
1229            break;
1230        }
1231    } else if (is_nan(a.cls)) {
1232        if (is_snan(a.cls)) {
1233            s->float_exception_flags |= float_flag_invalid;
1234            a = parts_silence_nan(a, s);
1235        }
1236        if (s->default_nan_mode) {
1237            return parts_default_nan(s);
1238        }
1239    }
1240    return a;
1241}
1242
1243float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1244{
1245    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247    FloatParts pr = float_to_float(p, &float32_params, s);
1248    return float32_round_pack_canonical(pr, s);
1249}
1250
1251float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1252{
1253    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255    FloatParts pr = float_to_float(p, &float64_params, s);
1256    return float64_round_pack_canonical(pr, s);
1257}
1258
1259float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1260{
1261    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262    FloatParts p = float32_unpack_canonical(a, s);
1263    FloatParts pr = float_to_float(p, fmt16, s);
1264    return float16a_round_pack_canonical(pr, s, fmt16);
1265}
1266
1267float64 float32_to_float64(float32 a, float_status *s)
1268{
1269    FloatParts p = float32_unpack_canonical(a, s);
1270    FloatParts pr = float_to_float(p, &float64_params, s);
1271    return float64_round_pack_canonical(pr, s);
1272}
1273
1274float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1275{
1276    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277    FloatParts p = float64_unpack_canonical(a, s);
1278    FloatParts pr = float_to_float(p, fmt16, s);
1279    return float16a_round_pack_canonical(pr, s, fmt16);
1280}
1281
1282float32 float64_to_float32(float64 a, float_status *s)
1283{
1284    FloatParts p = float64_unpack_canonical(a, s);
1285    FloatParts pr = float_to_float(p, &float32_params, s);
1286    return float32_round_pack_canonical(pr, s);
1287}
1288
1289/*
1290 * Rounds the floating-point value `a' to an integer, and returns the
1291 * result as a floating-point value. The operation is performed
1292 * according to the IEC/IEEE Standard for Binary Floating-Point
1293 * Arithmetic.
1294 */
1295
1296static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1297{
1298    if (is_nan(a.cls)) {
1299        return return_nan(a, s);
1300    }
1301
1302    switch (a.cls) {
1303    case float_class_zero:
1304    case float_class_inf:
1305    case float_class_qnan:
1306        /* already "integral" */
1307        break;
1308    case float_class_normal:
1309        if (a.exp >= DECOMPOSED_BINARY_POINT) {
1310            /* already integral */
1311            break;
1312        }
1313        if (a.exp < 0) {
1314            bool one;
1315            /* all fractional */
1316            s->float_exception_flags |= float_flag_inexact;
1317            switch (rounding_mode) {
1318            case float_round_nearest_even:
1319                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1320                break;
1321            case float_round_ties_away:
1322                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1323                break;
1324            case float_round_to_zero:
1325                one = false;
1326                break;
1327            case float_round_up:
1328                one = !a.sign;
1329                break;
1330            case float_round_down:
1331                one = a.sign;
1332                break;
1333            default:
1334                g_assert_not_reached();
1335            }
1336
1337            if (one) {
1338                a.frac = DECOMPOSED_IMPLICIT_BIT;
1339                a.exp = 0;
1340            } else {
1341                a.cls = float_class_zero;
1342            }
1343        } else {
1344            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1345            uint64_t frac_lsbm1 = frac_lsb >> 1;
1346            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1347            uint64_t rnd_mask = rnd_even_mask >> 1;
1348            uint64_t inc;
1349
1350            switch (rounding_mode) {
1351            case float_round_nearest_even:
1352                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1353                break;
1354            case float_round_ties_away:
1355                inc = frac_lsbm1;
1356                break;
1357            case float_round_to_zero:
1358                inc = 0;
1359                break;
1360            case float_round_up:
1361                inc = a.sign ? 0 : rnd_mask;
1362                break;
1363            case float_round_down:
1364                inc = a.sign ? rnd_mask : 0;
1365                break;
1366            default:
1367                g_assert_not_reached();
1368            }
1369
1370            if (a.frac & rnd_mask) {
1371                s->float_exception_flags |= float_flag_inexact;
1372                a.frac += inc;
1373                a.frac &= ~rnd_mask;
1374                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1375                    a.frac >>= 1;
1376                    a.exp++;
1377                }
1378            }
1379        }
1380        break;
1381    default:
1382        g_assert_not_reached();
1383    }
1384    return a;
1385}
1386
1387float16 float16_round_to_int(float16 a, float_status *s)
1388{
1389    FloatParts pa = float16_unpack_canonical(a, s);
1390    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1391    return float16_round_pack_canonical(pr, s);
1392}
1393
1394float32 float32_round_to_int(float32 a, float_status *s)
1395{
1396    FloatParts pa = float32_unpack_canonical(a, s);
1397    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1398    return float32_round_pack_canonical(pr, s);
1399}
1400
1401float64 float64_round_to_int(float64 a, float_status *s)
1402{
1403    FloatParts pa = float64_unpack_canonical(a, s);
1404    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1405    return float64_round_pack_canonical(pr, s);
1406}
1407
1408float64 float64_trunc_to_int(float64 a, float_status *s)
1409{
1410    FloatParts pa = float64_unpack_canonical(a, s);
1411    FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1412    return float64_round_pack_canonical(pr, s);
1413}
1414
1415/*
1416 * Returns the result of converting the floating-point value `a' to
1417 * the two's complement integer format. The conversion is performed
1418 * according to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest positive integer is returned. Otherwise, if the
1422 * conversion overflows, the largest integer with the same sign as `a'
1423 * is returned.
1424*/
1425
1426static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1427                                     int64_t min, int64_t max,
1428                                     float_status *s)
1429{
1430    uint64_t r;
1431    int orig_flags = get_float_exception_flags(s);
1432    FloatParts p = round_to_int(in, rmode, s);
1433
1434    switch (p.cls) {
1435    case float_class_snan:
1436    case float_class_qnan:
1437        s->float_exception_flags = orig_flags | float_flag_invalid;
1438        return max;
1439    case float_class_inf:
1440        s->float_exception_flags = orig_flags | float_flag_invalid;
1441        return p.sign ? min : max;
1442    case float_class_zero:
1443        return 0;
1444    case float_class_normal:
1445        if (p.exp < DECOMPOSED_BINARY_POINT) {
1446            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1447        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1448            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449        } else {
1450            r = UINT64_MAX;
1451        }
1452        if (p.sign) {
1453            if (r <= -(uint64_t) min) {
1454                return -r;
1455            } else {
1456                s->float_exception_flags = orig_flags | float_flag_invalid;
1457                return min;
1458            }
1459        } else {
1460            if (r <= max) {
1461                return r;
1462            } else {
1463                s->float_exception_flags = orig_flags | float_flag_invalid;
1464                return max;
1465            }
1466        }
1467    default:
1468        g_assert_not_reached();
1469    }
1470}
1471
1472#define FLOAT_TO_INT(fsz, isz)                                          \
1473int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1474                                                float_status *s)        \
1475{                                                                       \
1476    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477    return round_to_int_and_pack(p, s->float_rounding_mode,             \
1478                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1479                                 s);                                    \
1480}                                                                       \
1481                                                                        \
1482int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1483 (float ## fsz a, float_status *s)                                      \
1484{                                                                       \
1485    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1486    return round_to_int_and_pack(p, float_round_to_zero,                \
1487                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1488                                 s);                                    \
1489}
1490
1491FLOAT_TO_INT(16, 16)
1492FLOAT_TO_INT(16, 32)
1493FLOAT_TO_INT(16, 64)
1494
1495FLOAT_TO_INT(32, 16)
1496FLOAT_TO_INT(32, 32)
1497FLOAT_TO_INT(32, 64)
1498
1499FLOAT_TO_INT(64, 16)
1500FLOAT_TO_INT(64, 32)
1501FLOAT_TO_INT(64, 64)
1502
1503#undef FLOAT_TO_INT
1504
1505/*
1506 *  Returns the result of converting the floating-point value `a' to
1507 *  the unsigned integer format. The conversion is performed according
1508 *  to the IEC/IEEE Standard for Binary Floating-Point
1509 *  Arithmetic---which means in particular that the conversion is
1510 *  rounded according to the current rounding mode. If `a' is a NaN,
1511 *  the largest unsigned integer is returned. Otherwise, if the
1512 *  conversion overflows, the largest unsigned integer is returned. If
1513 *  the 'a' is negative, the result is rounded and zero is returned;
1514 *  values that do not round to zero will raise the inexact exception
1515 *  flag.
1516 */
1517
1518static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1519                                       float_status *s)
1520{
1521    int orig_flags = get_float_exception_flags(s);
1522    FloatParts p = round_to_int(in, rmode, s);
1523
1524    switch (p.cls) {
1525    case float_class_snan:
1526    case float_class_qnan:
1527        s->float_exception_flags = orig_flags | float_flag_invalid;
1528        return max;
1529    case float_class_inf:
1530        s->float_exception_flags = orig_flags | float_flag_invalid;
1531        return p.sign ? 0 : max;
1532    case float_class_zero:
1533        return 0;
1534    case float_class_normal:
1535    {
1536        uint64_t r;
1537        if (p.sign) {
1538            s->float_exception_flags = orig_flags | float_flag_invalid;
1539            return 0;
1540        }
1541
1542        if (p.exp < DECOMPOSED_BINARY_POINT) {
1543            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1544        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1545            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1546        } else {
1547            s->float_exception_flags = orig_flags | float_flag_invalid;
1548            return max;
1549        }
1550
1551        /* For uint64 this will never trip, but if p.exp is too large
1552         * to shift a decomposed fraction we shall have exited via the
1553         * 3rd leg above.
1554         */
1555        if (r > max) {
1556            s->float_exception_flags = orig_flags | float_flag_invalid;
1557            return max;
1558        } else {
1559            return r;
1560        }
1561    }
1562    default:
1563        g_assert_not_reached();
1564    }
1565}
1566
1567#define FLOAT_TO_UINT(fsz, isz) \
1568uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1569                                                  float_status *s)      \
1570{                                                                       \
1571    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1572    return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1573                                 UINT ## isz ## _MAX, s);               \
1574}                                                                       \
1575                                                                        \
1576uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1577 (float ## fsz a, float_status *s)                                      \
1578{                                                                       \
1579    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1580    return round_to_uint_and_pack(p, float_round_to_zero,               \
1581                                  UINT ## isz ## _MAX, s);              \
1582}
1583
1584FLOAT_TO_UINT(16, 16)
1585FLOAT_TO_UINT(16, 32)
1586FLOAT_TO_UINT(16, 64)
1587
1588FLOAT_TO_UINT(32, 16)
1589FLOAT_TO_UINT(32, 32)
1590FLOAT_TO_UINT(32, 64)
1591
1592FLOAT_TO_UINT(64, 16)
1593FLOAT_TO_UINT(64, 32)
1594FLOAT_TO_UINT(64, 64)
1595
1596#undef FLOAT_TO_UINT
1597
1598/*
1599 * Integer to float conversions
1600 *
1601 * Returns the result of converting the two's complement integer `a'
1602 * to the floating-point format. The conversion is performed according
1603 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604 */
1605
1606static FloatParts int_to_float(int64_t a, float_status *status)
1607{
1608    FloatParts r = {};
1609    if (a == 0) {
1610        r.cls = float_class_zero;
1611        r.sign = false;
1612    } else if (a == (1ULL << 63)) {
1613        r.cls = float_class_normal;
1614        r.sign = true;
1615        r.frac = DECOMPOSED_IMPLICIT_BIT;
1616        r.exp = 63;
1617    } else {
1618        uint64_t f;
1619        if (a < 0) {
1620            f = -a;
1621            r.sign = true;
1622        } else {
1623            f = a;
1624            r.sign = false;
1625        }
1626        int shift = clz64(f) - 1;
1627        r.cls = float_class_normal;
1628        r.exp = (DECOMPOSED_BINARY_POINT - shift);
1629        r.frac = f << shift;
1630    }
1631
1632    return r;
1633}
1634
1635float16 int64_to_float16(int64_t a, float_status *status)
1636{
1637    FloatParts pa = int_to_float(a, status);
1638    return float16_round_pack_canonical(pa, status);
1639}
1640
1641float16 int32_to_float16(int32_t a, float_status *status)
1642{
1643    return int64_to_float16(a, status);
1644}
1645
1646float16 int16_to_float16(int16_t a, float_status *status)
1647{
1648    return int64_to_float16(a, status);
1649}
1650
1651float32 int64_to_float32(int64_t a, float_status *status)
1652{
1653    FloatParts pa = int_to_float(a, status);
1654    return float32_round_pack_canonical(pa, status);
1655}
1656
1657float32 int32_to_float32(int32_t a, float_status *status)
1658{
1659    return int64_to_float32(a, status);
1660}
1661
1662float32 int16_to_float32(int16_t a, float_status *status)
1663{
1664    return int64_to_float32(a, status);
1665}
1666
1667float64 int64_to_float64(int64_t a, float_status *status)
1668{
1669    FloatParts pa = int_to_float(a, status);
1670    return float64_round_pack_canonical(pa, status);
1671}
1672
1673float64 int32_to_float64(int32_t a, float_status *status)
1674{
1675    return int64_to_float64(a, status);
1676}
1677
1678float64 int16_to_float64(int16_t a, float_status *status)
1679{
1680    return int64_to_float64(a, status);
1681}
1682
1683
1684/*
1685 * Unsigned Integer to float conversions
1686 *
1687 * Returns the result of converting the unsigned integer `a' to the
1688 * floating-point format. The conversion is performed according to the
1689 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1690 */
1691
1692static FloatParts uint_to_float(uint64_t a, float_status *status)
1693{
1694    FloatParts r = { .sign = false};
1695
1696    if (a == 0) {
1697        r.cls = float_class_zero;
1698    } else {
1699        int spare_bits = clz64(a) - 1;
1700        r.cls = float_class_normal;
1701        r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1702        if (spare_bits < 0) {
1703            shift64RightJamming(a, -spare_bits, &a);
1704            r.frac = a;
1705        } else {
1706            r.frac = a << spare_bits;
1707        }
1708    }
1709
1710    return r;
1711}
1712
1713float16 uint64_to_float16(uint64_t a, float_status *status)
1714{
1715    FloatParts pa = uint_to_float(a, status);
1716    return float16_round_pack_canonical(pa, status);
1717}
1718
1719float16 uint32_to_float16(uint32_t a, float_status *status)
1720{
1721    return uint64_to_float16(a, status);
1722}
1723
1724float16 uint16_to_float16(uint16_t a, float_status *status)
1725{
1726    return uint64_to_float16(a, status);
1727}
1728
1729float32 uint64_to_float32(uint64_t a, float_status *status)
1730{
1731    FloatParts pa = uint_to_float(a, status);
1732    return float32_round_pack_canonical(pa, status);
1733}
1734
1735float32 uint32_to_float32(uint32_t a, float_status *status)
1736{
1737    return uint64_to_float32(a, status);
1738}
1739
1740float32 uint16_to_float32(uint16_t a, float_status *status)
1741{
1742    return uint64_to_float32(a, status);
1743}
1744
1745float64 uint64_to_float64(uint64_t a, float_status *status)
1746{
1747    FloatParts pa = uint_to_float(a, status);
1748    return float64_round_pack_canonical(pa, status);
1749}
1750
1751float64 uint32_to_float64(uint32_t a, float_status *status)
1752{
1753    return uint64_to_float64(a, status);
1754}
1755
1756float64 uint16_to_float64(uint16_t a, float_status *status)
1757{
1758    return uint64_to_float64(a, status);
1759}
1760
1761/* Float Min/Max */
1762/* min() and max() functions. These can't be implemented as
1763 * 'compare and pick one input' because that would mishandle
1764 * NaNs and +0 vs -0.
1765 *
1766 * minnum() and maxnum() functions. These are similar to the min()
1767 * and max() functions but if one of the arguments is a QNaN and
1768 * the other is numerical then the numerical argument is returned.
1769 * SNaNs will get quietened before being returned.
1770 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1771 * and maxNum() operations. min() and max() are the typical min/max
1772 * semantics provided by many CPUs which predate that specification.
1773 *
1774 * minnummag() and maxnummag() functions correspond to minNumMag()
1775 * and minNumMag() from the IEEE-754 2008.
1776 */
1777static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1778                                bool ieee, bool ismag, float_status *s)
1779{
1780    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1781        if (ieee) {
1782            /* Takes two floating-point values `a' and `b', one of
1783             * which is a NaN, and returns the appropriate NaN
1784             * result. If either `a' or `b' is a signaling NaN,
1785             * the invalid exception is raised.
1786             */
1787            if (is_snan(a.cls) || is_snan(b.cls)) {
1788                return pick_nan(a, b, s);
1789            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1790                return b;
1791            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1792                return a;
1793            }
1794        }
1795        return pick_nan(a, b, s);
1796    } else {
1797        int a_exp, b_exp;
1798
1799        switch (a.cls) {
1800        case float_class_normal:
1801            a_exp = a.exp;
1802            break;
1803        case float_class_inf:
1804            a_exp = INT_MAX;
1805            break;
1806        case float_class_zero:
1807            a_exp = INT_MIN;
1808            break;
1809        default:
1810            g_assert_not_reached();
1811            break;
1812        }
1813        switch (b.cls) {
1814        case float_class_normal:
1815            b_exp = b.exp;
1816            break;
1817        case float_class_inf:
1818            b_exp = INT_MAX;
1819            break;
1820        case float_class_zero:
1821            b_exp = INT_MIN;
1822            break;
1823        default:
1824            g_assert_not_reached();
1825            break;
1826        }
1827
1828        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1829            bool a_less = a_exp < b_exp;
1830            if (a_exp == b_exp) {
1831                a_less = a.frac < b.frac;
1832            }
1833            return a_less ^ ismin ? b : a;
1834        }
1835
1836        if (a.sign == b.sign) {
1837            bool a_less = a_exp < b_exp;
1838            if (a_exp == b_exp) {
1839                a_less = a.frac < b.frac;
1840            }
1841            return a.sign ^ a_less ^ ismin ? b : a;
1842        } else {
1843            return a.sign ^ ismin ? b : a;
1844        }
1845    }
1846}
1847
1848#define MINMAX(sz, name, ismin, isiee, ismag)                           \
1849float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1850                                     float_status *s)                   \
1851{                                                                       \
1852    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1853    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1854    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1855                                                                        \
1856    return float ## sz ## _round_pack_canonical(pr, s);                 \
1857}
1858
1859MINMAX(16, min, true, false, false)
1860MINMAX(16, minnum, true, true, false)
1861MINMAX(16, minnummag, true, true, true)
1862MINMAX(16, max, false, false, false)
1863MINMAX(16, maxnum, false, true, false)
1864MINMAX(16, maxnummag, false, true, true)
1865
1866MINMAX(32, min, true, false, false)
1867MINMAX(32, minnum, true, true, false)
1868MINMAX(32, minnummag, true, true, true)
1869MINMAX(32, max, false, false, false)
1870MINMAX(32, maxnum, false, true, false)
1871MINMAX(32, maxnummag, false, true, true)
1872
1873MINMAX(64, min, true, false, false)
1874MINMAX(64, minnum, true, true, false)
1875MINMAX(64, minnummag, true, true, true)
1876MINMAX(64, max, false, false, false)
1877MINMAX(64, maxnum, false, true, false)
1878MINMAX(64, maxnummag, false, true, true)
1879
1880#undef MINMAX
1881
1882/* Floating point compare */
1883static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1884                          float_status *s)
1885{
1886    if (is_nan(a.cls) || is_nan(b.cls)) {
1887        if (!is_quiet ||
1888            a.cls == float_class_snan ||
1889            b.cls == float_class_snan) {
1890            s->float_exception_flags |= float_flag_invalid;
1891        }
1892        return float_relation_unordered;
1893    }
1894
1895    if (a.cls == float_class_zero) {
1896        if (b.cls == float_class_zero) {
1897            return float_relation_equal;
1898        }
1899        return b.sign ? float_relation_greater : float_relation_less;
1900    } else if (b.cls == float_class_zero) {
1901        return a.sign ? float_relation_less : float_relation_greater;
1902    }
1903
1904    /* The only really important thing about infinity is its sign. If
1905     * both are infinities the sign marks the smallest of the two.
1906     */
1907    if (a.cls == float_class_inf) {
1908        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1909            return float_relation_equal;
1910        }
1911        return a.sign ? float_relation_less : float_relation_greater;
1912    } else if (b.cls == float_class_inf) {
1913        return b.sign ? float_relation_greater : float_relation_less;
1914    }
1915
1916    if (a.sign != b.sign) {
1917        return a.sign ? float_relation_less : float_relation_greater;
1918    }
1919
1920    if (a.exp == b.exp) {
1921        if (a.frac == b.frac) {
1922            return float_relation_equal;
1923        }
1924        if (a.sign) {
1925            return a.frac > b.frac ?
1926                float_relation_less : float_relation_greater;
1927        } else {
1928            return a.frac > b.frac ?
1929                float_relation_greater : float_relation_less;
1930        }
1931    } else {
1932        if (a.sign) {
1933            return a.exp > b.exp ? float_relation_less : float_relation_greater;
1934        } else {
1935            return a.exp > b.exp ? float_relation_greater : float_relation_less;
1936        }
1937    }
1938}
1939
1940#define COMPARE(sz)                                                     \
1941int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1942                            float_status *s)                            \
1943{                                                                       \
1944    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1945    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1946    return compare_floats(pa, pb, false, s);                            \
1947}                                                                       \
1948int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1949                                  float_status *s)                      \
1950{                                                                       \
1951    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1952    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1953    return compare_floats(pa, pb, true, s);                             \
1954}
1955
1956COMPARE(16)
1957COMPARE(32)
1958COMPARE(64)
1959
1960#undef COMPARE
1961
1962/* Multiply A by 2 raised to the power N.  */
1963static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1964{
1965    if (unlikely(is_nan(a.cls))) {
1966        return return_nan(a, s);
1967    }
1968    if (a.cls == float_class_normal) {
1969        /* The largest float type (even though not supported by FloatParts)
1970         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1971         * still allows rounding to infinity, without allowing overflow
1972         * within the int32_t that backs FloatParts.exp.
1973         */
1974        n = MIN(MAX(n, -0x10000), 0x10000);
1975        a.exp += n;
1976    }
1977    return a;
1978}
1979
1980float16 float16_scalbn(float16 a, int n, float_status *status)
1981{
1982    FloatParts pa = float16_unpack_canonical(a, status);
1983    FloatParts pr = scalbn_decomposed(pa, n, status);
1984    return float16_round_pack_canonical(pr, status);
1985}
1986
1987float32 float32_scalbn(float32 a, int n, float_status *status)
1988{
1989    FloatParts pa = float32_unpack_canonical(a, status);
1990    FloatParts pr = scalbn_decomposed(pa, n, status);
1991    return float32_round_pack_canonical(pr, status);
1992}
1993
1994float64 float64_scalbn(float64 a, int n, float_status *status)
1995{
1996    FloatParts pa = float64_unpack_canonical(a, status);
1997    FloatParts pr = scalbn_decomposed(pa, n, status);
1998    return float64_round_pack_canonical(pr, status);
1999}
2000
2001/*
2002 * Square Root
2003 *
2004 * The old softfloat code did an approximation step before zeroing in
2005 * on the final result. However for simpleness we just compute the
2006 * square root by iterating down from the implicit bit to enough extra
2007 * bits to ensure we get a correctly rounded result.
2008 *
2009 * This does mean however the calculation is slower than before,
2010 * especially for 64 bit floats.
2011 */
2012
2013static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2014{
2015    uint64_t a_frac, r_frac, s_frac;
2016    int bit, last_bit;
2017
2018    if (is_nan(a.cls)) {
2019        return return_nan(a, s);
2020    }
2021    if (a.cls == float_class_zero) {
2022        return a;  /* sqrt(+-0) = +-0 */
2023    }
2024    if (a.sign) {
2025        s->float_exception_flags |= float_flag_invalid;
2026        return parts_default_nan(s);
2027    }
2028    if (a.cls == float_class_inf) {
2029        return a;  /* sqrt(+inf) = +inf */
2030    }
2031
2032    assert(a.cls == float_class_normal);
2033
2034    /* We need two overflow bits at the top. Adding room for that is a
2035     * right shift. If the exponent is odd, we can discard the low bit
2036     * by multiplying the fraction by 2; that's a left shift. Combine
2037     * those and we shift right if the exponent is even.
2038     */
2039    a_frac = a.frac;
2040    if (!(a.exp & 1)) {
2041        a_frac >>= 1;
2042    }
2043    a.exp >>= 1;
2044
2045    /* Bit-by-bit computation of sqrt.  */
2046    r_frac = 0;
2047    s_frac = 0;
2048
2049    /* Iterate from implicit bit down to the 3 extra bits to compute a
2050     * properly rounded result. Remember we've inserted one more bit
2051     * at the top, so these positions are one less.
2052     */
2053    bit = DECOMPOSED_BINARY_POINT - 1;
2054    last_bit = MAX(p->frac_shift - 4, 0);
2055    do {
2056        uint64_t q = 1ULL << bit;
2057        uint64_t t_frac = s_frac + q;
2058        if (t_frac <= a_frac) {
2059            s_frac = t_frac + q;
2060            a_frac -= t_frac;
2061            r_frac += q;
2062        }
2063        a_frac <<= 1;
2064    } while (--bit >= last_bit);
2065
2066    /* Undo the right shift done above. If there is any remaining
2067     * fraction, the result is inexact. Set the sticky bit.
2068     */
2069    a.frac = (r_frac << 1) + (a_frac != 0);
2070
2071    return a;
2072}
2073
2074float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2075{
2076    FloatParts pa = float16_unpack_canonical(a, status);
2077    FloatParts pr = sqrt_float(pa, status, &float16_params);
2078    return float16_round_pack_canonical(pr, status);
2079}
2080
2081float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2082{
2083    FloatParts pa = float32_unpack_canonical(a, status);
2084    FloatParts pr = sqrt_float(pa, status, &float32_params);
2085    return float32_round_pack_canonical(pr, status);
2086}
2087
2088float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2089{
2090    FloatParts pa = float64_unpack_canonical(a, status);
2091    FloatParts pr = sqrt_float(pa, status, &float64_params);
2092    return float64_round_pack_canonical(pr, status);
2093}
2094
2095/*----------------------------------------------------------------------------
2096| The pattern for a default generated NaN.
2097*----------------------------------------------------------------------------*/
2098
2099float16 float16_default_nan(float_status *status)
2100{
2101    FloatParts p = parts_default_nan(status);
2102    p.frac >>= float16_params.frac_shift;
2103    return float16_pack_raw(p);
2104}
2105
2106float32 float32_default_nan(float_status *status)
2107{
2108    FloatParts p = parts_default_nan(status);
2109    p.frac >>= float32_params.frac_shift;
2110    return float32_pack_raw(p);
2111}
2112
2113float64 float64_default_nan(float_status *status)
2114{
2115    FloatParts p = parts_default_nan(status);
2116    p.frac >>= float64_params.frac_shift;
2117    return float64_pack_raw(p);
2118}
2119
2120float128 float128_default_nan(float_status *status)
2121{
2122    FloatParts p = parts_default_nan(status);
2123    float128 r;
2124
2125    /* Extrapolate from the choices made by parts_default_nan to fill
2126     * in the quad-floating format.  If the low bit is set, assume we
2127     * want to set all non-snan bits.
2128     */
2129    r.low = -(p.frac & 1);
2130    r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2131    r.high |= LIT64(0x7FFF000000000000);
2132    r.high |= (uint64_t)p.sign << 63;
2133
2134    return r;
2135}
2136
2137/*----------------------------------------------------------------------------
2138| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2139*----------------------------------------------------------------------------*/
2140
2141float16 float16_silence_nan(float16 a, float_status *status)
2142{
2143    FloatParts p = float16_unpack_raw(a);
2144    p.frac <<= float16_params.frac_shift;
2145    p = parts_silence_nan(p, status);
2146    p.frac >>= float16_params.frac_shift;
2147    return float16_pack_raw(p);
2148}
2149
2150float32 float32_silence_nan(float32 a, float_status *status)
2151{
2152    FloatParts p = float32_unpack_raw(a);
2153    p.frac <<= float32_params.frac_shift;
2154    p = parts_silence_nan(p, status);
2155    p.frac >>= float32_params.frac_shift;
2156    return float32_pack_raw(p);
2157}
2158
2159float64 float64_silence_nan(float64 a, float_status *status)
2160{
2161    FloatParts p = float64_unpack_raw(a);
2162    p.frac <<= float64_params.frac_shift;
2163    p = parts_silence_nan(p, status);
2164    p.frac >>= float64_params.frac_shift;
2165    return float64_pack_raw(p);
2166}
2167
2168/*----------------------------------------------------------------------------
2169| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2170| and 7, and returns the properly rounded 32-bit integer corresponding to the
2171| input.  If `zSign' is 1, the input is negated before being converted to an
2172| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2173| is simply rounded to an integer, with the inexact exception raised if the
2174| input cannot be represented exactly as an integer.  However, if the fixed-
2175| point input is too large, the invalid exception is raised and the largest
2176| positive or negative integer is returned.
2177*----------------------------------------------------------------------------*/
2178
2179static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2180{
2181    int8_t roundingMode;
2182    flag roundNearestEven;
2183    int8_t roundIncrement, roundBits;
2184    int32_t z;
2185
2186    roundingMode = status->float_rounding_mode;
2187    roundNearestEven = ( roundingMode == float_round_nearest_even );
2188    switch (roundingMode) {
2189    case float_round_nearest_even:
2190    case float_round_ties_away:
2191        roundIncrement = 0x40;
2192        break;
2193    case float_round_to_zero:
2194        roundIncrement = 0;
2195        break;
2196    case float_round_up:
2197        roundIncrement = zSign ? 0 : 0x7f;
2198        break;
2199    case float_round_down:
2200        roundIncrement = zSign ? 0x7f : 0;
2201        break;
2202    default:
2203        abort();
2204    }
2205    roundBits = absZ & 0x7F;
2206    absZ = ( absZ + roundIncrement )>>7;
2207    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2208    z = absZ;
2209    if ( zSign ) z = - z;
2210    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2211        float_raise(float_flag_invalid, status);
2212        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2213    }
2214    if (roundBits) {
2215        status->float_exception_flags |= float_flag_inexact;
2216    }
2217    return z;
2218
2219}
2220
2221/*----------------------------------------------------------------------------
2222| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2223| `absZ1', with binary point between bits 63 and 64 (between the input words),
2224| and returns the properly rounded 64-bit integer corresponding to the input.
2225| If `zSign' is 1, the input is negated before being converted to an integer.
2226| Ordinarily, the fixed-point input is simply rounded to an integer, with
2227| the inexact exception raised if the input cannot be represented exactly as
2228| an integer.  However, if the fixed-point input is too large, the invalid
2229| exception is raised and the largest positive or negative integer is
2230| returned.
2231*----------------------------------------------------------------------------*/
2232
2233static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2234                               float_status *status)
2235{
2236    int8_t roundingMode;
2237    flag roundNearestEven, increment;
2238    int64_t z;
2239
2240    roundingMode = status->float_rounding_mode;
2241    roundNearestEven = ( roundingMode == float_round_nearest_even );
2242    switch (roundingMode) {
2243    case float_round_nearest_even:
2244    case float_round_ties_away:
2245        increment = ((int64_t) absZ1 < 0);
2246        break;
2247    case float_round_to_zero:
2248        increment = 0;
2249        break;
2250    case float_round_up:
2251        increment = !zSign && absZ1;
2252        break;
2253    case float_round_down:
2254        increment = zSign && absZ1;
2255        break;
2256    default:
2257        abort();
2258    }
2259    if ( increment ) {
2260        ++absZ0;
2261        if ( absZ0 == 0 ) goto overflow;
2262        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2263    }
2264    z = absZ0;
2265    if ( zSign ) z = - z;
2266    if ( z && ( ( z < 0 ) ^ zSign ) ) {
2267 overflow:
2268        float_raise(float_flag_invalid, status);
2269        return
2270              zSign ? (int64_t) LIT64( 0x8000000000000000 )
2271            : LIT64( 0x7FFFFFFFFFFFFFFF );
2272    }
2273    if (absZ1) {
2274        status->float_exception_flags |= float_flag_inexact;
2275    }
2276    return z;
2277
2278}
2279
2280/*----------------------------------------------------------------------------
2281| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2282| `absZ1', with binary point between bits 63 and 64 (between the input words),
2283| and returns the properly rounded 64-bit unsigned integer corresponding to the
2284| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2285| with the inexact exception raised if the input cannot be represented exactly
2286| as an integer.  However, if the fixed-point input is too large, the invalid
2287| exception is raised and the largest unsigned integer is returned.
2288*----------------------------------------------------------------------------*/
2289
2290static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2291                                uint64_t absZ1, float_status *status)
2292{
2293    int8_t roundingMode;
2294    flag roundNearestEven, increment;
2295
2296    roundingMode = status->float_rounding_mode;
2297    roundNearestEven = (roundingMode == float_round_nearest_even);
2298    switch (roundingMode) {
2299    case float_round_nearest_even:
2300    case float_round_ties_away:
2301        increment = ((int64_t)absZ1 < 0);
2302        break;
2303    case float_round_to_zero:
2304        increment = 0;
2305        break;
2306    case float_round_up:
2307        increment = !zSign && absZ1;
2308        break;
2309    case float_round_down:
2310        increment = zSign && absZ1;
2311        break;
2312    default:
2313        abort();
2314    }
2315    if (increment) {
2316        ++absZ0;
2317        if (absZ0 == 0) {
2318            float_raise(float_flag_invalid, status);
2319            return LIT64(0xFFFFFFFFFFFFFFFF);
2320        }
2321        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2322    }
2323
2324    if (zSign && absZ0) {
2325        float_raise(float_flag_invalid, status);
2326        return 0;
2327    }
2328
2329    if (absZ1) {
2330        status->float_exception_flags |= float_flag_inexact;
2331    }
2332    return absZ0;
2333}
2334
2335/*----------------------------------------------------------------------------
2336| If `a' is denormal and we are in flush-to-zero mode then set the
2337| input-denormal exception and return zero. Otherwise just return the value.
2338*----------------------------------------------------------------------------*/
2339float32 float32_squash_input_denormal(float32 a, float_status *status)
2340{
2341    if (status->flush_inputs_to_zero) {
2342        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2343            float_raise(float_flag_input_denormal, status);
2344            return make_float32(float32_val(a) & 0x80000000);
2345        }
2346    }
2347    return a;
2348}
2349
2350/*----------------------------------------------------------------------------
2351| Normalizes the subnormal single-precision floating-point value represented
2352| by the denormalized significand `aSig'.  The normalized exponent and
2353| significand are stored at the locations pointed to by `zExpPtr' and
2354| `zSigPtr', respectively.
2355*----------------------------------------------------------------------------*/
2356
2357static void
2358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2359{
2360    int8_t shiftCount;
2361
2362    shiftCount = countLeadingZeros32( aSig ) - 8;
2363    *zSigPtr = aSig<<shiftCount;
2364    *zExpPtr = 1 - shiftCount;
2365
2366}
2367
2368/*----------------------------------------------------------------------------
2369| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2370| and significand `zSig', and returns the proper single-precision floating-
2371| point value corresponding to the abstract input.  Ordinarily, the abstract
2372| value is simply rounded and packed into the single-precision format, with
2373| the inexact exception raised if the abstract input cannot be represented
2374| exactly.  However, if the abstract value is too large, the overflow and
2375| inexact exceptions are raised and an infinity or maximal finite value is
2376| returned.  If the abstract value is too small, the input value is rounded to
2377| a subnormal number, and the underflow and inexact exceptions are raised if
2378| the abstract input cannot be represented exactly as a subnormal single-
2379| precision floating-point number.
2380|     The input significand `zSig' has its binary point between bits 30
2381| and 29, which is 7 bits to the left of the usual location.  This shifted
2382| significand must be normalized or smaller.  If `zSig' is not normalized,
2383| `zExp' must be 0; in that case, the result returned is a subnormal number,
2384| and it must not require rounding.  In the usual case that `zSig' is
2385| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2386| The handling of underflow and overflow follows the IEC/IEEE Standard for
2387| Binary Floating-Point Arithmetic.
2388*----------------------------------------------------------------------------*/
2389
2390static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2391                                   float_status *status)
2392{
2393    int8_t roundingMode;
2394    flag roundNearestEven;
2395    int8_t roundIncrement, roundBits;
2396    flag isTiny;
2397
2398    roundingMode = status->float_rounding_mode;
2399    roundNearestEven = ( roundingMode == float_round_nearest_even );
2400    switch (roundingMode) {
2401    case float_round_nearest_even:
2402    case float_round_ties_away:
2403        roundIncrement = 0x40;
2404        break;
2405    case float_round_to_zero:
2406        roundIncrement = 0;
2407        break;
2408    case float_round_up:
2409        roundIncrement = zSign ? 0 : 0x7f;
2410        break;
2411    case float_round_down:
2412        roundIncrement = zSign ? 0x7f : 0;
2413        break;
2414    default:
2415        abort();
2416        break;
2417    }
2418    roundBits = zSig & 0x7F;
2419    if ( 0xFD <= (uint16_t) zExp ) {
2420        if (    ( 0xFD < zExp )
2421             || (    ( zExp == 0xFD )
2422                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2423           ) {
2424            float_raise(float_flag_overflow | float_flag_inexact, status);
2425            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2426        }
2427        if ( zExp < 0 ) {
2428            if (status->flush_to_zero) {
2429                float_raise(float_flag_output_denormal, status);
2430                return packFloat32(zSign, 0, 0);
2431            }
2432            isTiny =
2433                (status->float_detect_tininess
2434                 == float_tininess_before_rounding)
2435                || ( zExp < -1 )
2436                || ( zSig + roundIncrement < 0x80000000 );
2437            shift32RightJamming( zSig, - zExp, &zSig );
2438            zExp = 0;
2439            roundBits = zSig & 0x7F;
2440            if (isTiny && roundBits) {
2441                float_raise(float_flag_underflow, status);
2442            }
2443        }
2444    }
2445    if (roundBits) {
2446        status->float_exception_flags |= float_flag_inexact;
2447    }
2448    zSig = ( zSig + roundIncrement )>>7;
2449    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2450    if ( zSig == 0 ) zExp = 0;
2451    return packFloat32( zSign, zExp, zSig );
2452
2453}
2454
2455/*----------------------------------------------------------------------------
2456| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2457| and significand `zSig', and returns the proper single-precision floating-
2458| point value corresponding to the abstract input.  This routine is just like
2459| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2460| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2461| floating-point exponent.
2462*----------------------------------------------------------------------------*/
2463
2464static float32
2465 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2466                              float_status *status)
2467{
2468    int8_t shiftCount;
2469
2470    shiftCount = countLeadingZeros32( zSig ) - 1;
2471    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2472                               status);
2473
2474}
2475
2476/*----------------------------------------------------------------------------
2477| If `a' is denormal and we are in flush-to-zero mode then set the
2478| input-denormal exception and return zero. Otherwise just return the value.
2479*----------------------------------------------------------------------------*/
2480float64 float64_squash_input_denormal(float64 a, float_status *status)
2481{
2482    if (status->flush_inputs_to_zero) {
2483        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2484            float_raise(float_flag_input_denormal, status);
2485            return make_float64(float64_val(a) & (1ULL << 63));
2486        }
2487    }
2488    return a;
2489}
2490
2491/*----------------------------------------------------------------------------
2492| Normalizes the subnormal double-precision floating-point value represented
2493| by the denormalized significand `aSig'.  The normalized exponent and
2494| significand are stored at the locations pointed to by `zExpPtr' and
2495| `zSigPtr', respectively.
2496*----------------------------------------------------------------------------*/
2497
2498static void
2499 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2500{
2501    int8_t shiftCount;
2502
2503    shiftCount = countLeadingZeros64( aSig ) - 11;
2504    *zSigPtr = aSig<<shiftCount;
2505    *zExpPtr = 1 - shiftCount;
2506
2507}
2508
2509/*----------------------------------------------------------------------------
2510| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2511| double-precision floating-point value, returning the result.  After being
2512| shifted into the proper positions, the three fields are simply added
2513| together to form the result.  This means that any integer portion of `zSig'
2514| will be added into the exponent.  Since a properly normalized significand
2515| will have an integer portion equal to 1, the `zExp' input should be 1 less
2516| than the desired result exponent whenever `zSig' is a complete, normalized
2517| significand.
2518*----------------------------------------------------------------------------*/
2519
2520static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2521{
2522
2523    return make_float64(
2524        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2525
2526}
2527
2528/*----------------------------------------------------------------------------
2529| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2530| and significand `zSig', and returns the proper double-precision floating-
2531| point value corresponding to the abstract input.  Ordinarily, the abstract
2532| value is simply rounded and packed into the double-precision format, with
2533| the inexact exception raised if the abstract input cannot be represented
2534| exactly.  However, if the abstract value is too large, the overflow and
2535| inexact exceptions are raised and an infinity or maximal finite value is
2536| returned.  If the abstract value is too small, the input value is rounded to
2537| a subnormal number, and the underflow and inexact exceptions are raised if
2538| the abstract input cannot be represented exactly as a subnormal double-
2539| precision floating-point number.
2540|     The input significand `zSig' has its binary point between bits 62
2541| and 61, which is 10 bits to the left of the usual location.  This shifted
2542| significand must be normalized or smaller.  If `zSig' is not normalized,
2543| `zExp' must be 0; in that case, the result returned is a subnormal number,
2544| and it must not require rounding.  In the usual case that `zSig' is
2545| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2546| The handling of underflow and overflow follows the IEC/IEEE Standard for
2547| Binary Floating-Point Arithmetic.
2548*----------------------------------------------------------------------------*/
2549
2550static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2551                                   float_status *status)
2552{
2553    int8_t roundingMode;
2554    flag roundNearestEven;
2555    int roundIncrement, roundBits;
2556    flag isTiny;
2557
2558    roundingMode = status->float_rounding_mode;
2559    roundNearestEven = ( roundingMode == float_round_nearest_even );
2560    switch (roundingMode) {
2561    case float_round_nearest_even:
2562    case float_round_ties_away:
2563        roundIncrement = 0x200;
2564        break;
2565    case float_round_to_zero:
2566        roundIncrement = 0;
2567        break;
2568    case float_round_up:
2569        roundIncrement = zSign ? 0 : 0x3ff;
2570        break;
2571    case float_round_down:
2572        roundIncrement = zSign ? 0x3ff : 0;
2573        break;
2574    case float_round_to_odd:
2575        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2576        break;
2577    default:
2578        abort();
2579    }
2580    roundBits = zSig & 0x3FF;
2581    if ( 0x7FD <= (uint16_t) zExp ) {
2582        if (    ( 0x7FD < zExp )
2583             || (    ( zExp == 0x7FD )
2584                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2585           ) {
2586            bool overflow_to_inf = roundingMode != float_round_to_odd &&
2587                                   roundIncrement != 0;
2588            float_raise(float_flag_overflow | float_flag_inexact, status);
2589            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2590        }
2591        if ( zExp < 0 ) {
2592            if (status->flush_to_zero) {
2593                float_raise(float_flag_output_denormal, status);
2594                return packFloat64(zSign, 0, 0);
2595            }
2596            isTiny =
2597                   (status->float_detect_tininess
2598                    == float_tininess_before_rounding)
2599                || ( zExp < -1 )
2600                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2601            shift64RightJamming( zSig, - zExp, &zSig );
2602            zExp = 0;
2603            roundBits = zSig & 0x3FF;
2604            if (isTiny && roundBits) {
2605                float_raise(float_flag_underflow, status);
2606            }
2607            if (roundingMode == float_round_to_odd) {
2608                /*
2609                 * For round-to-odd case, the roundIncrement depends on
2610                 * zSig which just changed.
2611                 */
2612                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2613            }
2614        }
2615    }
2616    if (roundBits) {
2617        status->float_exception_flags |= float_flag_inexact;
2618    }
2619    zSig = ( zSig + roundIncrement )>>10;
2620    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2621    if ( zSig == 0 ) zExp = 0;
2622    return packFloat64( zSign, zExp, zSig );
2623
2624}
2625
2626/*----------------------------------------------------------------------------
2627| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2628| and significand `zSig', and returns the proper double-precision floating-
2629| point value corresponding to the abstract input.  This routine is just like
2630| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2631| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2632| floating-point exponent.
2633*----------------------------------------------------------------------------*/
2634
2635static float64
2636 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2637                              float_status *status)
2638{
2639    int8_t shiftCount;
2640
2641    shiftCount = countLeadingZeros64( zSig ) - 1;
2642    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2643                               status);
2644
2645}
2646
2647/*----------------------------------------------------------------------------
2648| Normalizes the subnormal extended double-precision floating-point value
2649| represented by the denormalized significand `aSig'.  The normalized exponent
2650| and significand are stored at the locations pointed to by `zExpPtr' and
2651| `zSigPtr', respectively.
2652*----------------------------------------------------------------------------*/
2653
2654void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2655                                uint64_t *zSigPtr)
2656{
2657    int8_t shiftCount;
2658
2659    shiftCount = countLeadingZeros64( aSig );
2660    *zSigPtr = aSig<<shiftCount;
2661    *zExpPtr = 1 - shiftCount;
2662}
2663
2664/*----------------------------------------------------------------------------
2665| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2666| and extended significand formed by the concatenation of `zSig0' and `zSig1',
2667| and returns the proper extended double-precision floating-point value
2668| corresponding to the abstract input.  Ordinarily, the abstract value is
2669| rounded and packed into the extended double-precision format, with the
2670| inexact exception raised if the abstract input cannot be represented
2671| exactly.  However, if the abstract value is too large, the overflow and
2672| inexact exceptions are raised and an infinity or maximal finite value is
2673| returned.  If the abstract value is too small, the input value is rounded to
2674| a subnormal number, and the underflow and inexact exceptions are raised if
2675| the abstract input cannot be represented exactly as a subnormal extended
2676| double-precision floating-point number.
2677|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2678| number of bits as single or double precision, respectively.  Otherwise, the
2679| result is rounded to the full precision of the extended double-precision
2680| format.
2681|     The input significand must be normalized or smaller.  If the input
2682| significand is not normalized, `zExp' must be 0; in that case, the result
2683| returned is a subnormal number, and it must not require rounding.  The
2684| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2685| Floating-Point Arithmetic.
2686*----------------------------------------------------------------------------*/
2687
2688floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2689                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2690                              float_status *status)
2691{
2692    int8_t roundingMode;
2693    flag roundNearestEven, increment, isTiny;
2694    int64_t roundIncrement, roundMask, roundBits;
2695
2696    roundingMode = status->float_rounding_mode;
2697    roundNearestEven = ( roundingMode == float_round_nearest_even );
2698    if ( roundingPrecision == 80 ) goto precision80;
2699    if ( roundingPrecision == 64 ) {
2700        roundIncrement = LIT64( 0x0000000000000400 );
2701        roundMask = LIT64( 0x00000000000007FF );
2702    }
2703    else if ( roundingPrecision == 32 ) {
2704        roundIncrement = LIT64( 0x0000008000000000 );
2705        roundMask = LIT64( 0x000000FFFFFFFFFF );
2706    }
2707    else {
2708        goto precision80;
2709    }
2710    zSig0 |= ( zSig1 != 0 );
2711    switch (roundingMode) {
2712    case float_round_nearest_even:
2713    case float_round_ties_away:
2714        break;
2715    case float_round_to_zero:
2716        roundIncrement = 0;
2717        break;
2718    case float_round_up:
2719        roundIncrement = zSign ? 0 : roundMask;
2720        break;
2721    case float_round_down:
2722        roundIncrement = zSign ? roundMask : 0;
2723        break;
2724    default:
2725        abort();
2726    }
2727    roundBits = zSig0 & roundMask;
2728    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2729        if (    ( 0x7FFE < zExp )
2730             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2731           ) {
2732            goto overflow;
2733        }
2734        if ( zExp <= 0 ) {
2735            if (status->flush_to_zero) {
2736                float_raise(float_flag_output_denormal, status);
2737                return packFloatx80(zSign, 0, 0);
2738            }
2739            isTiny =
2740                   (status->float_detect_tininess
2741                    == float_tininess_before_rounding)
2742                || ( zExp < 0 )
2743                || ( zSig0 <= zSig0 + roundIncrement );
2744            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2745            zExp = 0;
2746            roundBits = zSig0 & roundMask;
2747            if (isTiny && roundBits) {
2748                float_raise(float_flag_underflow, status);
2749            }
2750            if (roundBits) {
2751                status->float_exception_flags |= float_flag_inexact;
2752            }
2753            zSig0 += roundIncrement;
2754            if ( (int64_t) zSig0 < 0 ) zExp = 1;
2755            roundIncrement = roundMask + 1;
2756            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2757                roundMask |= roundIncrement;
2758            }
2759            zSig0 &= ~ roundMask;
2760            return packFloatx80( zSign, zExp, zSig0 );
2761        }
2762    }
2763    if (roundBits) {
2764        status->float_exception_flags |= float_flag_inexact;
2765    }
2766    zSig0 += roundIncrement;
2767    if ( zSig0 < roundIncrement ) {
2768        ++zExp;
2769        zSig0 = LIT64( 0x8000000000000000 );
2770    }
2771    roundIncrement = roundMask + 1;
2772    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2773        roundMask |= roundIncrement;
2774    }
2775    zSig0 &= ~ roundMask;
2776    if ( zSig0 == 0 ) zExp = 0;
2777    return packFloatx80( zSign, zExp, zSig0 );
2778 precision80:
2779    switch (roundingMode) {
2780    case float_round_nearest_even:
2781    case float_round_ties_away:
2782        increment = ((int64_t)zSig1 < 0);
2783        break;
2784    case float_round_to_zero:
2785        increment = 0;
2786        break;
2787    case float_round_up:
2788        increment = !zSign && zSig1;
2789        break;
2790    case float_round_down:
2791        increment = zSign && zSig1;
2792        break;
2793    default:
2794        abort();
2795    }
2796    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2797        if (    ( 0x7FFE < zExp )
2798             || (    ( zExp == 0x7FFE )
2799                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2800                  && increment
2801                )
2802           ) {
2803            roundMask = 0;
2804 overflow:
2805            float_raise(float_flag_overflow | float_flag_inexact, status);
2806            if (    ( roundingMode == float_round_to_zero )
2807                 || ( zSign && ( roundingMode == float_round_up ) )
2808                 || ( ! zSign && ( roundingMode == float_round_down ) )
2809               ) {
2810                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2811            }
2812            return packFloatx80(zSign,
2813                                floatx80_infinity_high,
2814                                floatx80_infinity_low);
2815        }
2816        if ( zExp <= 0 ) {
2817            isTiny =
2818                   (status->float_detect_tininess
2819                    == float_tininess_before_rounding)
2820                || ( zExp < 0 )
2821                || ! increment
2822                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2823            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2824            zExp = 0;
2825            if (isTiny && zSig1) {
2826                float_raise(float_flag_underflow, status);
2827            }
2828            if (zSig1) {
2829                status->float_exception_flags |= float_flag_inexact;
2830            }
2831            switch (roundingMode) {
2832            case float_round_nearest_even:
2833            case float_round_ties_away:
2834                increment = ((int64_t)zSig1 < 0);
2835                break;
2836            case float_round_to_zero:
2837                increment = 0;
2838                break;
2839            case float_round_up:
2840                increment = !zSign && zSig1;
2841                break;
2842            case float_round_down:
2843                increment = zSign && zSig1;
2844                break;
2845            default:
2846                abort();
2847            }
2848            if ( increment ) {
2849                ++zSig0;
2850                zSig0 &=
2851                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2852                if ( (int64_t) zSig0 < 0 ) zExp = 1;
2853            }
2854            return packFloatx80( zSign, zExp, zSig0 );
2855        }
2856    }
2857    if (zSig1) {
2858        status->float_exception_flags |= float_flag_inexact;
2859    }
2860    if ( increment ) {
2861        ++zSig0;
2862        if ( zSig0 == 0 ) {
2863            ++zExp;
2864            zSig0 = LIT64( 0x8000000000000000 );
2865        }
2866        else {
2867            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2868        }
2869    }
2870    else {
2871        if ( zSig0 == 0 ) zExp = 0;
2872    }
2873    return packFloatx80( zSign, zExp, zSig0 );
2874
2875}
2876
2877/*----------------------------------------------------------------------------
2878| Takes an abstract floating-point value having sign `zSign', exponent
2879| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2880| and returns the proper extended double-precision floating-point value
2881| corresponding to the abstract input.  This routine is just like
2882| `roundAndPackFloatx80' except that the input significand does not have to be
2883| normalized.
2884*----------------------------------------------------------------------------*/
2885
2886floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2887                                       flag zSign, int32_t zExp,
2888                                       uint64_t zSig0, uint64_t zSig1,
2889                                       float_status *status)
2890{
2891    int8_t shiftCount;
2892
2893    if ( zSig0 == 0 ) {
2894        zSig0 = zSig1;
2895        zSig1 = 0;
2896        zExp -= 64;
2897    }
2898    shiftCount = countLeadingZeros64( zSig0 );
2899    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2900    zExp -= shiftCount;
2901    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2902                                zSig0, zSig1, status);
2903
2904}
2905
2906/*----------------------------------------------------------------------------
2907| Returns the least-significant 64 fraction bits of the quadruple-precision
2908| floating-point value `a'.
2909*----------------------------------------------------------------------------*/
2910
2911static inline uint64_t extractFloat128Frac1( float128 a )
2912{
2913
2914    return a.low;
2915
2916}
2917
2918/*----------------------------------------------------------------------------
2919| Returns the most-significant 48 fraction bits of the quadruple-precision
2920| floating-point value `a'.
2921*----------------------------------------------------------------------------*/
2922
2923static inline uint64_t extractFloat128Frac0( float128 a )
2924{
2925
2926    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2927
2928}
2929
2930/*----------------------------------------------------------------------------
2931| Returns the exponent bits of the quadruple-precision floating-point value
2932| `a'.
2933*----------------------------------------------------------------------------*/
2934
2935static inline int32_t extractFloat128Exp( float128 a )
2936{
2937
2938    return ( a.high>>48 ) & 0x7FFF;
2939
2940}
2941
2942/*----------------------------------------------------------------------------
2943| Returns the sign bit of the quadruple-precision floating-point value `a'.
2944*----------------------------------------------------------------------------*/
2945
2946static inline flag extractFloat128Sign( float128 a )
2947{
2948
2949    return a.high>>63;
2950
2951}
2952
2953/*----------------------------------------------------------------------------
2954| Normalizes the subnormal quadruple-precision floating-point value
2955| represented by the denormalized significand formed by the concatenation of
2956| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2957| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2958| significand are stored at the location pointed to by `zSig0Ptr', and the
2959| least significant 64 bits of the normalized significand are stored at the
2960| location pointed to by `zSig1Ptr'.
2961*----------------------------------------------------------------------------*/
2962
2963static void
2964 normalizeFloat128Subnormal(
2965     uint64_t aSig0,
2966     uint64_t aSig1,
2967     int32_t *zExpPtr,
2968     uint64_t *zSig0Ptr,
2969     uint64_t *zSig1Ptr
2970 )
2971{
2972    int8_t shiftCount;
2973
2974    if ( aSig0 == 0 ) {
2975        shiftCount = countLeadingZeros64( aSig1 ) - 15;
2976        if ( shiftCount < 0 ) {
2977            *zSig0Ptr = aSig1>>( - shiftCount );
2978            *zSig1Ptr = aSig1<<( shiftCount & 63 );
2979        }
2980        else {
2981            *zSig0Ptr = aSig1<<shiftCount;
2982            *zSig1Ptr = 0;
2983        }
2984        *zExpPtr = - shiftCount - 63;
2985    }
2986    else {
2987        shiftCount = countLeadingZeros64( aSig0 ) - 15;
2988        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2989        *zExpPtr = 1 - shiftCount;
2990    }
2991
2992}
2993
2994/*----------------------------------------------------------------------------
2995| Packs the sign `zSign', the exponent `zExp', and the significand formed
2996| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2997| floating-point value, returning the result.  After being shifted into the
2998| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2999| added together to form the most significant 32 bits of the result.  This
3000| means that any integer portion of `zSig0' will be added into the exponent.
3001| Since a properly normalized significand will have an integer portion equal
3002| to 1, the `zExp' input should be 1 less than the desired result exponent
3003| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3004| significand.
3005*----------------------------------------------------------------------------*/
3006
3007static inline float128
3008 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3009{
3010    float128 z;
3011
3012    z.low = zSig1;
3013    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3014    return z;
3015
3016}
3017
3018/*----------------------------------------------------------------------------
3019| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3020| and extended significand formed by the concatenation of `zSig0', `zSig1',
3021| and `zSig2', and returns the proper quadruple-precision floating-point value
3022| corresponding to the abstract input.  Ordinarily, the abstract value is
3023| simply rounded and packed into the quadruple-precision format, with the
3024| inexact exception raised if the abstract input cannot be represented
3025| exactly.  However, if the abstract value is too large, the overflow and
3026| inexact exceptions are raised and an infinity or maximal finite value is
3027| returned.  If the abstract value is too small, the input value is rounded to
3028| a subnormal number, and the underflow and inexact exceptions are raised if
3029| the abstract input cannot be represented exactly as a subnormal quadruple-
3030| precision floating-point number.
3031|     The input significand must be normalized or smaller.  If the input
3032| significand is not normalized, `zExp' must be 0; in that case, the result
3033| returned is a subnormal number, and it must not require rounding.  In the
3034| usual case that the input significand is normalized, `zExp' must be 1 less
3035| than the ``true'' floating-point exponent.  The handling of underflow and
3036| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3037*----------------------------------------------------------------------------*/
3038
3039static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3040                                     uint64_t zSig0, uint64_t zSig1,
3041                                     uint64_t zSig2, float_status *status)
3042{
3043    int8_t roundingMode;
3044    flag roundNearestEven, increment, isTiny;
3045
3046    roundingMode = status->float_rounding_mode;
3047    roundNearestEven = ( roundingMode == float_round_nearest_even );
3048    switch (roundingMode) {
3049    case float_round_nearest_even:
3050    case float_round_ties_away:
3051        increment = ((int64_t)zSig2 < 0);
3052        break;
3053    case float_round_to_zero:
3054        increment = 0;
3055        break;
3056    case float_round_up:
3057        increment = !zSign && zSig2;
3058        break;
3059    case float_round_down:
3060        increment = zSign && zSig2;
3061        break;
3062    case float_round_to_odd:
3063        increment = !(zSig1 & 0x1) && zSig2;
3064        break;
3065    default:
3066        abort();
3067    }
3068    if ( 0x7FFD <= (uint32_t) zExp ) {
3069        if (    ( 0x7FFD < zExp )
3070             || (    ( zExp == 0x7FFD )
3071                  && eq128(
3072                         LIT64( 0x0001FFFFFFFFFFFF ),
3073                         LIT64( 0xFFFFFFFFFFFFFFFF ),
3074                         zSig0,
3075                         zSig1
3076                     )
3077                  && increment
3078                )
3079           ) {
3080            float_raise(float_flag_overflow | float_flag_inexact, status);
3081            if (    ( roundingMode == float_round_to_zero )
3082                 || ( zSign && ( roundingMode == float_round_up ) )
3083                 || ( ! zSign && ( roundingMode == float_round_down ) )
3084                 || (roundingMode == float_round_to_odd)
3085               ) {
3086                return
3087                    packFloat128(
3088                        zSign,
3089                        0x7FFE,
3090                        LIT64( 0x0000FFFFFFFFFFFF ),
3091                        LIT64( 0xFFFFFFFFFFFFFFFF )
3092                    );
3093            }
3094            return packFloat128( zSign, 0x7FFF, 0, 0 );
3095        }
3096        if ( zExp < 0 ) {
3097            if (status->flush_to_zero) {
3098                float_raise(float_flag_output_denormal, status);
3099                return packFloat128(zSign, 0, 0, 0);
3100            }
3101            isTiny =
3102                   (status->float_detect_tininess
3103                    == float_tininess_before_rounding)
3104                || ( zExp < -1 )
3105                || ! increment
3106                || lt128(
3107                       zSig0,
3108                       zSig1,
3109                       LIT64( 0x0001FFFFFFFFFFFF ),
3110                       LIT64( 0xFFFFFFFFFFFFFFFF )
3111                   );
3112            shift128ExtraRightJamming(
3113                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3114            zExp = 0;
3115            if (isTiny && zSig2) {
3116                float_raise(float_flag_underflow, status);
3117            }
3118            switch (roundingMode) {
3119            case float_round_nearest_even:
3120            case float_round_ties_away:
3121                increment = ((int64_t)zSig2 < 0);
3122                break;
3123            case float_round_to_zero:
3124                increment = 0;
3125                break;
3126            case float_round_up:
3127                increment = !zSign && zSig2;
3128                break;
3129            case float_round_down:
3130                increment = zSign && zSig2;
3131                break;
3132            case float_round_to_odd:
3133                increment = !(zSig1 & 0x1) && zSig2;
3134                break;
3135            default:
3136                abort();
3137            }
3138        }
3139    }
3140    if (zSig2) {
3141        status->float_exception_flags |= float_flag_inexact;
3142    }
3143    if ( increment ) {
3144        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3145        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3146    }
3147    else {
3148        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3149    }
3150    return packFloat128( zSign, zExp, zSig0, zSig1 );
3151
3152}
3153
3154/*----------------------------------------------------------------------------
3155| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3156| and significand formed by the concatenation of `zSig0' and `zSig1', and
3157| returns the proper quadruple-precision floating-point value corresponding
3158| to the abstract input.  This routine is just like `roundAndPackFloat128'
3159| except that the input significand has fewer bits and does not have to be
3160| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3161| point exponent.
3162*----------------------------------------------------------------------------*/
3163
3164static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3165                                              uint64_t zSig0, uint64_t zSig1,
3166                                              float_status *status)
3167{
3168    int8_t shiftCount;
3169    uint64_t zSig2;
3170
3171    if ( zSig0 == 0 ) {
3172        zSig0 = zSig1;
3173        zSig1 = 0;
3174        zExp -= 64;
3175    }
3176    shiftCount = countLeadingZeros64( zSig0 ) - 15;
3177    if ( 0 <= shiftCount ) {
3178        zSig2 = 0;
3179        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3180    }
3181    else {
3182        shift128ExtraRightJamming(
3183            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3184    }
3185    zExp -= shiftCount;
3186    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3187
3188}
3189
3190
3191/*----------------------------------------------------------------------------
3192| Returns the result of converting the 32-bit two's complement integer `a'
3193| to the extended double-precision floating-point format.  The conversion
3194| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3195| Arithmetic.
3196*----------------------------------------------------------------------------*/
3197
3198floatx80 int32_to_floatx80(int32_t a, float_status *status)
3199{
3200    flag zSign;
3201    uint32_t absA;
3202    int8_t shiftCount;
3203    uint64_t zSig;
3204
3205    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3206    zSign = ( a < 0 );
3207    absA = zSign ? - a : a;
3208    shiftCount = countLeadingZeros32( absA ) + 32;
3209    zSig = absA;
3210    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3211
3212}
3213
3214/*----------------------------------------------------------------------------
3215| Returns the result of converting the 32-bit two's complement integer `a' to
3216| the quadruple-precision floating-point format.  The conversion is performed
3217| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3218*----------------------------------------------------------------------------*/
3219
3220float128 int32_to_float128(int32_t a, float_status *status)
3221{
3222    flag zSign;
3223    uint32_t absA;
3224    int8_t shiftCount;
3225    uint64_t zSig0;
3226
3227    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3228    zSign = ( a < 0 );
3229    absA = zSign ? - a : a;
3230    shiftCount = countLeadingZeros32( absA ) + 17;
3231    zSig0 = absA;
3232    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3233
3234}
3235
3236/*----------------------------------------------------------------------------
3237| Returns the result of converting the 64-bit two's complement integer `a'
3238| to the extended double-precision floating-point format.  The conversion
3239| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3240| Arithmetic.
3241*----------------------------------------------------------------------------*/
3242
3243floatx80 int64_to_floatx80(int64_t a, float_status *status)
3244{
3245    flag zSign;
3246    uint64_t absA;
3247    int8_t shiftCount;
3248
3249    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3250    zSign = ( a < 0 );
3251    absA = zSign ? - a : a;
3252    shiftCount = countLeadingZeros64( absA );
3253    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3254
3255}
3256
3257/*----------------------------------------------------------------------------
3258| Returns the result of converting the 64-bit two's complement integer `a' to
3259| the quadruple-precision floating-point format.  The conversion is performed
3260| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3261*----------------------------------------------------------------------------*/
3262
3263float128 int64_to_float128(int64_t a, float_status *status)
3264{
3265    flag zSign;
3266    uint64_t absA;
3267    int8_t shiftCount;
3268    int32_t zExp;
3269    uint64_t zSig0, zSig1;
3270
3271    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3272    zSign = ( a < 0 );
3273    absA = zSign ? - a : a;
3274    shiftCount = countLeadingZeros64( absA ) + 49;
3275    zExp = 0x406E - shiftCount;
3276    if ( 64 <= shiftCount ) {
3277        zSig1 = 0;
3278        zSig0 = absA;
3279        shiftCount -= 64;
3280    }
3281    else {
3282        zSig1 = absA;
3283        zSig0 = 0;
3284    }
3285    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3286    return packFloat128( zSign, zExp, zSig0, zSig1 );
3287
3288}
3289
3290/*----------------------------------------------------------------------------
3291| Returns the result of converting the 64-bit unsigned integer `a'
3292| to the quadruple-precision floating-point format.  The conversion is performed
3293| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294*----------------------------------------------------------------------------*/
3295
3296float128 uint64_to_float128(uint64_t a, float_status *status)
3297{
3298    if (a == 0) {
3299        return float128_zero;
3300    }
3301    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3302}
3303
3304/*----------------------------------------------------------------------------
3305| Returns the result of converting the single-precision floating-point value
3306| `a' to the extended double-precision floating-point format.  The conversion
3307| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3308| Arithmetic.
3309*----------------------------------------------------------------------------*/
3310
3311floatx80 float32_to_floatx80(float32 a, float_status *status)
3312{
3313    flag aSign;
3314    int aExp;
3315    uint32_t aSig;
3316
3317    a = float32_squash_input_denormal(a, status);
3318    aSig = extractFloat32Frac( a );
3319    aExp = extractFloat32Exp( a );
3320    aSign = extractFloat32Sign( a );
3321    if ( aExp == 0xFF ) {
3322        if (aSig) {
3323            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3324        }
3325        return packFloatx80(aSign,
3326                            floatx80_infinity_high,
3327                            floatx80_infinity_low);
3328    }
3329    if ( aExp == 0 ) {
3330        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3331        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3332    }
3333    aSig |= 0x00800000;
3334    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3335
3336}
3337
3338/*----------------------------------------------------------------------------
3339| Returns the result of converting the single-precision floating-point value
3340| `a' to the double-precision floating-point format.  The conversion is
3341| performed according to the IEC/IEEE Standard for Binary Floating-Point
3342| Arithmetic.
3343*----------------------------------------------------------------------------*/
3344
3345float128 float32_to_float128(float32 a, float_status *status)
3346{
3347    flag aSign;
3348    int aExp;
3349    uint32_t aSig;
3350
3351    a = float32_squash_input_denormal(a, status);
3352    aSig = extractFloat32Frac( a );
3353    aExp = extractFloat32Exp( a );
3354    aSign = extractFloat32Sign( a );
3355    if ( aExp == 0xFF ) {
3356        if (aSig) {
3357            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3358        }
3359        return packFloat128( aSign, 0x7FFF, 0, 0 );
3360    }
3361    if ( aExp == 0 ) {
3362        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3363        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3364        --aExp;
3365    }
3366    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3367
3368}
3369
3370/*----------------------------------------------------------------------------
3371| Returns the remainder of the single-precision floating-point value `a'
3372| with respect to the corresponding value `b'.  The operation is performed
3373| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3374*----------------------------------------------------------------------------*/
3375
3376float32 float32_rem(float32 a, float32 b, float_status *status)
3377{
3378    flag aSign, zSign;
3379    int aExp, bExp, expDiff;
3380    uint32_t aSig, bSig;
3381    uint32_t q;
3382    uint64_t aSig64, bSig64, q64;
3383    uint32_t alternateASig;
3384    int32_t sigMean;
3385    a = float32_squash_input_denormal(a, status);
3386    b = float32_squash_input_denormal(b, status);
3387
3388    aSig = extractFloat32Frac( a );
3389    aExp = extractFloat32Exp( a );
3390    aSign = extractFloat32Sign( a );
3391    bSig = extractFloat32Frac( b );
3392    bExp = extractFloat32Exp( b );
3393    if ( aExp == 0xFF ) {
3394        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3395            return propagateFloat32NaN(a, b, status);
3396        }
3397        float_raise(float_flag_invalid, status);
3398        return float32_default_nan(status);
3399    }
3400    if ( bExp == 0xFF ) {
3401        if (bSig) {
3402            return propagateFloat32NaN(a, b, status);
3403        }
3404        return a;
3405    }
3406    if ( bExp == 0 ) {
3407        if ( bSig == 0 ) {
3408            float_raise(float_flag_invalid, status);
3409            return float32_default_nan(status);
3410        }
3411        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3412    }
3413    if ( aExp == 0 ) {
3414        if ( aSig == 0 ) return a;
3415        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3416    }
3417    expDiff = aExp - bExp;
3418    aSig |= 0x00800000;
3419    bSig |= 0x00800000;
3420    if ( expDiff < 32 ) {
3421        aSig <<= 8;
3422        bSig <<= 8;
3423        if ( expDiff < 0 ) {
3424            if ( expDiff < -1 ) return a;
3425            aSig >>= 1;
3426        }
3427        q = ( bSig <= aSig );
3428        if ( q ) aSig -= bSig;
3429        if ( 0 < expDiff ) {
3430            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3431            q >>= 32 - expDiff;
3432            bSig >>= 2;
3433            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3434        }
3435        else {
3436            aSig >>= 2;
3437            bSig >>= 2;
3438        }
3439    }
3440    else {
3441        if ( bSig <= aSig ) aSig -= bSig;
3442        aSig64 = ( (uint64_t) aSig )<<40;
3443        bSig64 = ( (uint64_t) bSig )<<40;
3444        expDiff -= 64;
3445        while ( 0 < expDiff ) {
3446            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3447            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3448            aSig64 = - ( ( bSig * q64 )<<38 );
3449            expDiff -= 62;
3450        }
3451        expDiff += 64;
3452        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3453        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3454        q = q64>>( 64 - expDiff );
3455        bSig <<= 6;
3456        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3457    }
3458    do {
3459        alternateASig = aSig;
3460        ++q;
3461        aSig -= bSig;
3462    } while ( 0 <= (int32_t) aSig );
3463    sigMean = aSig + alternateASig;
3464    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3465        aSig = alternateASig;
3466    }
3467    zSign = ( (int32_t) aSig < 0 );
3468    if ( zSign ) aSig = - aSig;
3469    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3470}
3471
3472
3473
3474/*----------------------------------------------------------------------------
3475| Returns the binary exponential of the single-precision floating-point value
3476| `a'. The operation is performed according to the IEC/IEEE Standard for
3477| Binary Floating-Point Arithmetic.
3478|
3479| Uses the following identities:
3480|
3481| 1. -------------------------------------------------------------------------
3482|      x    x*ln(2)
3483|     2  = e
3484|
3485| 2. -------------------------------------------------------------------------
3486|                      2     3     4     5           n
3487|      x        x     x     x     x     x           x
3488|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3489|               1!    2!    3!    4!    5!          n!
3490*----------------------------------------------------------------------------*/
3491
3492static const float64 float32_exp2_coefficients[15] =
3493{
3494    const_float64( 0x3ff0000000000000ll ), /*  1 */
3495    const_float64( 0x3fe0000000000000ll ), /*  2 */
3496    const_float64( 0x3fc5555555555555ll ), /*  3 */
3497    const_float64( 0x3fa5555555555555ll ), /*  4 */
3498    const_float64( 0x3f81111111111111ll ), /*  5 */
3499    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3500    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3501    const_float64( 0x3efa01a01a01a01all ), /*  8 */
3502    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3503    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3504    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3505    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3506    const_float64( 0x3de6124613a86d09ll ), /* 13 */
3507    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3508    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3509};
3510
3511float32 float32_exp2(float32 a, float_status *status)
3512{
3513    flag aSign;
3514    int aExp;
3515    uint32_t aSig;
3516    float64 r, x, xn;
3517    int i;
3518    a = float32_squash_input_denormal(a, status);
3519
3520    aSig = extractFloat32Frac( a );
3521    aExp = extractFloat32Exp( a );
3522    aSign = extractFloat32Sign( a );
3523
3524    if ( aExp == 0xFF) {
3525        if (aSig) {
3526            return propagateFloat32NaN(a, float32_zero, status);
3527        }
3528        return (aSign) ? float32_zero : a;
3529    }
3530    if (aExp == 0) {
3531        if (aSig == 0) return float32_one;
3532    }
3533
3534    float_raise(float_flag_inexact, status);
3535
3536    /* ******************************* */
3537    /* using float64 for approximation */
3538    /* ******************************* */
3539    x = float32_to_float64(a, status);
3540    x = float64_mul(x, float64_ln2, status);
3541
3542    xn = x;
3543    r = float64_one;
3544    for (i = 0 ; i < 15 ; i++) {
3545        float64 f;
3546
3547        f = float64_mul(xn, float32_exp2_coefficients[i], status);
3548        r = float64_add(r, f, status);
3549
3550        xn = float64_mul(xn, x, status);
3551    }
3552
3553    return float64_to_float32(r, status);
3554}
3555
3556/*----------------------------------------------------------------------------
3557| Returns the binary log of the single-precision floating-point value `a'.
3558| The operation is performed according to the IEC/IEEE Standard for Binary
3559| Floating-Point Arithmetic.
3560*----------------------------------------------------------------------------*/
3561float32 float32_log2(float32 a, float_status *status)
3562{
3563    flag aSign, zSign;
3564    int aExp;
3565    uint32_t aSig, zSig, i;
3566
3567    a = float32_squash_input_denormal(a, status);
3568    aSig = extractFloat32Frac( a );
3569    aExp = extractFloat32Exp( a );
3570    aSign = extractFloat32Sign( a );
3571
3572    if ( aExp == 0 ) {
3573        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3574        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3575    }
3576    if ( aSign ) {
3577        float_raise(float_flag_invalid, status);
3578        return float32_default_nan(status);
3579    }
3580    if ( aExp == 0xFF ) {
3581        if (aSig) {
3582            return propagateFloat32NaN(a, float32_zero, status);
3583        }
3584        return a;
3585    }
3586
3587    aExp -= 0x7F;
3588    aSig |= 0x00800000;
3589    zSign = aExp < 0;
3590    zSig = aExp << 23;
3591
3592    for (i = 1 << 22; i > 0; i >>= 1) {
3593        aSig = ( (uint64_t)aSig * aSig ) >> 23;
3594        if ( aSig & 0x01000000 ) {
3595            aSig >>= 1;
3596            zSig |= i;
3597        }
3598    }
3599
3600    if ( zSign )
3601        zSig = -zSig;
3602
3603    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3604}
3605
3606/*----------------------------------------------------------------------------
3607| Returns 1 if the single-precision floating-point value `a' is equal to
3608| the corresponding value `b', and 0 otherwise.  The invalid exception is
3609| raised if either operand is a NaN.  Otherwise, the comparison is performed
3610| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3611*----------------------------------------------------------------------------*/
3612
3613int float32_eq(float32 a, float32 b, float_status *status)
3614{
3615    uint32_t av, bv;
3616    a = float32_squash_input_denormal(a, status);
3617    b = float32_squash_input_denormal(b, status);
3618
3619    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3620         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3621       ) {
3622        float_raise(float_flag_invalid, status);
3623        return 0;
3624    }
3625    av = float32_val(a);
3626    bv = float32_val(b);
3627    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3628}
3629
3630/*----------------------------------------------------------------------------
3631| Returns 1 if the single-precision floating-point value `a' is less than
3632| or equal to the corresponding value `b', and 0 otherwise.  The invalid
3633| exception is raised if either operand is a NaN.  The comparison is performed
3634| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3635*----------------------------------------------------------------------------*/
3636
3637int float32_le(float32 a, float32 b, float_status *status)
3638{
3639    flag aSign, bSign;
3640    uint32_t av, bv;
3641    a = float32_squash_input_denormal(a, status);
3642    b = float32_squash_input_denormal(b, status);
3643
3644    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3645         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3646       ) {
3647        float_raise(float_flag_invalid, status);
3648        return 0;
3649    }
3650    aSign = extractFloat32Sign( a );
3651    bSign = extractFloat32Sign( b );
3652    av = float32_val(a);
3653    bv = float32_val(b);
3654    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3655    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3656
3657}
3658
3659/*----------------------------------------------------------------------------
3660| Returns 1 if the single-precision floating-point value `a' is less than
3661| the corresponding value `b', and 0 otherwise.  The invalid exception is
3662| raised if either operand is a NaN.  The comparison is performed according
3663| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3664*----------------------------------------------------------------------------*/
3665
3666int float32_lt(float32 a, float32 b, float_status *status)
3667{
3668    flag aSign, bSign;
3669    uint32_t av, bv;
3670    a = float32_squash_input_denormal(a, status);
3671    b = float32_squash_input_denormal(b, status);
3672
3673    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3674         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3675       ) {
3676        float_raise(float_flag_invalid, status);
3677        return 0;
3678    }
3679    aSign = extractFloat32Sign( a );
3680    bSign = extractFloat32Sign( b );
3681    av = float32_val(a);
3682    bv = float32_val(b);
3683    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3684    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3685
3686}
3687
3688/*----------------------------------------------------------------------------
3689| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3690| be compared, and 0 otherwise.  The invalid exception is raised if either
3691| operand is a NaN.  The comparison is performed according to the IEC/IEEE
3692| Standard for Binary Floating-Point Arithmetic.
3693*----------------------------------------------------------------------------*/
3694
3695int float32_unordered(float32 a, float32 b, float_status *status)
3696{
3697    a = float32_squash_input_denormal(a, status);
3698    b = float32_squash_input_denormal(b, status);
3699
3700    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3701         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3702       ) {
3703        float_raise(float_flag_invalid, status);
3704        return 1;
3705    }
3706    return 0;
3707}
3708
3709/*----------------------------------------------------------------------------
3710| Returns 1 if the single-precision floating-point value `a' is equal to
3711| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3712| exception.  The comparison is performed according to the IEC/IEEE Standard
3713| for Binary Floating-Point Arithmetic.
3714*----------------------------------------------------------------------------*/
3715
3716int float32_eq_quiet(float32 a, float32 b, float_status *status)
3717{
3718    a = float32_squash_input_denormal(a, status);
3719    b = float32_squash_input_denormal(b, status);
3720
3721    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3722         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3723       ) {
3724        if (float32_is_signaling_nan(a, status)
3725         || float32_is_signaling_nan(b, status)) {
3726            float_raise(float_flag_invalid, status);
3727        }
3728        return 0;
3729    }
3730    return ( float32_val(a) == float32_val(b) ) ||
3731            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3732}
3733
3734/*----------------------------------------------------------------------------
3735| Returns 1 if the single-precision floating-point value `a' is less than or
3736| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3737| cause an exception.  Otherwise, the comparison is performed according to the
3738| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739*----------------------------------------------------------------------------*/
3740
3741int float32_le_quiet(float32 a, float32 b, float_status *status)
3742{
3743    flag aSign, bSign;
3744    uint32_t av, bv;
3745    a = float32_squash_input_denormal(a, status);
3746    b = float32_squash_input_denormal(b, status);
3747
3748    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3749         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3750       ) {
3751        if (float32_is_signaling_nan(a, status)
3752         || float32_is_signaling_nan(b, status)) {
3753            float_raise(float_flag_invalid, status);
3754        }
3755        return 0;
3756    }
3757    aSign = extractFloat32Sign( a );
3758    bSign = extractFloat32Sign( b );
3759    av = float32_val(a);
3760    bv = float32_val(b);
3761    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3762    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3763
3764}
3765
3766/*----------------------------------------------------------------------------
3767| Returns 1 if the single-precision floating-point value `a' is less than
3768| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3769| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3770| Standard for Binary Floating-Point Arithmetic.
3771*----------------------------------------------------------------------------*/
3772
3773int float32_lt_quiet(float32 a, float32 b, float_status *status)
3774{
3775    flag aSign, bSign;
3776    uint32_t av, bv;
3777    a = float32_squash_input_denormal(a, status);
3778    b = float32_squash_input_denormal(b, status);
3779
3780    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3781         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3782       ) {
3783        if (float32_is_signaling_nan(a, status)
3784         || float32_is_signaling_nan(b, status)) {
3785            float_raise(float_flag_invalid, status);
3786        }
3787        return 0;
3788    }
3789    aSign = extractFloat32Sign( a );
3790    bSign = extractFloat32Sign( b );
3791    av = float32_val(a);
3792    bv = float32_val(b);
3793    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3794    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3795
3796}
3797
3798/*----------------------------------------------------------------------------
3799| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3800| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3801| comparison is performed according to the IEC/IEEE Standard for Binary
3802| Floating-Point Arithmetic.
3803*----------------------------------------------------------------------------*/
3804
3805int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3806{
3807    a = float32_squash_input_denormal(a, status);
3808    b = float32_squash_input_denormal(b, status);
3809
3810    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3811         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3812       ) {
3813        if (float32_is_signaling_nan(a, status)
3814         || float32_is_signaling_nan(b, status)) {
3815            float_raise(float_flag_invalid, status);
3816        }
3817        return 1;
3818    }
3819    return 0;
3820}
3821
3822/*----------------------------------------------------------------------------
3823| If `a' is denormal and we are in flush-to-zero mode then set the
3824| input-denormal exception and return zero. Otherwise just return the value.
3825*----------------------------------------------------------------------------*/
3826float16 float16_squash_input_denormal(float16 a, float_status *status)
3827{
3828    if (status->flush_inputs_to_zero) {
3829        if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3830            float_raise(float_flag_input_denormal, status);
3831            return make_float16(float16_val(a) & 0x8000);
3832        }
3833    }
3834    return a;
3835}
3836
3837/*----------------------------------------------------------------------------
3838| Returns the result of converting the double-precision floating-point value
3839| `a' to the extended double-precision floating-point format.  The conversion
3840| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3841| Arithmetic.
3842*----------------------------------------------------------------------------*/
3843
3844floatx80 float64_to_floatx80(float64 a, float_status *status)
3845{
3846    flag aSign;
3847    int aExp;
3848    uint64_t aSig;
3849
3850    a = float64_squash_input_denormal(a, status);
3851    aSig = extractFloat64Frac( a );
3852    aExp = extractFloat64Exp( a );
3853    aSign = extractFloat64Sign( a );
3854    if ( aExp == 0x7FF ) {
3855        if (aSig) {
3856            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3857        }
3858        return packFloatx80(aSign,
3859                            floatx80_infinity_high,
3860                            floatx80_infinity_low);
3861    }
3862    if ( aExp == 0 ) {
3863        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3864        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3865    }
3866    return
3867        packFloatx80(
3868            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3869
3870}
3871
3872/*----------------------------------------------------------------------------
3873| Returns the result of converting the double-precision floating-point value
3874| `a' to the quadruple-precision floating-point format.  The conversion is
3875| performed according to the IEC/IEEE Standard for Binary Floating-Point
3876| Arithmetic.
3877*----------------------------------------------------------------------------*/
3878
3879float128 float64_to_float128(float64 a, float_status *status)
3880{
3881    flag aSign;
3882    int aExp;
3883    uint64_t aSig, zSig0, zSig1;
3884
3885    a = float64_squash_input_denormal(a, status);
3886    aSig = extractFloat64Frac( a );
3887    aExp = extractFloat64Exp( a );
3888    aSign = extractFloat64Sign( a );
3889    if ( aExp == 0x7FF ) {
3890        if (aSig) {
3891            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3892        }
3893        return packFloat128( aSign, 0x7FFF, 0, 0 );
3894    }
3895    if ( aExp == 0 ) {
3896        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3897        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3898        --aExp;
3899    }
3900    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3901    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3902
3903}
3904
3905
3906/*----------------------------------------------------------------------------
3907| Returns the remainder of the double-precision floating-point value `a'
3908| with respect to the corresponding value `b'.  The operation is performed
3909| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3910*----------------------------------------------------------------------------*/
3911
3912float64 float64_rem(float64 a, float64 b, float_status *status)
3913{
3914    flag aSign, zSign;
3915    int aExp, bExp, expDiff;
3916    uint64_t aSig, bSig;
3917    uint64_t q, alternateASig;
3918    int64_t sigMean;
3919
3920    a = float64_squash_input_denormal(a, status);
3921    b = float64_squash_input_denormal(b, status);
3922    aSig = extractFloat64Frac( a );
3923    aExp = extractFloat64Exp( a );
3924    aSign = extractFloat64Sign( a );
3925    bSig = extractFloat64Frac( b );
3926    bExp = extractFloat64Exp( b );
3927    if ( aExp == 0x7FF ) {
3928        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3929            return propagateFloat64NaN(a, b, status);
3930        }
3931        float_raise(float_flag_invalid, status);
3932        return float64_default_nan(status);
3933    }
3934    if ( bExp == 0x7FF ) {
3935        if (bSig) {
3936            return propagateFloat64NaN(a, b, status);
3937        }
3938        return a;
3939    }
3940    if ( bExp == 0 ) {
3941        if ( bSig == 0 ) {
3942            float_raise(float_flag_invalid, status);
3943            return float64_default_nan(status);
3944        }
3945        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3946    }
3947    if ( aExp == 0 ) {
3948        if ( aSig == 0 ) return a;
3949        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3950    }
3951    expDiff = aExp - bExp;
3952    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3953    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3954    if ( expDiff < 0 ) {
3955        if ( expDiff < -1 ) return a;
3956        aSig >>= 1;
3957    }
3958    q = ( bSig <= aSig );
3959    if ( q ) aSig -= bSig;
3960    expDiff -= 64;
3961    while ( 0 < expDiff ) {
3962        q = estimateDiv128To64( aSig, 0, bSig );
3963        q = ( 2 < q ) ? q - 2 : 0;
3964        aSig = - ( ( bSig>>2 ) * q );
3965        expDiff -= 62;
3966    }
3967    expDiff += 64;
3968    if ( 0 < expDiff ) {
3969        q = estimateDiv128To64( aSig, 0, bSig );
3970        q = ( 2 < q ) ? q - 2 : 0;
3971        q >>= 64 - expDiff;
3972        bSig >>= 2;
3973        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3974    }
3975    else {
3976        aSig >>= 2;
3977        bSig >>= 2;
3978    }
3979    do {
3980        alternateASig = aSig;
3981        ++q;
3982        aSig -= bSig;
3983    } while ( 0 <= (int64_t) aSig );
3984    sigMean = aSig + alternateASig;
3985    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3986        aSig = alternateASig;
3987    }
3988    zSign = ( (int64_t) aSig < 0 );
3989    if ( zSign ) aSig = - aSig;
3990    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
3991
3992}
3993
3994/*----------------------------------------------------------------------------
3995| Returns the binary log of the double-precision floating-point value `a'.
3996| The operation is performed according to the IEC/IEEE Standard for Binary
3997| Floating-Point Arithmetic.
3998*----------------------------------------------------------------------------*/
3999float64 float64_log2(float64 a, float_status *status)
4000{
4001    flag aSign, zSign;
4002    int aExp;
4003    uint64_t aSig, aSig0, aSig1, zSig, i;
4004    a = float64_squash_input_denormal(a, status);
4005
4006    aSig = extractFloat64Frac( a );
4007    aExp = extractFloat64Exp( a );
4008    aSign = extractFloat64Sign( a );
4009
4010    if ( aExp == 0 ) {
4011        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4012        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4013    }
4014    if ( aSign ) {
4015        float_raise(float_flag_invalid, status);
4016        return float64_default_nan(status);
4017    }
4018    if ( aExp == 0x7FF ) {
4019        if (aSig) {
4020            return propagateFloat64NaN(a, float64_zero, status);
4021        }
4022        return a;
4023    }
4024
4025    aExp -= 0x3FF;
4026    aSig |= LIT64( 0x0010000000000000 );
4027    zSign = aExp < 0;
4028    zSig = (uint64_t)aExp << 52;
4029    for (i = 1LL << 51; i > 0; i >>= 1) {
4030        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4031        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4032        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4033            aSig >>= 1;
4034            zSig |= i;
4035        }
4036    }
4037
4038    if ( zSign )
4039        zSig = -zSig;
4040    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4041}
4042
4043/*----------------------------------------------------------------------------
4044| Returns 1 if the double-precision floating-point value `a' is equal to the
4045| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4046| if either operand is a NaN.  Otherwise, the comparison is performed
4047| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4048*----------------------------------------------------------------------------*/
4049
4050int float64_eq(float64 a, float64 b, float_status *status)
4051{
4052    uint64_t av, bv;
4053    a = float64_squash_input_denormal(a, status);
4054    b = float64_squash_input_denormal(b, status);
4055
4056    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4057         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4058       ) {
4059        float_raise(float_flag_invalid, status);
4060        return 0;
4061    }
4062    av = float64_val(a);
4063    bv = float64_val(b);
4064    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4065
4066}
4067
4068/*----------------------------------------------------------------------------
4069| Returns 1 if the double-precision floating-point value `a' is less than or
4070| equal to the corresponding value `b', and 0 otherwise.  The invalid
4071| exception is raised if either operand is a NaN.  The comparison is performed
4072| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073*----------------------------------------------------------------------------*/
4074
4075int float64_le(float64 a, float64 b, float_status *status)
4076{
4077    flag aSign, bSign;
4078    uint64_t av, bv;
4079    a = float64_squash_input_denormal(a, status);
4080    b = float64_squash_input_denormal(b, status);
4081
4082    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4084       ) {
4085        float_raise(float_flag_invalid, status);
4086        return 0;
4087    }
4088    aSign = extractFloat64Sign( a );
4089    bSign = extractFloat64Sign( b );
4090    av = float64_val(a);
4091    bv = float64_val(b);
4092    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4093    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4094
4095}
4096
4097/*----------------------------------------------------------------------------
4098| Returns 1 if the double-precision floating-point value `a' is less than
4099| the corresponding value `b', and 0 otherwise.  The invalid exception is
4100| raised if either operand is a NaN.  The comparison is performed according
4101| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4102*----------------------------------------------------------------------------*/
4103
4104int float64_lt(float64 a, float64 b, float_status *status)
4105{
4106    flag aSign, bSign;
4107    uint64_t av, bv;
4108
4109    a = float64_squash_input_denormal(a, status);
4110    b = float64_squash_input_denormal(b, status);
4111    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4112         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4113       ) {
4114        float_raise(float_flag_invalid, status);
4115        return 0;
4116    }
4117    aSign = extractFloat64Sign( a );
4118    bSign = extractFloat64Sign( b );
4119    av = float64_val(a);
4120    bv = float64_val(b);
4121    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4122    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4123
4124}
4125
4126/*----------------------------------------------------------------------------
4127| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4128| be compared, and 0 otherwise.  The invalid exception is raised if either
4129| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4130| Standard for Binary Floating-Point Arithmetic.
4131*----------------------------------------------------------------------------*/
4132
4133int float64_unordered(float64 a, float64 b, float_status *status)
4134{
4135    a = float64_squash_input_denormal(a, status);
4136    b = float64_squash_input_denormal(b, status);
4137
4138    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4140       ) {
4141        float_raise(float_flag_invalid, status);
4142        return 1;
4143    }
4144    return 0;
4145}
4146
4147/*----------------------------------------------------------------------------
4148| Returns 1 if the double-precision floating-point value `a' is equal to the
4149| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4150| exception.The comparison is performed according to the IEC/IEEE Standard
4151| for Binary Floating-Point Arithmetic.
4152*----------------------------------------------------------------------------*/
4153
4154int float64_eq_quiet(float64 a, float64 b, float_status *status)
4155{
4156    uint64_t av, bv;
4157    a = float64_squash_input_denormal(a, status);
4158    b = float64_squash_input_denormal(b, status);
4159
4160    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4161         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4162       ) {
4163        if (float64_is_signaling_nan(a, status)
4164         || float64_is_signaling_nan(b, status)) {
4165            float_raise(float_flag_invalid, status);
4166        }
4167        return 0;
4168    }
4169    av = float64_val(a);
4170    bv = float64_val(b);
4171    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4172
4173}
4174
4175/*----------------------------------------------------------------------------
4176| Returns 1 if the double-precision floating-point value `a' is less than or
4177| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4178| cause an exception.  Otherwise, the comparison is performed according to the
4179| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4180*----------------------------------------------------------------------------*/
4181
4182int float64_le_quiet(float64 a, float64 b, float_status *status)
4183{
4184    flag aSign, bSign;
4185    uint64_t av, bv;
4186    a = float64_squash_input_denormal(a, status);
4187    b = float64_squash_input_denormal(b, status);
4188
4189    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4190         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4191       ) {
4192        if (float64_is_signaling_nan(a, status)
4193         || float64_is_signaling_nan(b, status)) {
4194            float_raise(float_flag_invalid, status);
4195        }
4196        return 0;
4197    }
4198    aSign = extractFloat64Sign( a );
4199    bSign = extractFloat64Sign( b );
4200    av = float64_val(a);
4201    bv = float64_val(b);
4202    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4203    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4204
4205}
4206
4207/*----------------------------------------------------------------------------
4208| Returns 1 if the double-precision floating-point value `a' is less than
4209| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4210| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4211| Standard for Binary Floating-Point Arithmetic.
4212*----------------------------------------------------------------------------*/
4213
4214int float64_lt_quiet(float64 a, float64 b, float_status *status)
4215{
4216    flag aSign, bSign;
4217    uint64_t av, bv;
4218    a = float64_squash_input_denormal(a, status);
4219    b = float64_squash_input_denormal(b, status);
4220
4221    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4222         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4223       ) {
4224        if (float64_is_signaling_nan(a, status)
4225         || float64_is_signaling_nan(b, status)) {
4226            float_raise(float_flag_invalid, status);
4227        }
4228        return 0;
4229    }
4230    aSign = extractFloat64Sign( a );
4231    bSign = extractFloat64Sign( b );
4232    av = float64_val(a);
4233    bv = float64_val(b);
4234    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4235    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4236
4237}
4238
4239/*----------------------------------------------------------------------------
4240| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4241| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4242| comparison is performed according to the IEC/IEEE Standard for Binary
4243| Floating-Point Arithmetic.
4244*----------------------------------------------------------------------------*/
4245
4246int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4247{
4248    a = float64_squash_input_denormal(a, status);
4249    b = float64_squash_input_denormal(b, status);
4250
4251    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4252         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4253       ) {
4254        if (float64_is_signaling_nan(a, status)
4255         || float64_is_signaling_nan(b, status)) {
4256            float_raise(float_flag_invalid, status);
4257        }
4258        return 1;
4259    }
4260    return 0;
4261}
4262
4263/*----------------------------------------------------------------------------
4264| Returns the result of converting the extended double-precision floating-
4265| point value `a' to the 32-bit two's complement integer format.  The
4266| conversion is performed according to the IEC/IEEE Standard for Binary
4267| Floating-Point Arithmetic---which means in particular that the conversion
4268| is rounded according to the current rounding mode.  If `a' is a NaN, the
4269| largest positive integer is returned.  Otherwise, if the conversion
4270| overflows, the largest integer with the same sign as `a' is returned.
4271*----------------------------------------------------------------------------*/
4272
4273int32_t floatx80_to_int32(floatx80 a, float_status *status)
4274{
4275    flag aSign;
4276    int32_t aExp, shiftCount;
4277    uint64_t aSig;
4278
4279    if (floatx80_invalid_encoding(a)) {
4280        float_raise(float_flag_invalid, status);
4281        return 1 << 31;
4282    }
4283    aSig = extractFloatx80Frac( a );
4284    aExp = extractFloatx80Exp( a );
4285    aSign = extractFloatx80Sign( a );
4286    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4287    shiftCount = 0x4037 - aExp;
4288    if ( shiftCount <= 0 ) shiftCount = 1;
4289    shift64RightJamming( aSig, shiftCount, &aSig );
4290    return roundAndPackInt32(aSign, aSig, status);
4291
4292}
4293
4294/*----------------------------------------------------------------------------
4295| Returns the result of converting the extended double-precision floating-
4296| point value `a' to the 32-bit two's complement integer format.  The
4297| conversion is performed according to the IEC/IEEE Standard for Binary
4298| Floating-Point Arithmetic, except that the conversion is always rounded
4299| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4300| Otherwise, if the conversion overflows, the largest integer with the same
4301| sign as `a' is returned.
4302*----------------------------------------------------------------------------*/
4303
4304int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4305{
4306    flag aSign;
4307    int32_t aExp, shiftCount;
4308    uint64_t aSig, savedASig;
4309    int32_t z;
4310
4311    if (floatx80_invalid_encoding(a)) {
4312        float_raise(float_flag_invalid, status);
4313        return 1 << 31;
4314    }
4315    aSig = extractFloatx80Frac( a );
4316    aExp = extractFloatx80Exp( a );
4317    aSign = extractFloatx80Sign( a );
4318    if ( 0x401E < aExp ) {
4319        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4320        goto invalid;
4321    }
4322    else if ( aExp < 0x3FFF ) {
4323        if (aExp || aSig) {
4324            status->float_exception_flags |= float_flag_inexact;
4325        }
4326        return 0;
4327    }
4328    shiftCount = 0x403E - aExp;
4329    savedASig = aSig;
4330    aSig >>= shiftCount;
4331    z = aSig;
4332    if ( aSign ) z = - z;
4333    if ( ( z < 0 ) ^ aSign ) {
4334 invalid:
4335        float_raise(float_flag_invalid, status);
4336        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4337    }
4338    if ( ( aSig<<shiftCount ) != savedASig ) {
4339        status->float_exception_flags |= float_flag_inexact;
4340    }
4341    return z;
4342
4343}
4344
4345/*----------------------------------------------------------------------------
4346| Returns the result of converting the extended double-precision floating-
4347| point value `a' to the 64-bit two's complement integer format.  The
4348| conversion is performed according to the IEC/IEEE Standard for Binary
4349| Floating-Point Arithmetic---which means in particular that the conversion
4350| is rounded according to the current rounding mode.  If `a' is a NaN,
4351| the largest positive integer is returned.  Otherwise, if the conversion
4352| overflows, the largest integer with the same sign as `a' is returned.
4353*----------------------------------------------------------------------------*/
4354
4355int64_t floatx80_to_int64(floatx80 a, float_status *status)
4356{
4357    flag aSign;
4358    int32_t aExp, shiftCount;
4359    uint64_t aSig, aSigExtra;
4360
4361    if (floatx80_invalid_encoding(a)) {
4362        float_raise(float_flag_invalid, status);
4363        return 1ULL << 63;
4364    }
4365    aSig = extractFloatx80Frac( a );
4366    aExp = extractFloatx80Exp( a );
4367    aSign = extractFloatx80Sign( a );
4368    shiftCount = 0x403E - aExp;
4369    if ( shiftCount <= 0 ) {
4370        if ( shiftCount ) {
4371            float_raise(float_flag_invalid, status);
4372            if (!aSign || floatx80_is_any_nan(a)) {
4373                return LIT64( 0x7FFFFFFFFFFFFFFF );
4374            }
4375            return (int64_t) LIT64( 0x8000000000000000 );
4376        }
4377        aSigExtra = 0;
4378    }
4379    else {
4380        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4381    }
4382    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4383
4384}
4385
4386/*----------------------------------------------------------------------------
4387| Returns the result of converting the extended double-precision floating-
4388| point value `a' to the 64-bit two's complement integer format.  The
4389| conversion is performed according to the IEC/IEEE Standard for Binary
4390| Floating-Point Arithmetic, except that the conversion is always rounded
4391| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4392| Otherwise, if the conversion overflows, the largest integer with the same
4393| sign as `a' is returned.
4394*----------------------------------------------------------------------------*/
4395
4396int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4397{
4398    flag aSign;
4399    int32_t aExp, shiftCount;
4400    uint64_t aSig;
4401    int64_t z;
4402
4403    if (floatx80_invalid_encoding(a)) {
4404        float_raise(float_flag_invalid, status);
4405        return 1ULL << 63;
4406    }
4407    aSig = extractFloatx80Frac( a );
4408    aExp = extractFloatx80Exp( a );
4409    aSign = extractFloatx80Sign( a );
4410    shiftCount = aExp - 0x403E;
4411    if ( 0 <= shiftCount ) {
4412        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4413        if ( ( a.high != 0xC03E ) || aSig ) {
4414            float_raise(float_flag_invalid, status);
4415            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4416                return LIT64( 0x7FFFFFFFFFFFFFFF );
4417            }
4418        }
4419        return (int64_t) LIT64( 0x8000000000000000 );
4420    }
4421    else if ( aExp < 0x3FFF ) {
4422        if (aExp | aSig) {
4423            status->float_exception_flags |= float_flag_inexact;
4424        }
4425        return 0;
4426    }
4427    z = aSig>>( - shiftCount );
4428    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4429        status->float_exception_flags |= float_flag_inexact;
4430    }
4431    if ( aSign ) z = - z;
4432    return z;
4433
4434}
4435
4436/*----------------------------------------------------------------------------
4437| Returns the result of converting the extended double-precision floating-
4438| point value `a' to the single-precision floating-point format.  The
4439| conversion is performed according to the IEC/IEEE Standard for Binary
4440| Floating-Point Arithmetic.
4441*----------------------------------------------------------------------------*/
4442
4443float32 floatx80_to_float32(floatx80 a, float_status *status)
4444{
4445    flag aSign;
4446    int32_t aExp;
4447    uint64_t aSig;
4448
4449    if (floatx80_invalid_encoding(a)) {
4450        float_raise(float_flag_invalid, status);
4451        return float32_default_nan(status);
4452    }
4453    aSig = extractFloatx80Frac( a );
4454    aExp = extractFloatx80Exp( a );
4455    aSign = extractFloatx80Sign( a );
4456    if ( aExp == 0x7FFF ) {
4457        if ( (uint64_t) ( aSig<<1 ) ) {
4458            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4459        }
4460        return packFloat32( aSign, 0xFF, 0 );
4461    }
4462    shift64RightJamming( aSig, 33, &aSig );
4463    if ( aExp || aSig ) aExp -= 0x3F81;
4464    return roundAndPackFloat32(aSign, aExp, aSig, status);
4465
4466}
4467
4468/*----------------------------------------------------------------------------
4469| Returns the result of converting the extended double-precision floating-
4470| point value `a' to the double-precision floating-point format.  The
4471| conversion is performed according to the IEC/IEEE Standard for Binary
4472| Floating-Point Arithmetic.
4473*----------------------------------------------------------------------------*/
4474
4475float64 floatx80_to_float64(floatx80 a, float_status *status)
4476{
4477    flag aSign;
4478    int32_t aExp;
4479    uint64_t aSig, zSig;
4480
4481    if (floatx80_invalid_encoding(a)) {
4482        float_raise(float_flag_invalid, status);
4483        return float64_default_nan(status);
4484    }
4485    aSig = extractFloatx80Frac( a );
4486    aExp = extractFloatx80Exp( a );
4487    aSign = extractFloatx80Sign( a );
4488    if ( aExp == 0x7FFF ) {
4489        if ( (uint64_t) ( aSig<<1 ) ) {
4490            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4491        }
4492        return packFloat64( aSign, 0x7FF, 0 );
4493    }
4494    shift64RightJamming( aSig, 1, &zSig );
4495    if ( aExp || aSig ) aExp -= 0x3C01;
4496    return roundAndPackFloat64(aSign, aExp, zSig, status);
4497
4498}
4499
4500/*----------------------------------------------------------------------------
4501| Returns the result of converting the extended double-precision floating-
4502| point value `a' to the quadruple-precision floating-point format.  The
4503| conversion is performed according to the IEC/IEEE Standard for Binary
4504| Floating-Point Arithmetic.
4505*----------------------------------------------------------------------------*/
4506
4507float128 floatx80_to_float128(floatx80 a, float_status *status)
4508{
4509    flag aSign;
4510    int aExp;
4511    uint64_t aSig, zSig0, zSig1;
4512
4513    if (floatx80_invalid_encoding(a)) {
4514        float_raise(float_flag_invalid, status);
4515        return float128_default_nan(status);
4516    }
4517    aSig = extractFloatx80Frac( a );
4518    aExp = extractFloatx80Exp( a );
4519    aSign = extractFloatx80Sign( a );
4520    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4521        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4522    }
4523    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4524    return packFloat128( aSign, aExp, zSig0, zSig1 );
4525
4526}
4527
4528/*----------------------------------------------------------------------------
4529| Rounds the extended double-precision floating-point value `a'
4530| to the precision provided by floatx80_rounding_precision and returns the
4531| result as an extended double-precision floating-point value.
4532| The operation is performed according to the IEC/IEEE Standard for Binary
4533| Floating-Point Arithmetic.
4534*----------------------------------------------------------------------------*/
4535
4536floatx80 floatx80_round(floatx80 a, float_status *status)
4537{
4538    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4539                                extractFloatx80Sign(a),
4540                                extractFloatx80Exp(a),
4541                                extractFloatx80Frac(a), 0, status);
4542}
4543
4544/*----------------------------------------------------------------------------
4545| Rounds the extended double-precision floating-point value `a' to an integer,
4546| and returns the result as an extended quadruple-precision floating-point
4547| value.  The operation is performed according to the IEC/IEEE Standard for
4548| Binary Floating-Point Arithmetic.
4549*----------------------------------------------------------------------------*/
4550
4551floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4552{
4553    flag aSign;
4554    int32_t aExp;
4555    uint64_t lastBitMask, roundBitsMask;
4556    floatx80 z;
4557
4558    if (floatx80_invalid_encoding(a)) {
4559        float_raise(float_flag_invalid, status);
4560        return floatx80_default_nan(status);
4561    }
4562    aExp = extractFloatx80Exp( a );
4563    if ( 0x403E <= aExp ) {
4564        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4565            return propagateFloatx80NaN(a, a, status);
4566        }
4567        return a;
4568    }
4569    if ( aExp < 0x3FFF ) {
4570        if (    ( aExp == 0 )
4571             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4572            return a;
4573        }
4574        status->float_exception_flags |= float_flag_inexact;
4575        aSign = extractFloatx80Sign( a );
4576        switch (status->float_rounding_mode) {
4577         case float_round_nearest_even:
4578            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4579               ) {
4580                return
4581                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4582            }
4583            break;
4584        case float_round_ties_away:
4585            if (aExp == 0x3FFE) {
4586                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4587            }
4588            break;
4589         case float_round_down:
4590            return
4591                  aSign ?
4592                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4593                : packFloatx80( 0, 0, 0 );
4594         case float_round_up:
4595            return
4596                  aSign ? packFloatx80( 1, 0, 0 )
4597                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4598        }
4599        return packFloatx80( aSign, 0, 0 );
4600    }
4601    lastBitMask = 1;
4602    lastBitMask <<= 0x403E - aExp;
4603    roundBitsMask = lastBitMask - 1;
4604    z = a;
4605    switch (status->float_rounding_mode) {
4606    case float_round_nearest_even:
4607        z.low += lastBitMask>>1;
4608        if ((z.low & roundBitsMask) == 0) {
4609            z.low &= ~lastBitMask;
4610        }
4611        break;
4612    case float_round_ties_away:
4613        z.low += lastBitMask >> 1;
4614        break;
4615    case float_round_to_zero:
4616        break;
4617    case float_round_up:
4618        if (!extractFloatx80Sign(z)) {
4619            z.low += roundBitsMask;
4620        }
4621        break;
4622    case float_round_down:
4623        if (extractFloatx80Sign(z)) {
4624            z.low += roundBitsMask;
4625        }
4626        break;
4627    default:
4628        abort();
4629    }
4630    z.low &= ~ roundBitsMask;
4631    if ( z.low == 0 ) {
4632        ++z.high;
4633        z.low = LIT64( 0x8000000000000000 );
4634    }
4635    if (z.low != a.low) {
4636        status->float_exception_flags |= float_flag_inexact;
4637    }
4638    return z;
4639
4640}
4641
4642/*----------------------------------------------------------------------------
4643| Returns the result of adding the absolute values of the extended double-
4644| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4645| negated before being returned.  `zSign' is ignored if the result is a NaN.
4646| The addition is performed according to the IEC/IEEE Standard for Binary
4647| Floating-Point Arithmetic.
4648*----------------------------------------------------------------------------*/
4649
4650static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4651                                float_status *status)
4652{
4653    int32_t aExp, bExp, zExp;
4654    uint64_t aSig, bSig, zSig0, zSig1;
4655    int32_t expDiff;
4656
4657    aSig = extractFloatx80Frac( a );
4658    aExp = extractFloatx80Exp( a );
4659    bSig = extractFloatx80Frac( b );
4660    bExp = extractFloatx80Exp( b );
4661    expDiff = aExp - bExp;
4662    if ( 0 < expDiff ) {
4663        if ( aExp == 0x7FFF ) {
4664            if ((uint64_t)(aSig << 1)) {
4665                return propagateFloatx80NaN(a, b, status);
4666            }
4667            return a;
4668        }
4669        if ( bExp == 0 ) --expDiff;
4670        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4671        zExp = aExp;
4672    }
4673    else if ( expDiff < 0 ) {
4674        if ( bExp == 0x7FFF ) {
4675            if ((uint64_t)(bSig << 1)) {
4676                return propagateFloatx80NaN(a, b, status);
4677            }
4678            return packFloatx80(zSign,
4679                                floatx80_infinity_high,
4680                                floatx80_infinity_low);
4681        }
4682        if ( aExp == 0 ) ++expDiff;
4683        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4684        zExp = bExp;
4685    }
4686    else {
4687        if ( aExp == 0x7FFF ) {
4688            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4689                return propagateFloatx80NaN(a, b, status);
4690            }
4691            return a;
4692        }
4693        zSig1 = 0;
4694        zSig0 = aSig + bSig;
4695        if ( aExp == 0 ) {
4696            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4697            goto roundAndPack;
4698        }
4699        zExp = aExp;
4700        goto shiftRight1;
4701    }
4702    zSig0 = aSig + bSig;
4703    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4704 shiftRight1:
4705    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4706    zSig0 |= LIT64( 0x8000000000000000 );
4707    ++zExp;
4708 roundAndPack:
4709    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4710                                zSign, zExp, zSig0, zSig1, status);
4711}
4712
4713/*----------------------------------------------------------------------------
4714| Returns the result of subtracting the absolute values of the extended
4715| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4716| difference is negated before being returned.  `zSign' is ignored if the
4717| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4718| Standard for Binary Floating-Point Arithmetic.
4719*----------------------------------------------------------------------------*/
4720
4721static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4722                                float_status *status)
4723{
4724    int32_t aExp, bExp, zExp;
4725    uint64_t aSig, bSig, zSig0, zSig1;
4726    int32_t expDiff;
4727
4728    aSig = extractFloatx80Frac( a );
4729    aExp = extractFloatx80Exp( a );
4730    bSig = extractFloatx80Frac( b );
4731    bExp = extractFloatx80Exp( b );
4732    expDiff = aExp - bExp;
4733    if ( 0 < expDiff ) goto aExpBigger;
4734    if ( expDiff < 0 ) goto bExpBigger;
4735    if ( aExp == 0x7FFF ) {
4736        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4737            return propagateFloatx80NaN(a, b, status);
4738        }
4739        float_raise(float_flag_invalid, status);
4740        return floatx80_default_nan(status);
4741    }
4742    if ( aExp == 0 ) {
4743        aExp = 1;
4744        bExp = 1;
4745    }
4746    zSig1 = 0;
4747    if ( bSig < aSig ) goto aBigger;
4748    if ( aSig < bSig ) goto bBigger;
4749    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4750 bExpBigger:
4751    if ( bExp == 0x7FFF ) {
4752        if ((uint64_t)(bSig << 1)) {
4753            return propagateFloatx80NaN(a, b, status);
4754        }
4755        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4756                            floatx80_infinity_low);
4757    }
4758    if ( aExp == 0 ) ++expDiff;
4759    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4760 bBigger:
4761    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4762    zExp = bExp;
4763    zSign ^= 1;
4764    goto normalizeRoundAndPack;
4765 aExpBigger:
4766    if ( aExp == 0x7FFF ) {
4767        if ((uint64_t)(aSig << 1)) {
4768            return propagateFloatx80NaN(a, b, status);
4769        }
4770        return a;
4771    }
4772    if ( bExp == 0 ) --expDiff;
4773    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4774 aBigger:
4775    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4776    zExp = aExp;
4777 normalizeRoundAndPack:
4778    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4779                                         zSign, zExp, zSig0, zSig1, status);
4780}
4781
4782/*----------------------------------------------------------------------------
4783| Returns the result of adding the extended double-precision floating-point
4784| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4785| Standard for Binary Floating-Point Arithmetic.
4786*----------------------------------------------------------------------------*/
4787
4788floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4789{
4790    flag aSign, bSign;
4791
4792    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4793        float_raise(float_flag_invalid, status);
4794        return floatx80_default_nan(status);
4795    }
4796    aSign = extractFloatx80Sign( a );
4797    bSign = extractFloatx80Sign( b );
4798    if ( aSign == bSign ) {
4799        return addFloatx80Sigs(a, b, aSign, status);
4800    }
4801    else {
4802        return subFloatx80Sigs(a, b, aSign, status);
4803    }
4804
4805}
4806
4807/*----------------------------------------------------------------------------
4808| Returns the result of subtracting the extended double-precision floating-
4809| point values `a' and `b'.  The operation is performed according to the
4810| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4811*----------------------------------------------------------------------------*/
4812
4813floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4814{
4815    flag aSign, bSign;
4816
4817    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4818        float_raise(float_flag_invalid, status);
4819        return floatx80_default_nan(status);
4820    }
4821    aSign = extractFloatx80Sign( a );
4822    bSign = extractFloatx80Sign( b );
4823    if ( aSign == bSign ) {
4824        return subFloatx80Sigs(a, b, aSign, status);
4825    }
4826    else {
4827        return addFloatx80Sigs(a, b, aSign, status);
4828    }
4829
4830}
4831
4832/*----------------------------------------------------------------------------
4833| Returns the result of multiplying the extended double-precision floating-
4834| point values `a' and `b'.  The operation is performed according to the
4835| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4836*----------------------------------------------------------------------------*/
4837
4838floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4839{
4840    flag aSign, bSign, zSign;
4841    int32_t aExp, bExp, zExp;
4842    uint64_t aSig, bSig, zSig0, zSig1;
4843
4844    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4845        float_raise(float_flag_invalid, status);
4846        return floatx80_default_nan(status);
4847    }
4848    aSig = extractFloatx80Frac( a );
4849    aExp = extractFloatx80Exp( a );
4850    aSign = extractFloatx80Sign( a );
4851    bSig = extractFloatx80Frac( b );
4852    bExp = extractFloatx80Exp( b );
4853    bSign = extractFloatx80Sign( b );
4854    zSign = aSign ^ bSign;
4855    if ( aExp == 0x7FFF ) {
4856        if (    (uint64_t) ( aSig<<1 )
4857             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4858            return propagateFloatx80NaN(a, b, status);
4859        }
4860        if ( ( bExp | bSig ) == 0 ) goto invalid;
4861        return packFloatx80(zSign, floatx80_infinity_high,
4862                                   floatx80_infinity_low);
4863    }
4864    if ( bExp == 0x7FFF ) {
4865        if ((uint64_t)(bSig << 1)) {
4866            return propagateFloatx80NaN(a, b, status);
4867        }
4868        if ( ( aExp | aSig ) == 0 ) {
4869 invalid:
4870            float_raise(float_flag_invalid, status);
4871            return floatx80_default_nan(status);
4872        }
4873        return packFloatx80(zSign, floatx80_infinity_high,
4874                                   floatx80_infinity_low);
4875    }
4876    if ( aExp == 0 ) {
4877        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4878        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4879    }
4880    if ( bExp == 0 ) {
4881        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4882        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4883    }
4884    zExp = aExp + bExp - 0x3FFE;
4885    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4886    if ( 0 < (int64_t) zSig0 ) {
4887        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4888        --zExp;
4889    }
4890    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4891                                zSign, zExp, zSig0, zSig1, status);
4892}
4893
4894/*----------------------------------------------------------------------------
4895| Returns the result of dividing the extended double-precision floating-point
4896| value `a' by the corresponding value `b'.  The operation is performed
4897| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4898*----------------------------------------------------------------------------*/
4899
4900floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4901{
4902    flag aSign, bSign, zSign;
4903    int32_t aExp, bExp, zExp;
4904    uint64_t aSig, bSig, zSig0, zSig1;
4905    uint64_t rem0, rem1, rem2, term0, term1, term2;
4906
4907    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4908        float_raise(float_flag_invalid, status);
4909        return floatx80_default_nan(status);
4910    }
4911    aSig = extractFloatx80Frac( a );
4912    aExp = extractFloatx80Exp( a );
4913    aSign = extractFloatx80Sign( a );
4914    bSig = extractFloatx80Frac( b );
4915    bExp = extractFloatx80Exp( b );
4916    bSign = extractFloatx80Sign( b );
4917    zSign = aSign ^ bSign;
4918    if ( aExp == 0x7FFF ) {
4919        if ((uint64_t)(aSig << 1)) {
4920            return propagateFloatx80NaN(a, b, status);
4921        }
4922        if ( bExp == 0x7FFF ) {
4923            if ((uint64_t)(bSig << 1)) {
4924                return propagateFloatx80NaN(a, b, status);
4925            }
4926            goto invalid;
4927        }
4928        return packFloatx80(zSign, floatx80_infinity_high,
4929                                   floatx80_infinity_low);
4930    }
4931    if ( bExp == 0x7FFF ) {
4932        if ((uint64_t)(bSig << 1)) {
4933            return propagateFloatx80NaN(a, b, status);
4934        }
4935        return packFloatx80( zSign, 0, 0 );
4936    }
4937    if ( bExp == 0 ) {
4938        if ( bSig == 0 ) {
4939            if ( ( aExp | aSig ) == 0 ) {
4940 invalid:
4941                float_raise(float_flag_invalid, status);
4942                return floatx80_default_nan(status);
4943            }
4944            float_raise(float_flag_divbyzero, status);
4945            return packFloatx80(zSign, floatx80_infinity_high,
4946                                       floatx80_infinity_low);
4947        }
4948        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4949    }
4950    if ( aExp == 0 ) {
4951        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4952        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4953    }
4954    zExp = aExp - bExp + 0x3FFE;
4955    rem1 = 0;
4956    if ( bSig <= aSig ) {
4957        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4958        ++zExp;
4959    }
4960    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4961    mul64To128( bSig, zSig0, &term0, &term1 );
4962    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4963    while ( (int64_t) rem0 < 0 ) {
4964        --zSig0;
4965        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4966    }
4967    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4968    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4969        mul64To128( bSig, zSig1, &term1, &term2 );
4970        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4971        while ( (int64_t) rem1 < 0 ) {
4972            --zSig1;
4973            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4974        }
4975        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4976    }
4977    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4978                                zSign, zExp, zSig0, zSig1, status);
4979}
4980
4981/*----------------------------------------------------------------------------
4982| Returns the remainder of the extended double-precision floating-point value
4983| `a' with respect to the corresponding value `b'.  The operation is performed
4984| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4985*----------------------------------------------------------------------------*/
4986
4987floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
4988{
4989    flag aSign, zSign;
4990    int32_t aExp, bExp, expDiff;
4991    uint64_t aSig0, aSig1, bSig;
4992    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4993
4994    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4995        float_raise(float_flag_invalid, status);
4996        return floatx80_default_nan(status);
4997    }
4998    aSig0 = extractFloatx80Frac( a );
4999    aExp = extractFloatx80Exp( a );
5000    aSign = extractFloatx80Sign( a );
5001    bSig = extractFloatx80Frac( b );
5002    bExp = extractFloatx80Exp( b );
5003    if ( aExp == 0x7FFF ) {
5004        if (    (uint64_t) ( aSig0<<1 )
5005             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5006            return propagateFloatx80NaN(a, b, status);
5007        }
5008        goto invalid;
5009    }
5010    if ( bExp == 0x7FFF ) {
5011        if ((uint64_t)(bSig << 1)) {
5012            return propagateFloatx80NaN(a, b, status);
5013        }
5014        return a;
5015    }
5016    if ( bExp == 0 ) {
5017        if ( bSig == 0 ) {
5018 invalid:
5019            float_raise(float_flag_invalid, status);
5020            return floatx80_default_nan(status);
5021        }
5022        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5023    }
5024    if ( aExp == 0 ) {
5025        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5026        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5027    }
5028    bSig |= LIT64( 0x8000000000000000 );
5029    zSign = aSign;
5030    expDiff = aExp - bExp;
5031    aSig1 = 0;
5032    if ( expDiff < 0 ) {
5033        if ( expDiff < -1 ) return a;
5034        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5035        expDiff = 0;
5036    }
5037    q = ( bSig <= aSig0 );
5038    if ( q ) aSig0 -= bSig;
5039    expDiff -= 64;
5040    while ( 0 < expDiff ) {
5041        q = estimateDiv128To64( aSig0, aSig1, bSig );
5042        q = ( 2 < q ) ? q - 2 : 0;
5043        mul64To128( bSig, q, &term0, &term1 );
5044        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5045        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5046        expDiff -= 62;
5047    }
5048    expDiff += 64;
5049    if ( 0 < expDiff ) {
5050        q = estimateDiv128To64( aSig0, aSig1, bSig );
5051        q = ( 2 < q ) ? q - 2 : 0;
5052        q >>= 64 - expDiff;
5053        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5054        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5055        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5056        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5057            ++q;
5058            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5059        }
5060    }
5061    else {
5062        term1 = 0;
5063        term0 = bSig;
5064    }
5065    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5066    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5067         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5068              && ( q & 1 ) )
5069       ) {
5070        aSig0 = alternateASig0;
5071        aSig1 = alternateASig1;
5072        zSign = ! zSign;
5073    }
5074    return
5075        normalizeRoundAndPackFloatx80(
5076            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5077
5078}
5079
5080/*----------------------------------------------------------------------------
5081| Returns the square root of the extended double-precision floating-point
5082| value `a'.  The operation is performed according to the IEC/IEEE Standard
5083| for Binary Floating-Point Arithmetic.
5084*----------------------------------------------------------------------------*/
5085
5086floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5087{
5088    flag aSign;
5089    int32_t aExp, zExp;
5090    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5091    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5092
5093    if (floatx80_invalid_encoding(a)) {
5094        float_raise(float_flag_invalid, status);
5095        return floatx80_default_nan(status);
5096    }
5097    aSig0 = extractFloatx80Frac( a );
5098    aExp = extractFloatx80Exp( a );
5099    aSign = extractFloatx80Sign( a );
5100    if ( aExp == 0x7FFF ) {
5101        if ((uint64_t)(aSig0 << 1)) {
5102            return propagateFloatx80NaN(a, a, status);
5103        }
5104        if ( ! aSign ) return a;
5105        goto invalid;
5106    }
5107    if ( aSign ) {
5108        if ( ( aExp | aSig0 ) == 0 ) return a;
5109 invalid:
5110        float_raise(float_flag_invalid, status);
5111        return floatx80_default_nan(status);
5112    }
5113    if ( aExp == 0 ) {
5114        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5115        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5116    }
5117    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5118    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5119    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5120    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5121    doubleZSig0 = zSig0<<1;
5122    mul64To128( zSig0, zSig0, &term0, &term1 );
5123    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5124    while ( (int64_t) rem0 < 0 ) {
5125        --zSig0;
5126        doubleZSig0 -= 2;
5127        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5128    }
5129    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5130    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5131        if ( zSig1 == 0 ) zSig1 = 1;
5132        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5133        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5134        mul64To128( zSig1, zSig1, &term2, &term3 );
5135        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5136        while ( (int64_t) rem1 < 0 ) {
5137            --zSig1;
5138            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5139            term3 |= 1;
5140            term2 |= doubleZSig0;
5141            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5142        }
5143        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5144    }
5145    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5146    zSig0 |= doubleZSig0;
5147    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5148                                0, zExp, zSig0, zSig1, status);
5149}
5150
5151/*----------------------------------------------------------------------------
5152| Returns 1 if the extended double-precision floating-point value `a' is equal
5153| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5154| raised if either operand is a NaN.  Otherwise, the comparison is performed
5155| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5156*----------------------------------------------------------------------------*/
5157
5158int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5159{
5160
5161    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5162        || (extractFloatx80Exp(a) == 0x7FFF
5163            && (uint64_t) (extractFloatx80Frac(a) << 1))
5164        || (extractFloatx80Exp(b) == 0x7FFF
5165            && (uint64_t) (extractFloatx80Frac(b) << 1))
5166       ) {
5167        float_raise(float_flag_invalid, status);
5168        return 0;
5169    }
5170    return
5171           ( a.low == b.low )
5172        && (    ( a.high == b.high )
5173             || (    ( a.low == 0 )
5174                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5175           );
5176
5177}
5178
5179/*----------------------------------------------------------------------------
5180| Returns 1 if the extended double-precision floating-point value `a' is
5181| less than or equal to the corresponding value `b', and 0 otherwise.  The
5182| invalid exception is raised if either operand is a NaN.  The comparison is
5183| performed according to the IEC/IEEE Standard for Binary Floating-Point
5184| Arithmetic.
5185*----------------------------------------------------------------------------*/
5186
5187int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5188{
5189    flag aSign, bSign;
5190
5191    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5192        || (extractFloatx80Exp(a) == 0x7FFF
5193            && (uint64_t) (extractFloatx80Frac(a) << 1))
5194        || (extractFloatx80Exp(b) == 0x7FFF
5195            && (uint64_t) (extractFloatx80Frac(b) << 1))
5196       ) {
5197        float_raise(float_flag_invalid, status);
5198        return 0;
5199    }
5200    aSign = extractFloatx80Sign( a );
5201    bSign = extractFloatx80Sign( b );
5202    if ( aSign != bSign ) {
5203        return
5204               aSign
5205            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206                 == 0 );
5207    }
5208    return
5209          aSign ? le128( b.high, b.low, a.high, a.low )
5210        : le128( a.high, a.low, b.high, b.low );
5211
5212}
5213
5214/*----------------------------------------------------------------------------
5215| Returns 1 if the extended double-precision floating-point value `a' is
5216| less than the corresponding value `b', and 0 otherwise.  The invalid
5217| exception is raised if either operand is a NaN.  The comparison is performed
5218| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219*----------------------------------------------------------------------------*/
5220
5221int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5222{
5223    flag aSign, bSign;
5224
5225    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5226        || (extractFloatx80Exp(a) == 0x7FFF
5227            && (uint64_t) (extractFloatx80Frac(a) << 1))
5228        || (extractFloatx80Exp(b) == 0x7FFF
5229            && (uint64_t) (extractFloatx80Frac(b) << 1))
5230       ) {
5231        float_raise(float_flag_invalid, status);
5232        return 0;
5233    }
5234    aSign = extractFloatx80Sign( a );
5235    bSign = extractFloatx80Sign( b );
5236    if ( aSign != bSign ) {
5237        return
5238               aSign
5239            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5240                 != 0 );
5241    }
5242    return
5243          aSign ? lt128( b.high, b.low, a.high, a.low )
5244        : lt128( a.high, a.low, b.high, b.low );
5245
5246}
5247
5248/*----------------------------------------------------------------------------
5249| Returns 1 if the extended double-precision floating-point values `a' and `b'
5250| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5251| either operand is a NaN.   The comparison is performed according to the
5252| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5253*----------------------------------------------------------------------------*/
5254int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5255{
5256    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5257        || (extractFloatx80Exp(a) == 0x7FFF
5258            && (uint64_t) (extractFloatx80Frac(a) << 1))
5259        || (extractFloatx80Exp(b) == 0x7FFF
5260            && (uint64_t) (extractFloatx80Frac(b) << 1))
5261       ) {
5262        float_raise(float_flag_invalid, status);
5263        return 1;
5264    }
5265    return 0;
5266}
5267
5268/*----------------------------------------------------------------------------
5269| Returns 1 if the extended double-precision floating-point value `a' is
5270| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5271| cause an exception.  The comparison is performed according to the IEC/IEEE
5272| Standard for Binary Floating-Point Arithmetic.
5273*----------------------------------------------------------------------------*/
5274
5275int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5276{
5277
5278    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5279        float_raise(float_flag_invalid, status);
5280        return 0;
5281    }
5282    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5283              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5284         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5285              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5286       ) {
5287        if (floatx80_is_signaling_nan(a, status)
5288         || floatx80_is_signaling_nan(b, status)) {
5289            float_raise(float_flag_invalid, status);
5290        }
5291        return 0;
5292    }
5293    return
5294           ( a.low == b.low )
5295        && (    ( a.high == b.high )
5296             || (    ( a.low == 0 )
5297                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5298           );
5299
5300}
5301
5302/*----------------------------------------------------------------------------
5303| Returns 1 if the extended double-precision floating-point value `a' is less
5304| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5305| do not cause an exception.  Otherwise, the comparison is performed according
5306| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307*----------------------------------------------------------------------------*/
5308
5309int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5310{
5311    flag aSign, bSign;
5312
5313    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5314        float_raise(float_flag_invalid, status);
5315        return 0;
5316    }
5317    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5318              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5319         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5320              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5321       ) {
5322        if (floatx80_is_signaling_nan(a, status)
5323         || floatx80_is_signaling_nan(b, status)) {
5324            float_raise(float_flag_invalid, status);
5325        }
5326        return 0;
5327    }
5328    aSign = extractFloatx80Sign( a );
5329    bSign = extractFloatx80Sign( b );
5330    if ( aSign != bSign ) {
5331        return
5332               aSign
5333            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5334                 == 0 );
5335    }
5336    return
5337          aSign ? le128( b.high, b.low, a.high, a.low )
5338        : le128( a.high, a.low, b.high, b.low );
5339
5340}
5341
5342/*----------------------------------------------------------------------------
5343| Returns 1 if the extended double-precision floating-point value `a' is less
5344| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5345| an exception.  Otherwise, the comparison is performed according to the
5346| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347*----------------------------------------------------------------------------*/
5348
5349int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5350{
5351    flag aSign, bSign;
5352
5353    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5354        float_raise(float_flag_invalid, status);
5355        return 0;
5356    }
5357    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5358              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5359         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5360              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5361       ) {
5362        if (floatx80_is_signaling_nan(a, status)
5363         || floatx80_is_signaling_nan(b, status)) {
5364            float_raise(float_flag_invalid, status);
5365        }
5366        return 0;
5367    }
5368    aSign = extractFloatx80Sign( a );
5369    bSign = extractFloatx80Sign( b );
5370    if ( aSign != bSign ) {
5371        return
5372               aSign
5373            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5374                 != 0 );
5375    }
5376    return
5377          aSign ? lt128( b.high, b.low, a.high, a.low )
5378        : lt128( a.high, a.low, b.high, b.low );
5379
5380}
5381
5382/*----------------------------------------------------------------------------
5383| Returns 1 if the extended double-precision floating-point values `a' and `b'
5384| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5385| The comparison is performed according to the IEC/IEEE Standard for Binary
5386| Floating-Point Arithmetic.
5387*----------------------------------------------------------------------------*/
5388int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5389{
5390    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5391        float_raise(float_flag_invalid, status);
5392        return 1;
5393    }
5394    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5395              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5396         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5397              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5398       ) {
5399        if (floatx80_is_signaling_nan(a, status)
5400         || floatx80_is_signaling_nan(b, status)) {
5401            float_raise(float_flag_invalid, status);
5402        }
5403        return 1;
5404    }
5405    return 0;
5406}
5407
5408/*----------------------------------------------------------------------------
5409| Returns the result of converting the quadruple-precision floating-point
5410| value `a' to the 32-bit two's complement integer format.  The conversion
5411| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5412| Arithmetic---which means in particular that the conversion is rounded
5413| according to the current rounding mode.  If `a' is a NaN, the largest
5414| positive integer is returned.  Otherwise, if the conversion overflows, the
5415| largest integer with the same sign as `a' is returned.
5416*----------------------------------------------------------------------------*/
5417
5418int32_t float128_to_int32(float128 a, float_status *status)
5419{
5420    flag aSign;
5421    int32_t aExp, shiftCount;
5422    uint64_t aSig0, aSig1;
5423
5424    aSig1 = extractFloat128Frac1( a );
5425    aSig0 = extractFloat128Frac0( a );
5426    aExp = extractFloat128Exp( a );
5427    aSign = extractFloat128Sign( a );
5428    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5429    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5430    aSig0 |= ( aSig1 != 0 );
5431    shiftCount = 0x4028 - aExp;
5432    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5433    return roundAndPackInt32(aSign, aSig0, status);
5434
5435}
5436
5437/*----------------------------------------------------------------------------
5438| Returns the result of converting the quadruple-precision floating-point
5439| value `a' to the 32-bit two's complement integer format.  The conversion
5440| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5441| Arithmetic, except that the conversion is always rounded toward zero.  If
5442| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5443| conversion overflows, the largest integer with the same sign as `a' is
5444| returned.
5445*----------------------------------------------------------------------------*/
5446
5447int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5448{
5449    flag aSign;
5450    int32_t aExp, shiftCount;
5451    uint64_t aSig0, aSig1, savedASig;
5452    int32_t z;
5453
5454    aSig1 = extractFloat128Frac1( a );
5455    aSig0 = extractFloat128Frac0( a );
5456    aExp = extractFloat128Exp( a );
5457    aSign = extractFloat128Sign( a );
5458    aSig0 |= ( aSig1 != 0 );
5459    if ( 0x401E < aExp ) {
5460        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5461        goto invalid;
5462    }
5463    else if ( aExp < 0x3FFF ) {
5464        if (aExp || aSig0) {
5465            status->float_exception_flags |= float_flag_inexact;
5466        }
5467        return 0;
5468    }
5469    aSig0 |= LIT64( 0x0001000000000000 );
5470    shiftCount = 0x402F - aExp;
5471    savedASig = aSig0;
5472    aSig0 >>= shiftCount;
5473    z = aSig0;
5474    if ( aSign ) z = - z;
5475    if ( ( z < 0 ) ^ aSign ) {
5476 invalid:
5477        float_raise(float_flag_invalid, status);
5478        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5479    }
5480    if ( ( aSig0<<shiftCount ) != savedASig ) {
5481        status->float_exception_flags |= float_flag_inexact;
5482    }
5483    return z;
5484
5485}
5486
5487/*----------------------------------------------------------------------------
5488| Returns the result of converting the quadruple-precision floating-point
5489| value `a' to the 64-bit two's complement integer format.  The conversion
5490| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5491| Arithmetic---which means in particular that the conversion is rounded
5492| according to the current rounding mode.  If `a' is a NaN, the largest
5493| positive integer is returned.  Otherwise, if the conversion overflows, the
5494| largest integer with the same sign as `a' is returned.
5495*----------------------------------------------------------------------------*/
5496
5497int64_t float128_to_int64(float128 a, float_status *status)
5498{
5499    flag aSign;
5500    int32_t aExp, shiftCount;
5501    uint64_t aSig0, aSig1;
5502
5503    aSig1 = extractFloat128Frac1( a );
5504    aSig0 = extractFloat128Frac0( a );
5505    aExp = extractFloat128Exp( a );
5506    aSign = extractFloat128Sign( a );
5507    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5508    shiftCount = 0x402F - aExp;
5509    if ( shiftCount <= 0 ) {
5510        if ( 0x403E < aExp ) {
5511            float_raise(float_flag_invalid, status);
5512            if (    ! aSign
5513                 || (    ( aExp == 0x7FFF )
5514                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5515                    )
5516               ) {
5517                return LIT64( 0x7FFFFFFFFFFFFFFF );
5518            }
5519            return (int64_t) LIT64( 0x8000000000000000 );
5520        }
5521        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5522    }
5523    else {
5524        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5525    }
5526    return roundAndPackInt64(aSign, aSig0, aSig1, status);
5527
5528}
5529
5530/*----------------------------------------------------------------------------
5531| Returns the result of converting the quadruple-precision floating-point
5532| value `a' to the 64-bit two's complement integer format.  The conversion
5533| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5534| Arithmetic, except that the conversion is always rounded toward zero.
5535| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5536| the conversion overflows, the largest integer with the same sign as `a' is
5537| returned.
5538*----------------------------------------------------------------------------*/
5539
5540int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5541{
5542    flag aSign;
5543    int32_t aExp, shiftCount;
5544    uint64_t aSig0, aSig1;
5545    int64_t z;
5546
5547    aSig1 = extractFloat128Frac1( a );
5548    aSig0 = extractFloat128Frac0( a );
5549    aExp = extractFloat128Exp( a );
5550    aSign = extractFloat128Sign( a );
5551    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5552    shiftCount = aExp - 0x402F;
5553    if ( 0 < shiftCount ) {
5554        if ( 0x403E <= aExp ) {
5555            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5556            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5557                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5558                if (aSig1) {
5559                    status->float_exception_flags |= float_flag_inexact;
5560                }
5561            }
5562            else {
5563                float_raise(float_flag_invalid, status);
5564                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5565                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5566                }
5567            }
5568            return (int64_t) LIT64( 0x8000000000000000 );
5569        }
5570        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5571        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5572            status->float_exception_flags |= float_flag_inexact;
5573        }
5574    }
5575    else {
5576        if ( aExp < 0x3FFF ) {
5577            if ( aExp | aSig0 | aSig1 ) {
5578                status->float_exception_flags |= float_flag_inexact;
5579            }
5580            return 0;
5581        }
5582        z = aSig0>>( - shiftCount );
5583        if (    aSig1
5584             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5585            status->float_exception_flags |= float_flag_inexact;
5586        }
5587    }
5588    if ( aSign ) z = - z;
5589    return z;
5590
5591}
5592
5593/*----------------------------------------------------------------------------
5594| Returns the result of converting the quadruple-precision floating-point value
5595| `a' to the 64-bit unsigned integer format.  The conversion is
5596| performed according to the IEC/IEEE Standard for Binary Floating-Point
5597| Arithmetic---which means in particular that the conversion is rounded
5598| according to the current rounding mode.  If `a' is a NaN, the largest
5599| positive integer is returned.  If the conversion overflows, the
5600| largest unsigned integer is returned.  If 'a' is negative, the value is
5601| rounded and zero is returned; negative values that do not round to zero
5602| will raise the inexact exception.
5603*----------------------------------------------------------------------------*/
5604
5605uint64_t float128_to_uint64(float128 a, float_status *status)
5606{
5607    flag aSign;
5608    int aExp;
5609    int shiftCount;
5610    uint64_t aSig0, aSig1;
5611
5612    aSig0 = extractFloat128Frac0(a);
5613    aSig1 = extractFloat128Frac1(a);
5614    aExp = extractFloat128Exp(a);
5615    aSign = extractFloat128Sign(a);
5616    if (aSign && (aExp > 0x3FFE)) {
5617        float_raise(float_flag_invalid, status);
5618        if (float128_is_any_nan(a)) {
5619            return LIT64(0xFFFFFFFFFFFFFFFF);
5620        } else {
5621            return 0;
5622        }
5623    }
5624    if (aExp) {
5625        aSig0 |= LIT64(0x0001000000000000);
5626    }
5627    shiftCount = 0x402F - aExp;
5628    if (shiftCount <= 0) {
5629        if (0x403E < aExp) {
5630            float_raise(float_flag_invalid, status);
5631            return LIT64(0xFFFFFFFFFFFFFFFF);
5632        }
5633        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5634    } else {
5635        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5636    }
5637    return roundAndPackUint64(aSign, aSig0, aSig1, status);
5638}
5639
5640uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5641{
5642    uint64_t v;
5643    signed char current_rounding_mode = status->float_rounding_mode;
5644
5645    set_float_rounding_mode(float_round_to_zero, status);
5646    v = float128_to_uint64(a, status);
5647    set_float_rounding_mode(current_rounding_mode, status);
5648
5649    return v;
5650}
5651
5652/*----------------------------------------------------------------------------
5653| Returns the result of converting the quadruple-precision floating-point
5654| value `a' to the 32-bit unsigned integer format.  The conversion
5655| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5656| Arithmetic except that the conversion is always rounded toward zero.
5657| If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5658| if the conversion overflows, the largest unsigned integer is returned.
5659| If 'a' is negative, the value is rounded and zero is returned; negative
5660| values that do not round to zero will raise the inexact exception.
5661*----------------------------------------------------------------------------*/
5662
5663uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5664{
5665    uint64_t v;
5666    uint32_t res;
5667    int old_exc_flags = get_float_exception_flags(status);
5668
5669    v = float128_to_uint64_round_to_zero(a, status);
5670    if (v > 0xffffffff) {
5671        res = 0xffffffff;
5672    } else {
5673        return v;
5674    }
5675    set_float_exception_flags(old_exc_flags, status);
5676    float_raise(float_flag_invalid, status);
5677    return res;
5678}
5679
5680/*----------------------------------------------------------------------------
5681| Returns the result of converting the quadruple-precision floating-point
5682| value `a' to the single-precision floating-point format.  The conversion
5683| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5684| Arithmetic.
5685*----------------------------------------------------------------------------*/
5686
5687float32 float128_to_float32(float128 a, float_status *status)
5688{
5689    flag aSign;
5690    int32_t aExp;
5691    uint64_t aSig0, aSig1;
5692    uint32_t zSig;
5693
5694    aSig1 = extractFloat128Frac1( a );
5695    aSig0 = extractFloat128Frac0( a );
5696    aExp = extractFloat128Exp( a );
5697    aSign = extractFloat128Sign( a );
5698    if ( aExp == 0x7FFF ) {
5699        if ( aSig0 | aSig1 ) {
5700            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5701        }
5702        return packFloat32( aSign, 0xFF, 0 );
5703    }
5704    aSig0 |= ( aSig1 != 0 );
5705    shift64RightJamming( aSig0, 18, &aSig0 );
5706    zSig = aSig0;
5707    if ( aExp || zSig ) {
5708        zSig |= 0x40000000;
5709        aExp -= 0x3F81;
5710    }
5711    return roundAndPackFloat32(aSign, aExp, zSig, status);
5712
5713}
5714
5715/*----------------------------------------------------------------------------
5716| Returns the result of converting the quadruple-precision floating-point
5717| value `a' to the double-precision floating-point format.  The conversion
5718| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5719| Arithmetic.
5720*----------------------------------------------------------------------------*/
5721
5722float64 float128_to_float64(float128 a, float_status *status)
5723{
5724    flag aSign;
5725    int32_t aExp;
5726    uint64_t aSig0, aSig1;
5727
5728    aSig1 = extractFloat128Frac1( a );
5729    aSig0 = extractFloat128Frac0( a );
5730    aExp = extractFloat128Exp( a );
5731    aSign = extractFloat128Sign( a );
5732    if ( aExp == 0x7FFF ) {
5733        if ( aSig0 | aSig1 ) {
5734            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5735        }
5736        return packFloat64( aSign, 0x7FF, 0 );
5737    }
5738    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5739    aSig0 |= ( aSig1 != 0 );
5740    if ( aExp || aSig0 ) {
5741        aSig0 |= LIT64( 0x4000000000000000 );
5742        aExp -= 0x3C01;
5743    }
5744    return roundAndPackFloat64(aSign, aExp, aSig0, status);
5745
5746}
5747
5748/*----------------------------------------------------------------------------
5749| Returns the result of converting the quadruple-precision floating-point
5750| value `a' to the extended double-precision floating-point format.  The
5751| conversion is performed according to the IEC/IEEE Standard for Binary
5752| Floating-Point Arithmetic.
5753*----------------------------------------------------------------------------*/
5754
5755floatx80 float128_to_floatx80(float128 a, float_status *status)
5756{
5757    flag aSign;
5758    int32_t aExp;
5759    uint64_t aSig0, aSig1;
5760
5761    aSig1 = extractFloat128Frac1( a );
5762    aSig0 = extractFloat128Frac0( a );
5763    aExp = extractFloat128Exp( a );
5764    aSign = extractFloat128Sign( a );
5765    if ( aExp == 0x7FFF ) {
5766        if ( aSig0 | aSig1 ) {
5767            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5768        }
5769        return packFloatx80(aSign, floatx80_infinity_high,
5770                                   floatx80_infinity_low);
5771    }
5772    if ( aExp == 0 ) {
5773        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5774        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5775    }
5776    else {
5777        aSig0 |= LIT64( 0x0001000000000000 );
5778    }
5779    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5780    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5781
5782}
5783
5784/*----------------------------------------------------------------------------
5785| Rounds the quadruple-precision floating-point value `a' to an integer, and
5786| returns the result as a quadruple-precision floating-point value.  The
5787| operation is performed according to the IEC/IEEE Standard for Binary
5788| Floating-Point Arithmetic.
5789*----------------------------------------------------------------------------*/
5790
5791float128 float128_round_to_int(float128 a, float_status *status)
5792{
5793    flag aSign;
5794    int32_t aExp;
5795    uint64_t lastBitMask, roundBitsMask;
5796    float128 z;
5797
5798    aExp = extractFloat128Exp( a );
5799    if ( 0x402F <= aExp ) {
5800        if ( 0x406F <= aExp ) {
5801            if (    ( aExp == 0x7FFF )
5802                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5803               ) {
5804                return propagateFloat128NaN(a, a, status);
5805            }
5806            return a;
5807        }
5808        lastBitMask = 1;
5809        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5810        roundBitsMask = lastBitMask - 1;
5811        z = a;
5812        switch (status->float_rounding_mode) {
5813        case float_round_nearest_even:
5814            if ( lastBitMask ) {
5815                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5816                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5817            }
5818            else {
5819                if ( (int64_t) z.low < 0 ) {
5820                    ++z.high;
5821                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5822                }
5823            }
5824            break;
5825        case float_round_ties_away:
5826            if (lastBitMask) {
5827                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5828            } else {
5829                if ((int64_t) z.low < 0) {
5830                    ++z.high;
5831                }
5832            }
5833            break;
5834        case float_round_to_zero:
5835            break;
5836        case float_round_up:
5837            if (!extractFloat128Sign(z)) {
5838                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5839            }
5840            break;
5841        case float_round_down:
5842            if (extractFloat128Sign(z)) {
5843                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5844            }
5845            break;
5846        default:
5847            abort();
5848        }
5849        z.low &= ~ roundBitsMask;
5850    }
5851    else {
5852        if ( aExp < 0x3FFF ) {
5853            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5854            status->float_exception_flags |= float_flag_inexact;
5855            aSign = extractFloat128Sign( a );
5856            switch (status->float_rounding_mode) {
5857             case float_round_nearest_even:
5858                if (    ( aExp == 0x3FFE )
5859                     && (   extractFloat128Frac0( a )
5860                          | extractFloat128Frac1( a ) )
5861                   ) {
5862                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5863                }
5864                break;
5865            case float_round_ties_away:
5866                if (aExp == 0x3FFE) {
5867                    return packFloat128(aSign, 0x3FFF, 0, 0);
5868                }
5869                break;
5870             case float_round_down:
5871                return
5872                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5873                    : packFloat128( 0, 0, 0, 0 );
5874             case float_round_up:
5875                return
5876                      aSign ? packFloat128( 1, 0, 0, 0 )
5877                    : packFloat128( 0, 0x3FFF, 0, 0 );
5878            }
5879            return packFloat128( aSign, 0, 0, 0 );
5880        }
5881        lastBitMask = 1;
5882        lastBitMask <<= 0x402F - aExp;
5883        roundBitsMask = lastBitMask - 1;
5884        z.low = 0;
5885        z.high = a.high;
5886        switch (status->float_rounding_mode) {
5887        case float_round_nearest_even:
5888            z.high += lastBitMask>>1;
5889            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5890                z.high &= ~ lastBitMask;
5891            }
5892            break;
5893        case float_round_ties_away:
5894            z.high += lastBitMask>>1;
5895            break;
5896        case float_round_to_zero:
5897            break;
5898        case float_round_up:
5899            if (!extractFloat128Sign(z)) {
5900                z.high |= ( a.low != 0 );
5901                z.high += roundBitsMask;
5902            }
5903            break;
5904        case float_round_down:
5905            if (extractFloat128Sign(z)) {
5906                z.high |= (a.low != 0);
5907                z.high += roundBitsMask;
5908            }
5909            break;
5910        default:
5911            abort();
5912        }
5913        z.high &= ~ roundBitsMask;
5914    }
5915    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5916        status->float_exception_flags |= float_flag_inexact;
5917    }
5918    return z;
5919
5920}
5921
5922/*----------------------------------------------------------------------------
5923| Returns the result of adding the absolute values of the quadruple-precision
5924| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5925| before being returned.  `zSign' is ignored if the result is a NaN.
5926| The addition is performed according to the IEC/IEEE Standard for Binary
5927| Floating-Point Arithmetic.
5928*----------------------------------------------------------------------------*/
5929
5930static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5931                                float_status *status)
5932{
5933    int32_t aExp, bExp, zExp;
5934    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5935    int32_t expDiff;
5936
5937    aSig1 = extractFloat128Frac1( a );
5938    aSig0 = extractFloat128Frac0( a );
5939    aExp = extractFloat128Exp( a );
5940    bSig1 = extractFloat128Frac1( b );
5941    bSig0 = extractFloat128Frac0( b );
5942    bExp = extractFloat128Exp( b );
5943    expDiff = aExp - bExp;
5944    if ( 0 < expDiff ) {
5945        if ( aExp == 0x7FFF ) {
5946            if (aSig0 | aSig1) {
5947                return propagateFloat128NaN(a, b, status);
5948            }
5949            return a;
5950        }
5951        if ( bExp == 0 ) {
5952            --expDiff;
5953        }
5954        else {
5955            bSig0 |= LIT64( 0x0001000000000000 );
5956        }
5957        shift128ExtraRightJamming(
5958            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5959        zExp = aExp;
5960    }
5961    else if ( expDiff < 0 ) {
5962        if ( bExp == 0x7FFF ) {
5963            if (bSig0 | bSig1) {
5964                return propagateFloat128NaN(a, b, status);
5965            }
5966            return packFloat128( zSign, 0x7FFF, 0, 0 );
5967        }
5968        if ( aExp == 0 ) {
5969            ++expDiff;
5970        }
5971        else {
5972            aSig0 |= LIT64( 0x0001000000000000 );
5973        }
5974        shift128ExtraRightJamming(
5975            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5976        zExp = bExp;
5977    }
5978    else {
5979        if ( aExp == 0x7FFF ) {
5980            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5981                return propagateFloat128NaN(a, b, status);
5982            }
5983            return a;
5984        }
5985        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5986        if ( aExp == 0 ) {
5987            if (status->flush_to_zero) {
5988                if (zSig0 | zSig1) {
5989                    float_raise(float_flag_output_denormal, status);
5990                }
5991                return packFloat128(zSign, 0, 0, 0);
5992            }
5993            return packFloat128( zSign, 0, zSig0, zSig1 );
5994        }
5995        zSig2 = 0;
5996        zSig0 |= LIT64( 0x0002000000000000 );
5997        zExp = aExp;
5998        goto shiftRight1;
5999    }
6000    aSig0 |= LIT64( 0x0001000000000000 );
6001    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6002    --zExp;
6003    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6004    ++zExp;
6005 shiftRight1:
6006    shift128ExtraRightJamming(
6007        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6008 roundAndPack:
6009    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6010
6011}
6012
6013/*----------------------------------------------------------------------------
6014| Returns the result of subtracting the absolute values of the quadruple-
6015| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6016| difference is negated before being returned.  `zSign' is ignored if the
6017| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6018| Standard for Binary Floating-Point Arithmetic.
6019*----------------------------------------------------------------------------*/
6020
6021static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6022                                float_status *status)
6023{
6024    int32_t aExp, bExp, zExp;
6025    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6026    int32_t expDiff;
6027
6028    aSig1 = extractFloat128Frac1( a );
6029    aSig0 = extractFloat128Frac0( a );
6030    aExp = extractFloat128Exp( a );
6031    bSig1 = extractFloat128Frac1( b );
6032    bSig0 = extractFloat128Frac0( b );
6033    bExp = extractFloat128Exp( b );
6034    expDiff = aExp - bExp;
6035    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6036    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6037    if ( 0 < expDiff ) goto aExpBigger;
6038    if ( expDiff < 0 ) goto bExpBigger;
6039    if ( aExp == 0x7FFF ) {
6040        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6041            return propagateFloat128NaN(a, b, status);
6042        }
6043        float_raise(float_flag_invalid, status);
6044        return float128_default_nan(status);
6045    }
6046    if ( aExp == 0 ) {
6047        aExp = 1;
6048        bExp = 1;
6049    }
6050    if ( bSig0 < aSig0 ) goto aBigger;
6051    if ( aSig0 < bSig0 ) goto bBigger;
6052    if ( bSig1 < aSig1 ) goto aBigger;
6053    if ( aSig1 < bSig1 ) goto bBigger;
6054    return packFloat128(status->float_rounding_mode == float_round_down,
6055                        0, 0, 0);
6056 bExpBigger:
6057    if ( bExp == 0x7FFF ) {
6058        if (bSig0 | bSig1) {
6059            return propagateFloat128NaN(a, b, status);
6060        }
6061        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6062    }
6063    if ( aExp == 0 ) {
6064        ++expDiff;
6065    }
6066    else {
6067        aSig0 |= LIT64( 0x4000000000000000 );
6068    }
6069    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6070    bSig0 |= LIT64( 0x4000000000000000 );
6071 bBigger:
6072    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6073    zExp = bExp;
6074    zSign ^= 1;
6075    goto normalizeRoundAndPack;
6076 aExpBigger:
6077    if ( aExp == 0x7FFF ) {
6078        if (aSig0 | aSig1) {
6079            return propagateFloat128NaN(a, b, status);
6080        }
6081        return a;
6082    }
6083    if ( bExp == 0 ) {
6084        --expDiff;
6085    }
6086    else {
6087        bSig0 |= LIT64( 0x4000000000000000 );
6088    }
6089    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6090    aSig0 |= LIT64( 0x4000000000000000 );
6091 aBigger:
6092    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6093    zExp = aExp;
6094 normalizeRoundAndPack:
6095    --zExp;
6096    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6097                                         status);
6098
6099}
6100
6101/*----------------------------------------------------------------------------
6102| Returns the result of adding the quadruple-precision floating-point values
6103| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6104| for Binary Floating-Point Arithmetic.
6105*----------------------------------------------------------------------------*/
6106
6107float128 float128_add(float128 a, float128 b, float_status *status)
6108{
6109    flag aSign, bSign;
6110
6111    aSign = extractFloat128Sign( a );
6112    bSign = extractFloat128Sign( b );
6113    if ( aSign == bSign ) {
6114        return addFloat128Sigs(a, b, aSign, status);
6115    }
6116    else {
6117        return subFloat128Sigs(a, b, aSign, status);
6118    }
6119
6120}
6121
6122/*----------------------------------------------------------------------------
6123| Returns the result of subtracting the quadruple-precision floating-point
6124| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6125| Standard for Binary Floating-Point Arithmetic.
6126*----------------------------------------------------------------------------*/
6127
6128float128 float128_sub(float128 a, float128 b, float_status *status)
6129{
6130    flag aSign, bSign;
6131
6132    aSign = extractFloat128Sign( a );
6133    bSign = extractFloat128Sign( b );
6134    if ( aSign == bSign ) {
6135        return subFloat128Sigs(a, b, aSign, status);
6136    }
6137    else {
6138        return addFloat128Sigs(a, b, aSign, status);
6139    }
6140
6141}
6142
6143/*----------------------------------------------------------------------------
6144| Returns the result of multiplying the quadruple-precision floating-point
6145| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6146| Standard for Binary Floating-Point Arithmetic.
6147*----------------------------------------------------------------------------*/
6148
6149float128 float128_mul(float128 a, float128 b, float_status *status)
6150{
6151    flag aSign, bSign, zSign;
6152    int32_t aExp, bExp, zExp;
6153    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6154
6155    aSig1 = extractFloat128Frac1( a );
6156    aSig0 = extractFloat128Frac0( a );
6157    aExp = extractFloat128Exp( a );
6158    aSign = extractFloat128Sign( a );
6159    bSig1 = extractFloat128Frac1( b );
6160    bSig0 = extractFloat128Frac0( b );
6161    bExp = extractFloat128Exp( b );
6162    bSign = extractFloat128Sign( b );
6163    zSign = aSign ^ bSign;
6164    if ( aExp == 0x7FFF ) {
6165        if (    ( aSig0 | aSig1 )
6166             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6167            return propagateFloat128NaN(a, b, status);
6168        }
6169        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6170        return packFloat128( zSign, 0x7FFF, 0, 0 );
6171    }
6172    if ( bExp == 0x7FFF ) {
6173        if (bSig0 | bSig1) {
6174            return propagateFloat128NaN(a, b, status);
6175        }
6176        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6177 invalid:
6178            float_raise(float_flag_invalid, status);
6179            return float128_default_nan(status);
6180        }
6181        return packFloat128( zSign, 0x7FFF, 0, 0 );
6182    }
6183    if ( aExp == 0 ) {
6184        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6185        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6186    }
6187    if ( bExp == 0 ) {
6188        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6189        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6190    }
6191    zExp = aExp + bExp - 0x4000;
6192    aSig0 |= LIT64( 0x0001000000000000 );
6193    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6194    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6195    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6196    zSig2 |= ( zSig3 != 0 );
6197    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6198        shift128ExtraRightJamming(
6199            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6200        ++zExp;
6201    }
6202    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6203
6204}
6205
6206/*----------------------------------------------------------------------------
6207| Returns the result of dividing the quadruple-precision floating-point value
6208| `a' by the corresponding value `b'.  The operation is performed according to
6209| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6210*----------------------------------------------------------------------------*/
6211
6212float128 float128_div(float128 a, float128 b, float_status *status)
6213{
6214    flag aSign, bSign, zSign;
6215    int32_t aExp, bExp, zExp;
6216    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6217    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6218
6219    aSig1 = extractFloat128Frac1( a );
6220    aSig0 = extractFloat128Frac0( a );
6221    aExp = extractFloat128Exp( a );
6222    aSign = extractFloat128Sign( a );
6223    bSig1 = extractFloat128Frac1( b );
6224    bSig0 = extractFloat128Frac0( b );
6225    bExp = extractFloat128Exp( b );
6226    bSign = extractFloat128Sign( b );
6227    zSign = aSign ^ bSign;
6228    if ( aExp == 0x7FFF ) {
6229        if (aSig0 | aSig1) {
6230            return propagateFloat128NaN(a, b, status);
6231        }
6232        if ( bExp == 0x7FFF ) {
6233            if (bSig0 | bSig1) {
6234                return propagateFloat128NaN(a, b, status);
6235            }
6236            goto invalid;
6237        }
6238        return packFloat128( zSign, 0x7FFF, 0, 0 );
6239    }
6240    if ( bExp == 0x7FFF ) {
6241        if (bSig0 | bSig1) {
6242            return propagateFloat128NaN(a, b, status);
6243        }
6244        return packFloat128( zSign, 0, 0, 0 );
6245    }
6246    if ( bExp == 0 ) {
6247        if ( ( bSig0 | bSig1 ) == 0 ) {
6248            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6249 invalid:
6250                float_raise(float_flag_invalid, status);
6251                return float128_default_nan(status);
6252            }
6253            float_raise(float_flag_divbyzero, status);
6254            return packFloat128( zSign, 0x7FFF, 0, 0 );
6255        }
6256        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6257    }
6258    if ( aExp == 0 ) {
6259        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6260        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6261    }
6262    zExp = aExp - bExp + 0x3FFD;
6263    shortShift128Left(
6264        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6265    shortShift128Left(
6266        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6267    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6268        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6269        ++zExp;
6270    }
6271    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6272    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6273    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6274    while ( (int64_t) rem0 < 0 ) {
6275        --zSig0;
6276        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6277    }
6278    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6279    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6280        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6281        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6282        while ( (int64_t) rem1 < 0 ) {
6283            --zSig1;
6284            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6285        }
6286        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6287    }
6288    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6289    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6290
6291}
6292
6293/*----------------------------------------------------------------------------
6294| Returns the remainder of the quadruple-precision floating-point value `a'
6295| with respect to the corresponding value `b'.  The operation is performed
6296| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6297*----------------------------------------------------------------------------*/
6298
6299float128 float128_rem(float128 a, float128 b, float_status *status)
6300{
6301    flag aSign, zSign;
6302    int32_t aExp, bExp, expDiff;
6303    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6304    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6305    int64_t sigMean0;
6306
6307    aSig1 = extractFloat128Frac1( a );
6308    aSig0 = extractFloat128Frac0( a );
6309    aExp = extractFloat128Exp( a );
6310    aSign = extractFloat128Sign( a );
6311    bSig1 = extractFloat128Frac1( b );
6312    bSig0 = extractFloat128Frac0( b );
6313    bExp = extractFloat128Exp( b );
6314    if ( aExp == 0x7FFF ) {
6315        if (    ( aSig0 | aSig1 )
6316             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6317            return propagateFloat128NaN(a, b, status);
6318        }
6319        goto invalid;
6320    }
6321    if ( bExp == 0x7FFF ) {
6322        if (bSig0 | bSig1) {
6323            return propagateFloat128NaN(a, b, status);
6324        }
6325        return a;
6326    }
6327    if ( bExp == 0 ) {
6328        if ( ( bSig0 | bSig1 ) == 0 ) {
6329 invalid:
6330            float_raise(float_flag_invalid, status);
6331            return float128_default_nan(status);
6332        }
6333        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6334    }
6335    if ( aExp == 0 ) {
6336        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6337        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6338    }
6339    expDiff = aExp - bExp;
6340    if ( expDiff < -1 ) return a;
6341    shortShift128Left(
6342        aSig0 | LIT64( 0x0001000000000000 ),
6343        aSig1,
6344        15 - ( expDiff < 0 ),
6345        &aSig0,
6346        &aSig1
6347    );
6348    shortShift128Left(
6349        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6350    q = le128( bSig0, bSig1, aSig0, aSig1 );
6351    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6352    expDiff -= 64;
6353    while ( 0 < expDiff ) {
6354        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6355        q = ( 4 < q ) ? q - 4 : 0;
6356        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6357        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6358        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6359        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6360        expDiff -= 61;
6361    }
6362    if ( -64 < expDiff ) {
6363        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6364        q = ( 4 < q ) ? q - 4 : 0;
6365        q >>= - expDiff;
6366        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6367        expDiff += 52;
6368        if ( expDiff < 0 ) {
6369            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6370        }
6371        else {
6372            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6373        }
6374        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6375        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6376    }
6377    else {
6378        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6379        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6380    }
6381    do {
6382        alternateASig0 = aSig0;
6383        alternateASig1 = aSig1;
6384        ++q;
6385        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6386    } while ( 0 <= (int64_t) aSig0 );
6387    add128(
6388        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6389    if (    ( sigMean0 < 0 )
6390         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6391        aSig0 = alternateASig0;
6392        aSig1 = alternateASig1;
6393    }
6394    zSign = ( (int64_t) aSig0 < 0 );
6395    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6396    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6397                                         status);
6398}
6399
6400/*----------------------------------------------------------------------------
6401| Returns the square root of the quadruple-precision floating-point value `a'.
6402| The operation is performed according to the IEC/IEEE Standard for Binary
6403| Floating-Point Arithmetic.
6404*----------------------------------------------------------------------------*/
6405
6406float128 float128_sqrt(float128 a, float_status *status)
6407{
6408    flag aSign;
6409    int32_t aExp, zExp;
6410    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6411    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6412
6413    aSig1 = extractFloat128Frac1( a );
6414    aSig0 = extractFloat128Frac0( a );
6415    aExp = extractFloat128Exp( a );
6416    aSign = extractFloat128Sign( a );
6417    if ( aExp == 0x7FFF ) {
6418        if (aSig0 | aSig1) {
6419            return propagateFloat128NaN(a, a, status);
6420        }
6421        if ( ! aSign ) return a;
6422        goto invalid;
6423    }
6424    if ( aSign ) {
6425        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6426 invalid:
6427        float_raise(float_flag_invalid, status);
6428        return float128_default_nan(status);
6429    }
6430    if ( aExp == 0 ) {
6431        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6432        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6433    }
6434    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6435    aSig0 |= LIT64( 0x0001000000000000 );
6436    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6437    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6438    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6439    doubleZSig0 = zSig0<<1;
6440    mul64To128( zSig0, zSig0, &term0, &term1 );
6441    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6442    while ( (int64_t) rem0 < 0 ) {
6443        --zSig0;
6444        doubleZSig0 -= 2;
6445        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6446    }
6447    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6448    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6449        if ( zSig1 == 0 ) zSig1 = 1;
6450        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6451        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6452        mul64To128( zSig1, zSig1, &term2, &term3 );
6453        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6454        while ( (int64_t) rem1 < 0 ) {
6455            --zSig1;
6456            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6457            term3 |= 1;
6458            term2 |= doubleZSig0;
6459            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6460        }
6461        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6462    }
6463    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6464    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6465
6466}
6467
6468/*----------------------------------------------------------------------------
6469| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6470| the corresponding value `b', and 0 otherwise.  The invalid exception is
6471| raised if either operand is a NaN.  Otherwise, the comparison is performed
6472| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6473*----------------------------------------------------------------------------*/
6474
6475int float128_eq(float128 a, float128 b, float_status *status)
6476{
6477
6478    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6479              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6480         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6481              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6482       ) {
6483        float_raise(float_flag_invalid, status);
6484        return 0;
6485    }
6486    return
6487           ( a.low == b.low )
6488        && (    ( a.high == b.high )
6489             || (    ( a.low == 0 )
6490                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6491           );
6492
6493}
6494
6495/*----------------------------------------------------------------------------
6496| Returns 1 if the quadruple-precision floating-point value `a' is less than
6497| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6498| exception is raised if either operand is a NaN.  The comparison is performed
6499| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500*----------------------------------------------------------------------------*/
6501
6502int float128_le(float128 a, float128 b, float_status *status)
6503{
6504    flag aSign, bSign;
6505
6506    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6507              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6508         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6509              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6510       ) {
6511        float_raise(float_flag_invalid, status);
6512        return 0;
6513    }
6514    aSign = extractFloat128Sign( a );
6515    bSign = extractFloat128Sign( b );
6516    if ( aSign != bSign ) {
6517        return
6518               aSign
6519            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6520                 == 0 );
6521    }
6522    return
6523          aSign ? le128( b.high, b.low, a.high, a.low )
6524        : le128( a.high, a.low, b.high, b.low );
6525
6526}
6527
6528/*----------------------------------------------------------------------------
6529| Returns 1 if the quadruple-precision floating-point value `a' is less than
6530| the corresponding value `b', and 0 otherwise.  The invalid exception is
6531| raised if either operand is a NaN.  The comparison is performed according
6532| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6533*----------------------------------------------------------------------------*/
6534
6535int float128_lt(float128 a, float128 b, float_status *status)
6536{
6537    flag aSign, bSign;
6538
6539    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6540              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6541         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6542              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6543       ) {
6544        float_raise(float_flag_invalid, status);
6545        return 0;
6546    }
6547    aSign = extractFloat128Sign( a );
6548    bSign = extractFloat128Sign( b );
6549    if ( aSign != bSign ) {
6550        return
6551               aSign
6552            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6553                 != 0 );
6554    }
6555    return
6556          aSign ? lt128( b.high, b.low, a.high, a.low )
6557        : lt128( a.high, a.low, b.high, b.low );
6558
6559}
6560
6561/*----------------------------------------------------------------------------
6562| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6563| be compared, and 0 otherwise.  The invalid exception is raised if either
6564| operand is a NaN. The comparison is performed according to the IEC/IEEE
6565| Standard for Binary Floating-Point Arithmetic.
6566*----------------------------------------------------------------------------*/
6567
6568int float128_unordered(float128 a, float128 b, float_status *status)
6569{
6570    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6571              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6572         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6573              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6574       ) {
6575        float_raise(float_flag_invalid, status);
6576        return 1;
6577    }
6578    return 0;
6579}
6580
6581/*----------------------------------------------------------------------------
6582| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6583| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6584| exception.  The comparison is performed according to the IEC/IEEE Standard
6585| for Binary Floating-Point Arithmetic.
6586*----------------------------------------------------------------------------*/
6587
6588int float128_eq_quiet(float128 a, float128 b, float_status *status)
6589{
6590
6591    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6592              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6593         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6594              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6595       ) {
6596        if (float128_is_signaling_nan(a, status)
6597         || float128_is_signaling_nan(b, status)) {
6598            float_raise(float_flag_invalid, status);
6599        }
6600        return 0;
6601    }
6602    return
6603           ( a.low == b.low )
6604        && (    ( a.high == b.high )
6605             || (    ( a.low == 0 )
6606                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6607           );
6608
6609}
6610
6611/*----------------------------------------------------------------------------
6612| Returns 1 if the quadruple-precision floating-point value `a' is less than
6613| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6614| cause an exception.  Otherwise, the comparison is performed according to the
6615| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6616*----------------------------------------------------------------------------*/
6617
6618int float128_le_quiet(float128 a, float128 b, float_status *status)
6619{
6620    flag aSign, bSign;
6621
6622    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6623              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6624         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6625              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6626       ) {
6627        if (float128_is_signaling_nan(a, status)
6628         || float128_is_signaling_nan(b, status)) {
6629            float_raise(float_flag_invalid, status);
6630        }
6631        return 0;
6632    }
6633    aSign = extractFloat128Sign( a );
6634    bSign = extractFloat128Sign( b );
6635    if ( aSign != bSign ) {
6636        return
6637               aSign
6638            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6639                 == 0 );
6640    }
6641    return
6642          aSign ? le128( b.high, b.low, a.high, a.low )
6643        : le128( a.high, a.low, b.high, b.low );
6644
6645}
6646
6647/*----------------------------------------------------------------------------
6648| Returns 1 if the quadruple-precision floating-point value `a' is less than
6649| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6650| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6651| Standard for Binary Floating-Point Arithmetic.
6652*----------------------------------------------------------------------------*/
6653
6654int float128_lt_quiet(float128 a, float128 b, float_status *status)
6655{
6656    flag aSign, bSign;
6657
6658    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6659              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6660         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6661              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6662       ) {
6663        if (float128_is_signaling_nan(a, status)
6664         || float128_is_signaling_nan(b, status)) {
6665            float_raise(float_flag_invalid, status);
6666        }
6667        return 0;
6668    }
6669    aSign = extractFloat128Sign( a );
6670    bSign = extractFloat128Sign( b );
6671    if ( aSign != bSign ) {
6672        return
6673               aSign
6674            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6675                 != 0 );
6676    }
6677    return
6678          aSign ? lt128( b.high, b.low, a.high, a.low )
6679        : lt128( a.high, a.low, b.high, b.low );
6680
6681}
6682
6683/*----------------------------------------------------------------------------
6684| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6685| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6686| comparison is performed according to the IEC/IEEE Standard for Binary
6687| Floating-Point Arithmetic.
6688*----------------------------------------------------------------------------*/
6689
6690int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6691{
6692    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6693              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6694         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6695              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6696       ) {
6697        if (float128_is_signaling_nan(a, status)
6698         || float128_is_signaling_nan(b, status)) {
6699            float_raise(float_flag_invalid, status);
6700        }
6701        return 1;
6702    }
6703    return 0;
6704}
6705
6706static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6707                                            int is_quiet, float_status *status)
6708{
6709    flag aSign, bSign;
6710
6711    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6712        float_raise(float_flag_invalid, status);
6713        return float_relation_unordered;
6714    }
6715    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6716          ( extractFloatx80Frac( a )<<1 ) ) ||
6717        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6718          ( extractFloatx80Frac( b )<<1 ) )) {
6719        if (!is_quiet ||
6720            floatx80_is_signaling_nan(a, status) ||
6721            floatx80_is_signaling_nan(b, status)) {
6722            float_raise(float_flag_invalid, status);
6723        }
6724        return float_relation_unordered;
6725    }
6726    aSign = extractFloatx80Sign( a );
6727    bSign = extractFloatx80Sign( b );
6728    if ( aSign != bSign ) {
6729
6730        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6731             ( ( a.low | b.low ) == 0 ) ) {
6732            /* zero case */
6733            return float_relation_equal;
6734        } else {
6735            return 1 - (2 * aSign);
6736        }
6737    } else {
6738        if (a.low == b.low && a.high == b.high) {
6739            return float_relation_equal;
6740        } else {
6741            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6742        }
6743    }
6744}
6745
6746int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6747{
6748    return floatx80_compare_internal(a, b, 0, status);
6749}
6750
6751int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6752{
6753    return floatx80_compare_internal(a, b, 1, status);
6754}
6755
6756static inline int float128_compare_internal(float128 a, float128 b,
6757                                            int is_quiet, float_status *status)
6758{
6759    flag aSign, bSign;
6760
6761    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6762          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6763        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6764          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6765        if (!is_quiet ||
6766            float128_is_signaling_nan(a, status) ||
6767            float128_is_signaling_nan(b, status)) {
6768            float_raise(float_flag_invalid, status);
6769        }
6770        return float_relation_unordered;
6771    }
6772    aSign = extractFloat128Sign( a );
6773    bSign = extractFloat128Sign( b );
6774    if ( aSign != bSign ) {
6775        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6776            /* zero case */
6777            return float_relation_equal;
6778        } else {
6779            return 1 - (2 * aSign);
6780        }
6781    } else {
6782        if (a.low == b.low && a.high == b.high) {
6783            return float_relation_equal;
6784        } else {
6785            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6786        }
6787    }
6788}
6789
6790int float128_compare(float128 a, float128 b, float_status *status)
6791{
6792    return float128_compare_internal(a, b, 0, status);
6793}
6794
6795int float128_compare_quiet(float128 a, float128 b, float_status *status)
6796{
6797    return float128_compare_internal(a, b, 1, status);
6798}
6799
6800floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6801{
6802    flag aSign;
6803    int32_t aExp;
6804    uint64_t aSig;
6805
6806    if (floatx80_invalid_encoding(a)) {
6807        float_raise(float_flag_invalid, status);
6808        return floatx80_default_nan(status);
6809    }
6810    aSig = extractFloatx80Frac( a );
6811    aExp = extractFloatx80Exp( a );
6812    aSign = extractFloatx80Sign( a );
6813
6814    if ( aExp == 0x7FFF ) {
6815        if ( aSig<<1 ) {
6816            return propagateFloatx80NaN(a, a, status);
6817        }
6818        return a;
6819    }
6820
6821    if (aExp == 0) {
6822        if (aSig == 0) {
6823            return a;
6824        }
6825        aExp++;
6826    }
6827
6828    if (n > 0x10000) {
6829        n = 0x10000;
6830    } else if (n < -0x10000) {
6831        n = -0x10000;
6832    }
6833
6834    aExp += n;
6835    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6836                                         aSign, aExp, aSig, 0, status);
6837}
6838
6839float128 float128_scalbn(float128 a, int n, float_status *status)
6840{
6841    flag aSign;
6842    int32_t aExp;
6843    uint64_t aSig0, aSig1;
6844
6845    aSig1 = extractFloat128Frac1( a );
6846    aSig0 = extractFloat128Frac0( a );
6847    aExp = extractFloat128Exp( a );
6848    aSign = extractFloat128Sign( a );
6849    if ( aExp == 0x7FFF ) {
6850        if ( aSig0 | aSig1 ) {
6851            return propagateFloat128NaN(a, a, status);
6852        }
6853        return a;
6854    }
6855    if (aExp != 0) {
6856        aSig0 |= LIT64( 0x0001000000000000 );
6857    } else if (aSig0 == 0 && aSig1 == 0) {
6858        return a;
6859    } else {
6860        aExp++;
6861    }
6862
6863    if (n > 0x10000) {
6864        n = 0x10000;
6865    } else if (n < -0x10000) {
6866        n = -0x10000;
6867    }
6868
6869    aExp += n - 1;
6870    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6871                                         , status);
6872
6873}
6874