LXR qemu/fpu/softfloat.c

   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "qemu/osdep.h"
  86#include "qemu/bitops.h"
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90
  91/*----------------------------------------------------------------------------
  92| Primitive arithmetic functions, including multi-word arithmetic, and
  93| division and square root approximations.  (Can be specialized to target if
  94| desired.)
  95*----------------------------------------------------------------------------*/
  96#include "fpu/softfloat-macros.h"
  97
  98/*----------------------------------------------------------------------------
  99| Functions and definitions to determine:  (1) whether tininess for underflow
 100| is detected before or after rounding by default, (2) what (if anything)
 101| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103| are propagated from function inputs to output.  These details are target-
 104| specific.
 105*----------------------------------------------------------------------------*/
 106#include "softfloat-specialize.h"
 107
 108/*----------------------------------------------------------------------------
 109| Returns the fraction bits of the half-precision floating-point value `a'.
 110*----------------------------------------------------------------------------*/
 111
 112static inline uint32_t extractFloat16Frac(float16 a)
 113{
 114    return float16_val(a) & 0x3ff;
 115}
 116
 117/*----------------------------------------------------------------------------
 118| Returns the exponent bits of the half-precision floating-point value `a'.
 119*----------------------------------------------------------------------------*/
 120
 121static inline int extractFloat16Exp(float16 a)
 122{
 123    return (float16_val(a) >> 10) & 0x1f;
 124}
 125
 126/*----------------------------------------------------------------------------
 127| Returns the sign bit of the single-precision floating-point value `a'.
 128*----------------------------------------------------------------------------*/
 129
 130static inline flag extractFloat16Sign(float16 a)
 131{
 132    return float16_val(a)>>15;
 133}
 134
 135/*----------------------------------------------------------------------------
 136| Returns the fraction bits of the single-precision floating-point value `a'.
 137*----------------------------------------------------------------------------*/
 138
 139static inline uint32_t extractFloat32Frac(float32 a)
 140{
 141    return float32_val(a) & 0x007FFFFF;
 142}
 143
 144/*----------------------------------------------------------------------------
 145| Returns the exponent bits of the single-precision floating-point value `a'.
 146*----------------------------------------------------------------------------*/
 147
 148static inline int extractFloat32Exp(float32 a)
 149{
 150    return (float32_val(a) >> 23) & 0xFF;
 151}
 152
 153/*----------------------------------------------------------------------------
 154| Returns the sign bit of the single-precision floating-point value `a'.
 155*----------------------------------------------------------------------------*/
 156
 157static inline flag extractFloat32Sign(float32 a)
 158{
 159    return float32_val(a) >> 31;
 160}
 161
 162/*----------------------------------------------------------------------------
 163| Returns the fraction bits of the double-precision floating-point value `a'.
 164*----------------------------------------------------------------------------*/
 165
 166static inline uint64_t extractFloat64Frac(float64 a)
 167{
 168    return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169}
 170
 171/*----------------------------------------------------------------------------
 172| Returns the exponent bits of the double-precision floating-point value `a'.
 173*----------------------------------------------------------------------------*/
 174
 175static inline int extractFloat64Exp(float64 a)
 176{
 177    return (float64_val(a) >> 52) & 0x7FF;
 178}
 179
 180/*----------------------------------------------------------------------------
 181| Returns the sign bit of the double-precision floating-point value `a'.
 182*----------------------------------------------------------------------------*/
 183
 184static inline flag extractFloat64Sign(float64 a)
 185{
 186    return float64_val(a) >> 63;
 187}
 188
 189/*
 190 * Classify a floating point number. Everything above float_class_qnan
 191 * is a NaN so cls >= float_class_qnan is any NaN.
 192 */
 193
 194typedef enum __attribute__ ((__packed__)) {
 195    float_class_unclassified,
 196    float_class_zero,
 197    float_class_normal,
 198    float_class_inf,
 199    float_class_qnan,  /* all NaNs from here */
 200    float_class_snan,
 201    float_class_dnan,
 202    float_class_msnan, /* maybe silenced */
 203} FloatClass;
 204
 205/*
 206 * Structure holding all of the decomposed parts of a float. The
 207 * exponent is unbiased and the fraction is normalized. All
 208 * calculations are done with a 64 bit fraction and then rounded as
 209 * appropriate for the final format.
 210 *
 211 * Thanks to the packed FloatClass a decent compiler should be able to
 212 * fit the whole structure into registers and avoid using the stack
 213 * for parameter passing.
 214 */
 215
 216typedef struct {
 217    uint64_t frac;
 218    int32_t  exp;
 219    FloatClass cls;
 220    bool sign;
 221} FloatParts;
 222
 223#define DECOMPOSED_BINARY_POINT    (64 - 2)
 224#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227/* Structure holding all of the relevant parameters for a format.
 228 *   exp_size: the size of the exponent field
 229 *   exp_bias: the offset applied to the exponent field
 230 *   exp_max: the maximum normalised exponent
 231 *   frac_size: the size of the fraction field
 232 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233 * The following are computed based the size of fraction
 234 *   frac_lsb: least significant bit of fraction
 235 *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236 *   round_mask/roundeven_mask: masks used for rounding
 237 */
 238typedef struct {
 239    int exp_size;
 240    int exp_bias;
 241    int exp_max;
 242    int frac_size;
 243    int frac_shift;
 244    uint64_t frac_lsb;
 245    uint64_t frac_lsbm1;
 246    uint64_t round_mask;
 247    uint64_t roundeven_mask;
 248} FloatFmt;
 249
 250/* Expand fields based on the size of exponent and fraction */
 251#define FLOAT_PARAMS(E, F)                                           \
 252    .exp_size       = E,                                             \
 253    .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254    .exp_max        = (1 << E) - 1,                                  \
 255    .frac_size      = F,                                             \
 256    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262static const FloatFmt float16_params = {
 263    FLOAT_PARAMS(5, 10)
 264};
 265
 266static const FloatFmt float32_params = {
 267    FLOAT_PARAMS(8, 23)
 268};
 269
 270static const FloatFmt float64_params = {
 271    FLOAT_PARAMS(11, 52)
 272};
 273
 274/* Unpack a float to parts, but do not canonicalize.  */
 275static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276{
 277    const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279    return (FloatParts) {
 280        .cls = float_class_unclassified,
 281        .sign = extract64(raw, sign_pos, 1),
 282        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283        .frac = extract64(raw, 0, fmt.frac_size),
 284    };
 285}
 286
 287static inline FloatParts float16_unpack_raw(float16 f)
 288{
 289    return unpack_raw(float16_params, f);
 290}
 291
 292static inline FloatParts float32_unpack_raw(float32 f)
 293{
 294    return unpack_raw(float32_params, f);
 295}
 296
 297static inline FloatParts float64_unpack_raw(float64 f)
 298{
 299    return unpack_raw(float64_params, f);
 300}
 301
 302/* Pack a float from parts, but do not canonicalize.  */
 303static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304{
 305    const int sign_pos = fmt.frac_size + fmt.exp_size;
 306    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307    return deposit64(ret, sign_pos, 1, p.sign);
 308}
 309
 310static inline float16 float16_pack_raw(FloatParts p)
 311{
 312    return make_float16(pack_raw(float16_params, p));
 313}
 314
 315static inline float32 float32_pack_raw(FloatParts p)
 316{
 317    return make_float32(pack_raw(float32_params, p));
 318}
 319
 320static inline float64 float64_pack_raw(FloatParts p)
 321{
 322    return make_float64(pack_raw(float64_params, p));
 323}
 324
 325/* Canonicalize EXP and FRAC, setting CLS.  */
 326static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                               float_status *status)
 328{
 329    if (part.exp == parm->exp_max) {
 330        if (part.frac == 0) {
 331            part.cls = float_class_inf;
 332        } else {
 333#ifdef NO_SIGNALING_NANS
 334            part.cls = float_class_qnan;
 335#else
 336            int64_t msb = part.frac << (parm->frac_shift + 2);
 337            if ((msb < 0) == status->snan_bit_is_one) {
 338                part.cls = float_class_snan;
 339            } else {
 340                part.cls = float_class_qnan;
 341            }
 342#endif
 343        }
 344    } else if (part.exp == 0) {
 345        if (likely(part.frac == 0)) {
 346            part.cls = float_class_zero;
 347        } else if (status->flush_inputs_to_zero) {
 348            float_raise(float_flag_input_denormal, status);
 349            part.cls = float_class_zero;
 350            part.frac = 0;
 351        } else {
 352            int shift = clz64(part.frac) - 1;
 353            part.cls = float_class_normal;
 354            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355            part.frac <<= shift;
 356        }
 357    } else {
 358        part.cls = float_class_normal;
 359        part.exp -= parm->exp_bias;
 360        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361    }
 362    return part;
 363}
 364
 365/* Round and uncanonicalize a floating-point number by parts. There
 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367 * fraction; these bits will be removed. The exponent will be biased
 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369 */
 370
 371static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                  const FloatFmt *parm)
 373{
 374    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375    const uint64_t round_mask = parm->round_mask;
 376    const uint64_t roundeven_mask = parm->roundeven_mask;
 377    const int exp_max = parm->exp_max;
 378    const int frac_shift = parm->frac_shift;
 379    uint64_t frac, inc;
 380    int exp, flags = 0;
 381    bool overflow_norm;
 382
 383    frac = p.frac;
 384    exp = p.exp;
 385
 386    switch (p.cls) {
 387    case float_class_normal:
 388        switch (s->float_rounding_mode) {
 389        case float_round_nearest_even:
 390            overflow_norm = false;
 391            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392            break;
 393        case float_round_ties_away:
 394            overflow_norm = false;
 395            inc = frac_lsbm1;
 396            break;
 397        case float_round_to_zero:
 398            overflow_norm = true;
 399            inc = 0;
 400            break;
 401        case float_round_up:
 402            inc = p.sign ? 0 : round_mask;
 403            overflow_norm = p.sign;
 404            break;
 405        case float_round_down:
 406            inc = p.sign ? round_mask : 0;
 407            overflow_norm = !p.sign;
 408            break;
 409        default:
 410            g_assert_not_reached();
 411        }
 412
 413        exp += parm->exp_bias;
 414        if (likely(exp > 0)) {
 415            if (frac & round_mask) {
 416                flags |= float_flag_inexact;
 417                frac += inc;
 418                if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                    frac >>= 1;
 420                    exp++;
 421                }
 422            }
 423            frac >>= frac_shift;
 424
 425            if (unlikely(exp >= exp_max)) {
 426                flags |= float_flag_overflow | float_flag_inexact;
 427                if (overflow_norm) {
 428                    exp = exp_max - 1;
 429                    frac = -1;
 430                } else {
 431                    p.cls = float_class_inf;
 432                    goto do_inf;
 433                }
 434            }
 435        } else if (s->flush_to_zero) {
 436            flags |= float_flag_output_denormal;
 437            p.cls = float_class_zero;
 438            goto do_zero;
 439        } else {
 440            bool is_tiny = (s->float_detect_tininess
 441                            == float_tininess_before_rounding)
 442                        || (exp < 0)
 443                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445            shift64RightJamming(frac, 1 - exp, &frac);
 446            if (frac & round_mask) {
 447                /* Need to recompute round-to-even.  */
 448                if (s->float_rounding_mode == float_round_nearest_even) {
 449                    inc = ((frac & roundeven_mask) != frac_lsbm1
 450                           ? frac_lsbm1 : 0);
 451                }
 452                flags |= float_flag_inexact;
 453                frac += inc;
 454            }
 455
 456            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457            frac >>= frac_shift;
 458
 459            if (is_tiny && (flags & float_flag_inexact)) {
 460                flags |= float_flag_underflow;
 461            }
 462            if (exp == 0 && frac == 0) {
 463                p.cls = float_class_zero;
 464            }
 465        }
 466        break;
 467
 468    case float_class_zero:
 469    do_zero:
 470        exp = 0;
 471        frac = 0;
 472        break;
 473
 474    case float_class_inf:
 475    do_inf:
 476        exp = exp_max;
 477        frac = 0;
 478        break;
 479
 480    case float_class_qnan:
 481    case float_class_snan:
 482        exp = exp_max;
 483        break;
 484
 485    default:
 486        g_assert_not_reached();
 487    }
 488
 489    float_raise(flags, s);
 490    p.exp = exp;
 491    p.frac = frac;
 492    return p;
 493}
 494
 495static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496{
 497    return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498}
 499
 500static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501{
 502    switch (p.cls) {
 503    case float_class_dnan:
 504        return float16_default_nan(s);
 505    case float_class_msnan:
 506        return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507    default:
 508        p = round_canonical(p, s, &float16_params);
 509        return float16_pack_raw(p);
 510    }
 511}
 512
 513static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514{
 515    return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516}
 517
 518static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519{
 520    switch (p.cls) {
 521    case float_class_dnan:
 522        return float32_default_nan(s);
 523    case float_class_msnan:
 524        return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525    default:
 526        p = round_canonical(p, s, &float32_params);
 527        return float32_pack_raw(p);
 528    }
 529}
 530
 531static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532{
 533    return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534}
 535
 536static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537{
 538    switch (p.cls) {
 539    case float_class_dnan:
 540        return float64_default_nan(s);
 541    case float_class_msnan:
 542        return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543    default:
 544        p = round_canonical(p, s, &float64_params);
 545        return float64_pack_raw(p);
 546    }
 547}
 548
 549/* Simple helpers for checking if what NaN we have */
 550static bool is_nan(FloatClass c)
 551{
 552    return unlikely(c >= float_class_qnan);
 553}
 554static bool is_snan(FloatClass c)
 555{
 556    return c == float_class_snan;
 557}
 558static bool is_qnan(FloatClass c)
 559{
 560    return c == float_class_qnan;
 561}
 562
 563static FloatParts return_nan(FloatParts a, float_status *s)
 564{
 565    switch (a.cls) {
 566    case float_class_snan:
 567        s->float_exception_flags |= float_flag_invalid;
 568        a.cls = float_class_msnan;
 569        /* fall through */
 570    case float_class_qnan:
 571        if (s->default_nan_mode) {
 572            a.cls = float_class_dnan;
 573        }
 574        break;
 575
 576    default:
 577        g_assert_not_reached();
 578    }
 579    return a;
 580}
 581
 582static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583{
 584    if (is_snan(a.cls) || is_snan(b.cls)) {
 585        s->float_exception_flags |= float_flag_invalid;
 586    }
 587
 588    if (s->default_nan_mode) {
 589        a.cls = float_class_dnan;
 590    } else {
 591        if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                    is_qnan(b.cls), is_snan(b.cls),
 593                    a.frac > b.frac ||
 594                    (a.frac == b.frac && a.sign < b.sign))) {
 595            a = b;
 596        }
 597        a.cls = float_class_msnan;
 598    }
 599    return a;
 600}
 601
 602static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                  bool inf_zero, float_status *s)
 604{
 605    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606        s->float_exception_flags |= float_flag_invalid;
 607    }
 608
 609    if (s->default_nan_mode) {
 610        a.cls = float_class_dnan;
 611    } else {
 612        switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                              is_qnan(b.cls), is_snan(b.cls),
 614                              is_qnan(c.cls), is_snan(c.cls),
 615                              inf_zero, s)) {
 616        case 0:
 617            break;
 618        case 1:
 619            a = b;
 620            break;
 621        case 2:
 622            a = c;
 623            break;
 624        case 3:
 625            a.cls = float_class_dnan;
 626            return a;
 627        default:
 628            g_assert_not_reached();
 629        }
 630
 631        a.cls = float_class_msnan;
 632    }
 633    return a;
 634}
 635
 636/*
 637 * Returns the result of adding or subtracting the values of the
 638 * floating-point values `a' and `b'. The operation is performed
 639 * according to the IEC/IEEE Standard for Binary Floating-Point
 640 * Arithmetic.
 641 */
 642
 643static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                float_status *s)
 645{
 646    bool a_sign = a.sign;
 647    bool b_sign = b.sign ^ subtract;
 648
 649    if (a_sign != b_sign) {
 650        /* Subtraction */
 651
 652        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                a.frac = a.frac - b.frac;
 656            } else {
 657                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                a.frac = b.frac - a.frac;
 659                a.exp = b.exp;
 660                a_sign ^= 1;
 661            }
 662
 663            if (a.frac == 0) {
 664                a.cls = float_class_zero;
 665                a.sign = s->float_rounding_mode == float_round_down;
 666            } else {
 667                int shift = clz64(a.frac) - 1;
 668                a.frac = a.frac << shift;
 669                a.exp = a.exp - shift;
 670                a.sign = a_sign;
 671            }
 672            return a;
 673        }
 674        if (is_nan(a.cls) || is_nan(b.cls)) {
 675            return pick_nan(a, b, s);
 676        }
 677        if (a.cls == float_class_inf) {
 678            if (b.cls == float_class_inf) {
 679                float_raise(float_flag_invalid, s);
 680                a.cls = float_class_dnan;
 681            }
 682            return a;
 683        }
 684        if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685            a.sign = s->float_rounding_mode == float_round_down;
 686            return a;
 687        }
 688        if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689            b.sign = a_sign ^ 1;
 690            return b;
 691        }
 692        if (b.cls == float_class_zero) {
 693            return a;
 694        }
 695    } else {
 696        /* Addition */
 697        if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698            if (a.exp > b.exp) {
 699                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700            } else if (a.exp < b.exp) {
 701                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                a.exp = b.exp;
 703            }
 704            a.frac += b.frac;
 705            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                a.frac >>= 1;
 707                a.exp += 1;
 708            }
 709            return a;
 710        }
 711        if (is_nan(a.cls) || is_nan(b.cls)) {
 712            return pick_nan(a, b, s);
 713        }
 714        if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715            return a;
 716        }
 717        if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718            b.sign = b_sign;
 719            return b;
 720        }
 721    }
 722    g_assert_not_reached();
 723}
 724
 725/*
 726 * Returns the result of adding or subtracting the floating-point
 727 * values `a' and `b'. The operation is performed according to the
 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729 */
 730
 731float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                              float_status *status)
 733{
 734    FloatParts pa = float16_unpack_canonical(a, status);
 735    FloatParts pb = float16_unpack_canonical(b, status);
 736    FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738    return float16_round_pack_canonical(pr, status);
 739}
 740
 741float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                             float_status *status)
 743{
 744    FloatParts pa = float32_unpack_canonical(a, status);
 745    FloatParts pb = float32_unpack_canonical(b, status);
 746    FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748    return float32_round_pack_canonical(pr, status);
 749}
 750
 751float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                             float_status *status)
 753{
 754    FloatParts pa = float64_unpack_canonical(a, status);
 755    FloatParts pb = float64_unpack_canonical(b, status);
 756    FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758    return float64_round_pack_canonical(pr, status);
 759}
 760
 761float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                             float_status *status)
 763{
 764    FloatParts pa = float16_unpack_canonical(a, status);
 765    FloatParts pb = float16_unpack_canonical(b, status);
 766    FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768    return float16_round_pack_canonical(pr, status);
 769}
 770
 771float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                             float_status *status)
 773{
 774    FloatParts pa = float32_unpack_canonical(a, status);
 775    FloatParts pb = float32_unpack_canonical(b, status);
 776    FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778    return float32_round_pack_canonical(pr, status);
 779}
 780
 781float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                             float_status *status)
 783{
 784    FloatParts pa = float64_unpack_canonical(a, status);
 785    FloatParts pb = float64_unpack_canonical(b, status);
 786    FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788    return float64_round_pack_canonical(pr, status);
 789}
 790
 791/*
 792 * Returns the result of multiplying the floating-point values `a' and
 793 * `b'. The operation is performed according to the IEC/IEEE Standard
 794 * for Binary Floating-Point Arithmetic.
 795 */
 796
 797static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798{
 799    bool sign = a.sign ^ b.sign;
 800
 801    if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802        uint64_t hi, lo;
 803        int exp = a.exp + b.exp;
 804
 805        mul64To128(a.frac, b.frac, &hi, &lo);
 806        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807        if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808            shift64RightJamming(lo, 1, &lo);
 809            exp += 1;
 810        }
 811
 812        /* Re-use a */
 813        a.exp = exp;
 814        a.sign = sign;
 815        a.frac = lo;
 816        return a;
 817    }
 818    /* handle all the NaN cases */
 819    if (is_nan(a.cls) || is_nan(b.cls)) {
 820        return pick_nan(a, b, s);
 821    }
 822    /* Inf * Zero == NaN */
 823    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824        (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825        s->float_exception_flags |= float_flag_invalid;
 826        a.cls = float_class_dnan;
 827        a.sign = sign;
 828        return a;
 829    }
 830    /* Multiply by 0 or Inf */
 831    if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832        a.sign = sign;
 833        return a;
 834    }
 835    if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836        b.sign = sign;
 837        return b;
 838    }
 839    g_assert_not_reached();
 840}
 841
 842float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                             float_status *status)
 844{
 845    FloatParts pa = float16_unpack_canonical(a, status);
 846    FloatParts pb = float16_unpack_canonical(b, status);
 847    FloatParts pr = mul_floats(pa, pb, status);
 848
 849    return float16_round_pack_canonical(pr, status);
 850}
 851
 852float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                             float_status *status)
 854{
 855    FloatParts pa = float32_unpack_canonical(a, status);
 856    FloatParts pb = float32_unpack_canonical(b, status);
 857    FloatParts pr = mul_floats(pa, pb, status);
 858
 859    return float32_round_pack_canonical(pr, status);
 860}
 861
 862float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                             float_status *status)
 864{
 865    FloatParts pa = float64_unpack_canonical(a, status);
 866    FloatParts pb = float64_unpack_canonical(b, status);
 867    FloatParts pr = mul_floats(pa, pb, status);
 868
 869    return float64_round_pack_canonical(pr, status);
 870}
 871
 872/*
 873 * Returns the result of multiplying the floating-point values `a' and
 874 * `b' then adding 'c', with no intermediate rounding step after the
 875 * multiplication. The operation is performed according to the
 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877 * The flags argument allows the caller to select negation of the
 878 * addend, the intermediate product, or the final result. (The
 879 * difference between this and having the caller do a separate
 880 * negation is that negating externally will flip the sign bit on
 881 * NaNs.)
 882 */
 883
 884static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                int flags, float_status *s)
 886{
 887    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                    ((1 << float_class_inf) | (1 << float_class_zero));
 889    bool p_sign;
 890    bool sign_flip = flags & float_muladd_negate_result;
 891    FloatClass p_class;
 892    uint64_t hi, lo;
 893    int p_exp;
 894
 895    /* It is implementation-defined whether the cases of (0,inf,qnan)
 896     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897     * they return if they do), so we have to hand this information
 898     * off to the target-specific pick-a-NaN routine.
 899     */
 900    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901        return pick_nan_muladd(a, b, c, inf_zero, s);
 902    }
 903
 904    if (inf_zero) {
 905        s->float_exception_flags |= float_flag_invalid;
 906        a.cls = float_class_dnan;
 907        return a;
 908    }
 909
 910    if (flags & float_muladd_negate_c) {
 911        c.sign ^= 1;
 912    }
 913
 914    p_sign = a.sign ^ b.sign;
 915
 916    if (flags & float_muladd_negate_product) {
 917        p_sign ^= 1;
 918    }
 919
 920    if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921        p_class = float_class_inf;
 922    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923        p_class = float_class_zero;
 924    } else {
 925        p_class = float_class_normal;
 926    }
 927
 928    if (c.cls == float_class_inf) {
 929        if (p_class == float_class_inf && p_sign != c.sign) {
 930            s->float_exception_flags |= float_flag_invalid;
 931            a.cls = float_class_dnan;
 932        } else {
 933            a.cls = float_class_inf;
 934            a.sign = c.sign ^ sign_flip;
 935        }
 936        return a;
 937    }
 938
 939    if (p_class == float_class_inf) {
 940        a.cls = float_class_inf;
 941        a.sign = p_sign ^ sign_flip;
 942        return a;
 943    }
 944
 945    if (p_class == float_class_zero) {
 946        if (c.cls == float_class_zero) {
 947            if (p_sign != c.sign) {
 948                p_sign = s->float_rounding_mode == float_round_down;
 949            }
 950            c.sign = p_sign;
 951        } else if (flags & float_muladd_halve_result) {
 952            c.exp -= 1;
 953        }
 954        c.sign ^= sign_flip;
 955        return c;
 956    }
 957
 958    /* a & b should be normals now... */
 959    assert(a.cls == float_class_normal &&
 960           b.cls == float_class_normal);
 961
 962    p_exp = a.exp + b.exp;
 963
 964    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965     * result.
 966     */
 967    mul64To128(a.frac, b.frac, &hi, &lo);
 968    /* binary point now at bit 124 */
 969
 970    /* check for overflow */
 971    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972        shift128RightJamming(hi, lo, 1, &hi, &lo);
 973        p_exp += 1;
 974    }
 975
 976    /* + add/sub */
 977    if (c.cls == float_class_zero) {
 978        /* move binary point back to 62 */
 979        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980    } else {
 981        int exp_diff = p_exp - c.exp;
 982        if (p_sign == c.sign) {
 983            /* Addition */
 984            if (exp_diff <= 0) {
 985                shift128RightJamming(hi, lo,
 986                                     DECOMPOSED_BINARY_POINT - exp_diff,
 987                                     &hi, &lo);
 988                lo += c.frac;
 989                p_exp = c.exp;
 990            } else {
 991                uint64_t c_hi, c_lo;
 992                /* shift c to the same binary point as the product (124) */
 993                c_hi = c.frac >> 2;
 994                c_lo = 0;
 995                shift128RightJamming(c_hi, c_lo,
 996                                     exp_diff,
 997                                     &c_hi, &c_lo);
 998                add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                /* move binary point back to 62 */
1000                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);

1001            }
1002
1003            if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                shift64RightJamming(lo, 1, &lo);
1005                p_exp += 1;
1006            }
1007
1008        } else {
1009            /* Subtraction */
1010            uint64_t c_hi, c_lo;
1011            /* make C binary point match product at bit 124 */
1012            c_hi = c.frac >> 2;
1013            c_lo = 0;
1014
1015            if (exp_diff <= 0) {
1016                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                if (exp_diff == 0
1018                    &&
1019                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                } else {
1022                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                    p_sign ^= 1;
1024                    p_exp = c.exp;
1025                }
1026            } else {
1027                shift128RightJamming(c_hi, c_lo,
1028                                     exp_diff,
1029                                     &c_hi, &c_lo);
1030                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031            }
1032
1033            if (hi == 0 && lo == 0) {
1034                a.cls = float_class_zero;
1035                a.sign = s->float_rounding_mode == float_round_down;
1036                a.sign ^= sign_flip;
1037                return a;
1038            } else {
1039                int shift;
1040                if (hi != 0) {
1041                    shift = clz64(hi);
1042                } else {
1043                    shift = clz64(lo) + 64;
1044                }
1045                /* Normalizing to a binary point of 124 is the
1046                   correct adjust for the exponent.  However since we're
1047                   shifting, we might as well put the binary point back
1048                   at 62 where we really want it.  Therefore shift as
1049                   if we're leaving 1 bit at the top of the word, but
1050                   adjust the exponent as if we're leaving 3 bits.  */
1051                shift -= 1;
1052                if (shift >= 64) {
1053                    lo = lo << (shift - 64);
1054                } else {
1055                    hi = (hi << shift) | (lo >> (64 - shift));
1056                    lo = hi | ((lo << shift) != 0);
1057                }
1058                p_exp -= shift - 2;
1059            }
1060        }
1061    }
1062
1063    if (flags & float_muladd_halve_result) {
1064        p_exp -= 1;
1065    }
1066
1067    /* finally prepare our result */
1068    a.cls = float_class_normal;
1069    a.sign = p_sign ^ sign_flip;
1070    a.exp = p_exp;
1071    a.frac = lo;
1072
1073    return a;
1074}
1075
1076float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                int flags, float_status *status)
1078{
1079    FloatParts pa = float16_unpack_canonical(a, status);
1080    FloatParts pb = float16_unpack_canonical(b, status);
1081    FloatParts pc = float16_unpack_canonical(c, status);
1082    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084    return float16_round_pack_canonical(pr, status);
1085}
1086
1087float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                int flags, float_status *status)
1089{
1090    FloatParts pa = float32_unpack_canonical(a, status);
1091    FloatParts pb = float32_unpack_canonical(b, status);
1092    FloatParts pc = float32_unpack_canonical(c, status);
1093    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095    return float32_round_pack_canonical(pr, status);
1096}
1097
1098float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                int flags, float_status *status)
1100{
1101    FloatParts pa = float64_unpack_canonical(a, status);
1102    FloatParts pb = float64_unpack_canonical(b, status);
1103    FloatParts pc = float64_unpack_canonical(c, status);
1104    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106    return float64_round_pack_canonical(pr, status);
1107}
1108
1109/*
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 */
1114
1115static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116{
1117    bool sign = a.sign ^ b.sign;
1118
1119    if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120        uint64_t temp_lo, temp_hi;
1121        int exp = a.exp - b.exp;
1122        if (a.frac < b.frac) {
1123            exp -= 1;
1124            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                              &temp_hi, &temp_lo);
1126        } else {
1127            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                              &temp_hi, &temp_lo);
1129        }
1130        /* LSB of quot is set if inexact which roundandpack will use
1131         * to set flags. Yet again we re-use a for the result */
1132        a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133        a.sign = sign;
1134        a.exp = exp;
1135        return a;
1136    }
1137    /* handle all the NaN cases */
1138    if (is_nan(a.cls) || is_nan(b.cls)) {
1139        return pick_nan(a, b, s);
1140    }
1141    /* 0/0 or Inf/Inf */
1142    if (a.cls == b.cls
1143        &&
1144        (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145        s->float_exception_flags |= float_flag_invalid;
1146        a.cls = float_class_dnan;
1147        return a;
1148    }
1149    /* Inf / x or 0 / x */
1150    if (a.cls == float_class_inf || a.cls == float_class_zero) {
1151        a.sign = sign;
1152        return a;
1153    }
1154    /* Div 0 => Inf */
1155    if (b.cls == float_class_zero) {
1156        s->float_exception_flags |= float_flag_divbyzero;
1157        a.cls = float_class_inf;
1158        a.sign = sign;
1159        return a;
1160    }
1161    /* Div by Inf */
1162    if (b.cls == float_class_inf) {
1163        a.cls = float_class_zero;
1164        a.sign = sign;
1165        return a;
1166    }
1167    g_assert_not_reached();
1168}
1169
1170float16 float16_div(float16 a, float16 b, float_status *status)
1171{
1172    FloatParts pa = float16_unpack_canonical(a, status);
1173    FloatParts pb = float16_unpack_canonical(b, status);
1174    FloatParts pr = div_floats(pa, pb, status);
1175
1176    return float16_round_pack_canonical(pr, status);
1177}
1178
1179float32 float32_div(float32 a, float32 b, float_status *status)
1180{
1181    FloatParts pa = float32_unpack_canonical(a, status);
1182    FloatParts pb = float32_unpack_canonical(b, status);
1183    FloatParts pr = div_floats(pa, pb, status);
1184
1185    return float32_round_pack_canonical(pr, status);
1186}
1187
1188float64 float64_div(float64 a, float64 b, float_status *status)
1189{
1190    FloatParts pa = float64_unpack_canonical(a, status);
1191    FloatParts pb = float64_unpack_canonical(b, status);
1192    FloatParts pr = div_floats(pa, pb, status);
1193
1194    return float64_round_pack_canonical(pr, status);
1195}
1196
1197/*
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1202 */
1203
1204static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205{
1206    if (is_nan(a.cls)) {
1207        return return_nan(a, s);
1208    }
1209
1210    switch (a.cls) {
1211    case float_class_zero:
1212    case float_class_inf:
1213    case float_class_qnan:
1214        /* already "integral" */
1215        break;
1216    case float_class_normal:
1217        if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218            /* already integral */
1219            break;
1220        }
1221        if (a.exp < 0) {
1222            bool one;
1223            /* all fractional */
1224            s->float_exception_flags |= float_flag_inexact;
1225            switch (rounding_mode) {
1226            case float_round_nearest_even:
1227                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                break;
1229            case float_round_ties_away:
1230                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                break;
1232            case float_round_to_zero:
1233                one = false;
1234                break;
1235            case float_round_up:
1236                one = !a.sign;
1237                break;
1238            case float_round_down:
1239                one = a.sign;
1240                break;
1241            default:
1242                g_assert_not_reached();
1243            }
1244
1245            if (one) {
1246                a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                a.exp = 0;
1248            } else {
1249                a.cls = float_class_zero;
1250            }
1251        } else {
1252            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253            uint64_t frac_lsbm1 = frac_lsb >> 1;
1254            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255            uint64_t rnd_mask = rnd_even_mask >> 1;
1256            uint64_t inc;
1257
1258            switch (rounding_mode) {
1259            case float_round_nearest_even:
1260                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                break;
1262            case float_round_ties_away:
1263                inc = frac_lsbm1;
1264                break;
1265            case float_round_to_zero:
1266                inc = 0;
1267                break;
1268            case float_round_up:
1269                inc = a.sign ? 0 : rnd_mask;
1270                break;
1271            case float_round_down:
1272                inc = a.sign ? rnd_mask : 0;
1273                break;
1274            default:
1275                g_assert_not_reached();
1276            }
1277
1278            if (a.frac & rnd_mask) {
1279                s->float_exception_flags |= float_flag_inexact;
1280                a.frac += inc;
1281                a.frac &= ~rnd_mask;
1282                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                    a.frac >>= 1;
1284                    a.exp++;
1285                }
1286            }
1287        }
1288        break;
1289    default:
1290        g_assert_not_reached();
1291    }
1292    return a;
1293}
1294
1295float16 float16_round_to_int(float16 a, float_status *s)
1296{
1297    FloatParts pa = float16_unpack_canonical(a, s);
1298    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299    return float16_round_pack_canonical(pr, s);
1300}
1301
1302float32 float32_round_to_int(float32 a, float_status *s)
1303{
1304    FloatParts pa = float32_unpack_canonical(a, s);
1305    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306    return float32_round_pack_canonical(pr, s);
1307}
1308
1309float64 float64_round_to_int(float64 a, float_status *s)
1310{
1311    FloatParts pa = float64_unpack_canonical(a, s);
1312    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313    return float64_round_pack_canonical(pr, s);
1314}
1315
1316float64 float64_trunc_to_int(float64 a, float_status *s)
1317{
1318    FloatParts pa = float64_unpack_canonical(a, s);
1319    FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320    return float64_round_pack_canonical(pr, s);
1321}
1322
1323/*
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1332*/
1333
1334static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                     int64_t min, int64_t max,
1336                                     float_status *s)
1337{
1338    uint64_t r;
1339    int orig_flags = get_float_exception_flags(s);
1340    FloatParts p = round_to_int(in, rmode, s);
1341
1342    switch (p.cls) {
1343    case float_class_snan:
1344    case float_class_qnan:
1345    case float_class_dnan:
1346    case float_class_msnan:
1347        s->float_exception_flags = orig_flags | float_flag_invalid;
1348        return max;
1349    case float_class_inf:
1350        s->float_exception_flags = orig_flags | float_flag_invalid;
1351        return p.sign ? min : max;
1352    case float_class_zero:
1353        return 0;
1354    case float_class_normal:
1355        if (p.exp < DECOMPOSED_BINARY_POINT) {
1356            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1357        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1358            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359        } else {
1360            r = UINT64_MAX;
1361        }
1362        if (p.sign) {
1363            if (r < -(uint64_t) min) {
1364                return -r;
1365            } else {
1366                s->float_exception_flags = orig_flags | float_flag_invalid;
1367                return min;
1368            }
1369        } else {
1370            if (r < max) {
1371                return r;
1372            } else {
1373                s->float_exception_flags = orig_flags | float_flag_invalid;
1374                return max;
1375            }
1376        }
1377    default:
1378        g_assert_not_reached();
1379    }
1380}
1381
1382#define FLOAT_TO_INT(fsz, isz)                                          \
1383int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1384                                                float_status *s)        \
1385{                                                                       \
1386    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1387    return round_to_int_and_pack(p, s->float_rounding_mode,             \
1388                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389                                 s);                                    \
1390}                                                                       \
1391                                                                        \
1392int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1393 (float ## fsz a, float_status *s)                                      \
1394{                                                                       \
1395    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1396    return round_to_int_and_pack(p, float_round_to_zero,                \
1397                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1398                                 s);                                    \
1399}
1400
1401FLOAT_TO_INT(16, 16)
1402FLOAT_TO_INT(16, 32)
1403FLOAT_TO_INT(16, 64)
1404
1405FLOAT_TO_INT(32, 16)
1406FLOAT_TO_INT(32, 32)
1407FLOAT_TO_INT(32, 64)
1408
1409FLOAT_TO_INT(64, 16)
1410FLOAT_TO_INT(64, 32)
1411FLOAT_TO_INT(64, 64)
1412
1413#undef FLOAT_TO_INT
1414
1415/*
1416 *  Returns the result of converting the floating-point value `a' to
1417 *  the unsigned integer format. The conversion is performed according
1418 *  to the IEC/IEEE Standard for Binary Floating-Point
1419 *  Arithmetic---which means in particular that the conversion is
1420 *  rounded according to the current rounding mode. If `a' is a NaN,
1421 *  the largest unsigned integer is returned. Otherwise, if the
1422 *  conversion overflows, the largest unsigned integer is returned. If
1423 *  the 'a' is negative, the result is rounded and zero is returned;
1424 *  values that do not round to zero will raise the inexact exception
1425 *  flag.
1426 */
1427
1428static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1429                                       float_status *s)
1430{
1431    int orig_flags = get_float_exception_flags(s);
1432    FloatParts p = round_to_int(in, rmode, s);
1433
1434    switch (p.cls) {
1435    case float_class_snan:
1436    case float_class_qnan:
1437    case float_class_dnan:
1438    case float_class_msnan:
1439        s->float_exception_flags = orig_flags | float_flag_invalid;
1440        return max;
1441    case float_class_inf:
1442        s->float_exception_flags = orig_flags | float_flag_invalid;
1443        return p.sign ? 0 : max;
1444    case float_class_zero:
1445        return 0;
1446    case float_class_normal:
1447    {
1448        uint64_t r;
1449        if (p.sign) {
1450            s->float_exception_flags = orig_flags | float_flag_invalid;
1451            return 0;
1452        }
1453
1454        if (p.exp < DECOMPOSED_BINARY_POINT) {
1455            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1456        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1457            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1458        } else {
1459            s->float_exception_flags = orig_flags | float_flag_invalid;
1460            return max;
1461        }
1462
1463        /* For uint64 this will never trip, but if p.exp is too large
1464         * to shift a decomposed fraction we shall have exited via the
1465         * 3rd leg above.
1466         */
1467        if (r > max) {
1468            s->float_exception_flags = orig_flags | float_flag_invalid;
1469            return max;
1470        } else {
1471            return r;
1472        }
1473    }
1474    default:
1475        g_assert_not_reached();
1476    }
1477}
1478
1479#define FLOAT_TO_UINT(fsz, isz) \
1480uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1481                                                  float_status *s)      \
1482{                                                                       \
1483    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1484    return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1485                                 UINT ## isz ## _MAX, s);               \
1486}                                                                       \
1487                                                                        \
1488uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1489 (float ## fsz a, float_status *s)                                      \
1490{                                                                       \
1491    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1492    return round_to_uint_and_pack(p, float_round_to_zero,               \
1493                                  UINT ## isz ## _MAX, s);              \
1494}
1495
1496FLOAT_TO_UINT(16, 16)
1497FLOAT_TO_UINT(16, 32)
1498FLOAT_TO_UINT(16, 64)
1499
1500FLOAT_TO_UINT(32, 16)
1501FLOAT_TO_UINT(32, 32)
1502FLOAT_TO_UINT(32, 64)
1503
1504FLOAT_TO_UINT(64, 16)
1505FLOAT_TO_UINT(64, 32)
1506FLOAT_TO_UINT(64, 64)
1507
1508#undef FLOAT_TO_UINT
1509
1510/*
1511 * Integer to float conversions
1512 *
1513 * Returns the result of converting the two's complement integer `a'
1514 * to the floating-point format. The conversion is performed according
1515 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1516 */
1517
1518static FloatParts int_to_float(int64_t a, float_status *status)
1519{
1520    FloatParts r;
1521    if (a == 0) {
1522        r.cls = float_class_zero;
1523        r.sign = false;
1524    } else if (a == (1ULL << 63)) {
1525        r.cls = float_class_normal;
1526        r.sign = true;
1527        r.frac = DECOMPOSED_IMPLICIT_BIT;
1528        r.exp = 63;
1529    } else {
1530        uint64_t f;
1531        if (a < 0) {
1532            f = -a;
1533            r.sign = true;
1534        } else {
1535            f = a;
1536            r.sign = false;
1537        }
1538        int shift = clz64(f) - 1;
1539        r.cls = float_class_normal;
1540        r.exp = (DECOMPOSED_BINARY_POINT - shift);
1541        r.frac = f << shift;
1542    }
1543
1544    return r;
1545}
1546
1547float16 int64_to_float16(int64_t a, float_status *status)
1548{
1549    FloatParts pa = int_to_float(a, status);
1550    return float16_round_pack_canonical(pa, status);
1551}
1552
1553float16 int32_to_float16(int32_t a, float_status *status)
1554{
1555    return int64_to_float16(a, status);
1556}
1557
1558float16 int16_to_float16(int16_t a, float_status *status)
1559{
1560    return int64_to_float16(a, status);
1561}
1562
1563float32 int64_to_float32(int64_t a, float_status *status)
1564{
1565    FloatParts pa = int_to_float(a, status);
1566    return float32_round_pack_canonical(pa, status);
1567}
1568
1569float32 int32_to_float32(int32_t a, float_status *status)
1570{
1571    return int64_to_float32(a, status);
1572}
1573
1574float32 int16_to_float32(int16_t a, float_status *status)
1575{
1576    return int64_to_float32(a, status);
1577}
1578
1579float64 int64_to_float64(int64_t a, float_status *status)
1580{
1581    FloatParts pa = int_to_float(a, status);
1582    return float64_round_pack_canonical(pa, status);
1583}
1584
1585float64 int32_to_float64(int32_t a, float_status *status)
1586{
1587    return int64_to_float64(a, status);
1588}
1589
1590float64 int16_to_float64(int16_t a, float_status *status)
1591{
1592    return int64_to_float64(a, status);
1593}
1594
1595
1596/*
1597 * Unsigned Integer to float conversions
1598 *
1599 * Returns the result of converting the unsigned integer `a' to the
1600 * floating-point format. The conversion is performed according to the
1601 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1602 */
1603
1604static FloatParts uint_to_float(uint64_t a, float_status *status)
1605{
1606    FloatParts r = { .sign = false};
1607
1608    if (a == 0) {
1609        r.cls = float_class_zero;
1610    } else {
1611        int spare_bits = clz64(a) - 1;
1612        r.cls = float_class_normal;
1613        r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1614        if (spare_bits < 0) {
1615            shift64RightJamming(a, -spare_bits, &a);
1616            r.frac = a;
1617        } else {
1618            r.frac = a << spare_bits;
1619        }
1620    }
1621
1622    return r;
1623}
1624
1625float16 uint64_to_float16(uint64_t a, float_status *status)
1626{
1627    FloatParts pa = uint_to_float(a, status);
1628    return float16_round_pack_canonical(pa, status);
1629}
1630
1631float16 uint32_to_float16(uint32_t a, float_status *status)
1632{
1633    return uint64_to_float16(a, status);
1634}
1635
1636float16 uint16_to_float16(uint16_t a, float_status *status)
1637{
1638    return uint64_to_float16(a, status);
1639}
1640
1641float32 uint64_to_float32(uint64_t a, float_status *status)
1642{
1643    FloatParts pa = uint_to_float(a, status);
1644    return float32_round_pack_canonical(pa, status);
1645}
1646
1647float32 uint32_to_float32(uint32_t a, float_status *status)
1648{
1649    return uint64_to_float32(a, status);
1650}
1651
1652float32 uint16_to_float32(uint16_t a, float_status *status)
1653{
1654    return uint64_to_float32(a, status);
1655}
1656
1657float64 uint64_to_float64(uint64_t a, float_status *status)
1658{
1659    FloatParts pa = uint_to_float(a, status);
1660    return float64_round_pack_canonical(pa, status);
1661}
1662
1663float64 uint32_to_float64(uint32_t a, float_status *status)
1664{
1665    return uint64_to_float64(a, status);
1666}
1667
1668float64 uint16_to_float64(uint16_t a, float_status *status)
1669{
1670    return uint64_to_float64(a, status);
1671}
1672
1673/* Float Min/Max */
1674/* min() and max() functions. These can't be implemented as
1675 * 'compare and pick one input' because that would mishandle
1676 * NaNs and +0 vs -0.
1677 *
1678 * minnum() and maxnum() functions. These are similar to the min()
1679 * and max() functions but if one of the arguments is a QNaN and
1680 * the other is numerical then the numerical argument is returned.
1681 * SNaNs will get quietened before being returned.
1682 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683 * and maxNum() operations. min() and max() are the typical min/max
1684 * semantics provided by many CPUs which predate that specification.
1685 *
1686 * minnummag() and maxnummag() functions correspond to minNumMag()
1687 * and minNumMag() from the IEEE-754 2008.
1688 */
1689static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1690                                bool ieee, bool ismag, float_status *s)
1691{
1692    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1693        if (ieee) {
1694            /* Takes two floating-point values `a' and `b', one of
1695             * which is a NaN, and returns the appropriate NaN
1696             * result. If either `a' or `b' is a signaling NaN,
1697             * the invalid exception is raised.
1698             */
1699            if (is_snan(a.cls) || is_snan(b.cls)) {
1700                return pick_nan(a, b, s);
1701            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1702                return b;
1703            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1704                return a;
1705            }
1706        }
1707        return pick_nan(a, b, s);
1708    } else {
1709        int a_exp, b_exp;
1710
1711        switch (a.cls) {
1712        case float_class_normal:
1713            a_exp = a.exp;
1714            break;
1715        case float_class_inf:
1716            a_exp = INT_MAX;
1717            break;
1718        case float_class_zero:
1719            a_exp = INT_MIN;
1720            break;
1721        default:
1722            g_assert_not_reached();
1723            break;
1724        }
1725        switch (b.cls) {
1726        case float_class_normal:
1727            b_exp = b.exp;
1728            break;
1729        case float_class_inf:
1730            b_exp = INT_MAX;
1731            break;
1732        case float_class_zero:
1733            b_exp = INT_MIN;
1734            break;
1735        default:
1736            g_assert_not_reached();
1737            break;
1738        }
1739
1740        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1741            bool a_less = a_exp < b_exp;
1742            if (a_exp == b_exp) {
1743                a_less = a.frac < b.frac;
1744            }
1745            return a_less ^ ismin ? b : a;
1746        }
1747
1748        if (a.sign == b.sign) {
1749            bool a_less = a_exp < b_exp;
1750            if (a_exp == b_exp) {
1751                a_less = a.frac < b.frac;
1752            }
1753            return a.sign ^ a_less ^ ismin ? b : a;
1754        } else {
1755            return a.sign ^ ismin ? b : a;
1756        }
1757    }
1758}
1759
1760#define MINMAX(sz, name, ismin, isiee, ismag)                           \
1761float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1762                                     float_status *s)                   \
1763{                                                                       \
1764    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1765    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1766    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1767                                                                        \
1768    return float ## sz ## _round_pack_canonical(pr, s);                 \
1769}
1770
1771MINMAX(16, min, true, false, false)
1772MINMAX(16, minnum, true, true, false)
1773MINMAX(16, minnummag, true, true, true)
1774MINMAX(16, max, false, false, false)
1775MINMAX(16, maxnum, false, true, false)
1776MINMAX(16, maxnummag, false, true, true)
1777
1778MINMAX(32, min, true, false, false)
1779MINMAX(32, minnum, true, true, false)
1780MINMAX(32, minnummag, true, true, true)
1781MINMAX(32, max, false, false, false)
1782MINMAX(32, maxnum, false, true, false)
1783MINMAX(32, maxnummag, false, true, true)
1784
1785MINMAX(64, min, true, false, false)
1786MINMAX(64, minnum, true, true, false)
1787MINMAX(64, minnummag, true, true, true)
1788MINMAX(64, max, false, false, false)
1789MINMAX(64, maxnum, false, true, false)
1790MINMAX(64, maxnummag, false, true, true)
1791
1792#undef MINMAX
1793
1794/* Floating point compare */
1795static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1796                          float_status *s)
1797{
1798    if (is_nan(a.cls) || is_nan(b.cls)) {
1799        if (!is_quiet ||
1800            a.cls == float_class_snan ||
1801            b.cls == float_class_snan) {
1802            s->float_exception_flags |= float_flag_invalid;
1803        }
1804        return float_relation_unordered;
1805    }
1806
1807    if (a.cls == float_class_zero) {
1808        if (b.cls == float_class_zero) {
1809            return float_relation_equal;
1810        }
1811        return b.sign ? float_relation_greater : float_relation_less;
1812    } else if (b.cls == float_class_zero) {
1813        return a.sign ? float_relation_less : float_relation_greater;
1814    }
1815
1816    /* The only really important thing about infinity is its sign. If
1817     * both are infinities the sign marks the smallest of the two.
1818     */
1819    if (a.cls == float_class_inf) {
1820        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1821            return float_relation_equal;
1822        }
1823        return a.sign ? float_relation_less : float_relation_greater;
1824    } else if (b.cls == float_class_inf) {
1825        return b.sign ? float_relation_greater : float_relation_less;
1826    }
1827
1828    if (a.sign != b.sign) {
1829        return a.sign ? float_relation_less : float_relation_greater;
1830    }
1831
1832    if (a.exp == b.exp) {
1833        if (a.frac == b.frac) {
1834            return float_relation_equal;
1835        }
1836        if (a.sign) {
1837            return a.frac > b.frac ?
1838                float_relation_less : float_relation_greater;
1839        } else {
1840            return a.frac > b.frac ?
1841                float_relation_greater : float_relation_less;
1842        }
1843    } else {
1844        if (a.sign) {
1845            return a.exp > b.exp ? float_relation_less : float_relation_greater;
1846        } else {
1847            return a.exp > b.exp ? float_relation_greater : float_relation_less;
1848        }
1849    }
1850}
1851
1852#define COMPARE(sz)                                                     \
1853int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1854                            float_status *s)                            \
1855{                                                                       \
1856    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1857    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1858    return compare_floats(pa, pb, false, s);                            \
1859}                                                                       \
1860int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1861                                  float_status *s)                      \
1862{                                                                       \
1863    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1864    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1865    return compare_floats(pa, pb, true, s);                             \
1866}
1867
1868COMPARE(16)
1869COMPARE(32)
1870COMPARE(64)
1871
1872#undef COMPARE
1873
1874/* Multiply A by 2 raised to the power N.  */
1875static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1876{
1877    if (unlikely(is_nan(a.cls))) {
1878        return return_nan(a, s);
1879    }
1880    if (a.cls == float_class_normal) {
1881        /* The largest float type (even though not supported by FloatParts)
1882         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1883         * still allows rounding to infinity, without allowing overflow
1884         * within the int32_t that backs FloatParts.exp.
1885         */
1886        n = MIN(MAX(n, -0x10000), 0x10000);
1887        a.exp += n;
1888    }
1889    return a;
1890}
1891
1892float16 float16_scalbn(float16 a, int n, float_status *status)
1893{
1894    FloatParts pa = float16_unpack_canonical(a, status);
1895    FloatParts pr = scalbn_decomposed(pa, n, status);
1896    return float16_round_pack_canonical(pr, status);
1897}
1898
1899float32 float32_scalbn(float32 a, int n, float_status *status)
1900{
1901    FloatParts pa = float32_unpack_canonical(a, status);
1902    FloatParts pr = scalbn_decomposed(pa, n, status);
1903    return float32_round_pack_canonical(pr, status);
1904}
1905
1906float64 float64_scalbn(float64 a, int n, float_status *status)
1907{
1908    FloatParts pa = float64_unpack_canonical(a, status);
1909    FloatParts pr = scalbn_decomposed(pa, n, status);
1910    return float64_round_pack_canonical(pr, status);
1911}
1912
1913/*
1914 * Square Root
1915 *
1916 * The old softfloat code did an approximation step before zeroing in
1917 * on the final result. However for simpleness we just compute the
1918 * square root by iterating down from the implicit bit to enough extra
1919 * bits to ensure we get a correctly rounded result.
1920 *
1921 * This does mean however the calculation is slower than before,
1922 * especially for 64 bit floats.
1923 */
1924
1925static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1926{
1927    uint64_t a_frac, r_frac, s_frac;
1928    int bit, last_bit;
1929
1930    if (is_nan(a.cls)) {
1931        return return_nan(a, s);
1932    }
1933    if (a.cls == float_class_zero) {
1934        return a;  /* sqrt(+-0) = +-0 */
1935    }
1936    if (a.sign) {
1937        s->float_exception_flags |= float_flag_invalid;
1938        a.cls = float_class_dnan;
1939        return a;
1940    }
1941    if (a.cls == float_class_inf) {
1942        return a;  /* sqrt(+inf) = +inf */
1943    }
1944
1945    assert(a.cls == float_class_normal);
1946
1947    /* We need two overflow bits at the top. Adding room for that is a
1948     * right shift. If the exponent is odd, we can discard the low bit
1949     * by multiplying the fraction by 2; that's a left shift. Combine
1950     * those and we shift right if the exponent is even.
1951     */
1952    a_frac = a.frac;
1953    if (!(a.exp & 1)) {
1954        a_frac >>= 1;
1955    }
1956    a.exp >>= 1;
1957
1958    /* Bit-by-bit computation of sqrt.  */
1959    r_frac = 0;
1960    s_frac = 0;
1961
1962    /* Iterate from implicit bit down to the 3 extra bits to compute a
1963     * properly rounded result. Remember we've inserted one more bit
1964     * at the top, so these positions are one less.
1965     */
1966    bit = DECOMPOSED_BINARY_POINT - 1;
1967    last_bit = MAX(p->frac_shift - 4, 0);
1968    do {
1969        uint64_t q = 1ULL << bit;
1970        uint64_t t_frac = s_frac + q;
1971        if (t_frac <= a_frac) {
1972            s_frac = t_frac + q;
1973            a_frac -= t_frac;
1974            r_frac += q;
1975        }
1976        a_frac <<= 1;
1977    } while (--bit >= last_bit);
1978
1979    /* Undo the right shift done above. If there is any remaining
1980     * fraction, the result is inexact. Set the sticky bit.
1981     */
1982    a.frac = (r_frac << 1) + (a_frac != 0);
1983
1984    return a;
1985}
1986
1987float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1988{
1989    FloatParts pa = float16_unpack_canonical(a, status);
1990    FloatParts pr = sqrt_float(pa, status, &float16_params);
1991    return float16_round_pack_canonical(pr, status);
1992}
1993
1994float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1995{
1996    FloatParts pa = float32_unpack_canonical(a, status);
1997    FloatParts pr = sqrt_float(pa, status, &float32_params);
1998    return float32_round_pack_canonical(pr, status);
1999}
2000

2001float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2002{
2003    FloatParts pa = float64_unpack_canonical(a, status);
2004    FloatParts pr = sqrt_float(pa, status, &float64_params);
2005    return float64_round_pack_canonical(pr, status);
2006}
2007
2008
2009/*----------------------------------------------------------------------------
2010| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2011| and 7, and returns the properly rounded 32-bit integer corresponding to the
2012| input.  If `zSign' is 1, the input is negated before being converted to an
2013| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2014| is simply rounded to an integer, with the inexact exception raised if the
2015| input cannot be represented exactly as an integer.  However, if the fixed-
2016| point input is too large, the invalid exception is raised and the largest
2017| positive or negative integer is returned.
2018*----------------------------------------------------------------------------*/
2019
2020static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2021{
2022    int8_t roundingMode;
2023    flag roundNearestEven;
2024    int8_t roundIncrement, roundBits;
2025    int32_t z;
2026
2027    roundingMode = status->float_rounding_mode;
2028    roundNearestEven = ( roundingMode == float_round_nearest_even );
2029    switch (roundingMode) {
2030    case float_round_nearest_even:
2031    case float_round_ties_away:
2032        roundIncrement = 0x40;
2033        break;
2034    case float_round_to_zero:
2035        roundIncrement = 0;
2036        break;
2037    case float_round_up:
2038        roundIncrement = zSign ? 0 : 0x7f;
2039        break;
2040    case float_round_down:
2041        roundIncrement = zSign ? 0x7f : 0;
2042        break;
2043    default:
2044        abort();
2045    }
2046    roundBits = absZ & 0x7F;
2047    absZ = ( absZ + roundIncrement )>>7;
2048    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2049    z = absZ;
2050    if ( zSign ) z = - z;
2051    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2052        float_raise(float_flag_invalid, status);
2053        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2054    }
2055    if (roundBits) {
2056        status->float_exception_flags |= float_flag_inexact;
2057    }
2058    return z;
2059
2060}
2061
2062/*----------------------------------------------------------------------------
2063| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2064| `absZ1', with binary point between bits 63 and 64 (between the input words),
2065| and returns the properly rounded 64-bit integer corresponding to the input.
2066| If `zSign' is 1, the input is negated before being converted to an integer.
2067| Ordinarily, the fixed-point input is simply rounded to an integer, with
2068| the inexact exception raised if the input cannot be represented exactly as
2069| an integer.  However, if the fixed-point input is too large, the invalid
2070| exception is raised and the largest positive or negative integer is
2071| returned.
2072*----------------------------------------------------------------------------*/
2073
2074static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2075                               float_status *status)
2076{
2077    int8_t roundingMode;
2078    flag roundNearestEven, increment;
2079    int64_t z;
2080
2081    roundingMode = status->float_rounding_mode;
2082    roundNearestEven = ( roundingMode == float_round_nearest_even );
2083    switch (roundingMode) {
2084    case float_round_nearest_even:
2085    case float_round_ties_away:
2086        increment = ((int64_t) absZ1 < 0);
2087        break;
2088    case float_round_to_zero:
2089        increment = 0;
2090        break;
2091    case float_round_up:
2092        increment = !zSign && absZ1;
2093        break;
2094    case float_round_down:
2095        increment = zSign && absZ1;
2096        break;
2097    default:
2098        abort();
2099    }
2100    if ( increment ) {
2101        ++absZ0;
2102        if ( absZ0 == 0 ) goto overflow;
2103        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2104    }
2105    z = absZ0;
2106    if ( zSign ) z = - z;
2107    if ( z && ( ( z < 0 ) ^ zSign ) ) {
2108 overflow:
2109        float_raise(float_flag_invalid, status);
2110        return
2111              zSign ? (int64_t) LIT64( 0x8000000000000000 )
2112            : LIT64( 0x7FFFFFFFFFFFFFFF );
2113    }
2114    if (absZ1) {
2115        status->float_exception_flags |= float_flag_inexact;
2116    }
2117    return z;
2118
2119}
2120
2121/*----------------------------------------------------------------------------
2122| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2123| `absZ1', with binary point between bits 63 and 64 (between the input words),
2124| and returns the properly rounded 64-bit unsigned integer corresponding to the
2125| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2126| with the inexact exception raised if the input cannot be represented exactly
2127| as an integer.  However, if the fixed-point input is too large, the invalid
2128| exception is raised and the largest unsigned integer is returned.
2129*----------------------------------------------------------------------------*/
2130
2131static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2132                                uint64_t absZ1, float_status *status)
2133{
2134    int8_t roundingMode;
2135    flag roundNearestEven, increment;
2136
2137    roundingMode = status->float_rounding_mode;
2138    roundNearestEven = (roundingMode == float_round_nearest_even);
2139    switch (roundingMode) {
2140    case float_round_nearest_even:
2141    case float_round_ties_away:
2142        increment = ((int64_t)absZ1 < 0);
2143        break;
2144    case float_round_to_zero:
2145        increment = 0;
2146        break;
2147    case float_round_up:
2148        increment = !zSign && absZ1;
2149        break;
2150    case float_round_down:
2151        increment = zSign && absZ1;
2152        break;
2153    default:
2154        abort();
2155    }
2156    if (increment) {
2157        ++absZ0;
2158        if (absZ0 == 0) {
2159            float_raise(float_flag_invalid, status);
2160            return LIT64(0xFFFFFFFFFFFFFFFF);
2161        }
2162        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2163    }
2164
2165    if (zSign && absZ0) {
2166        float_raise(float_flag_invalid, status);
2167        return 0;
2168    }
2169
2170    if (absZ1) {
2171        status->float_exception_flags |= float_flag_inexact;
2172    }
2173    return absZ0;
2174}
2175
2176/*----------------------------------------------------------------------------
2177| If `a' is denormal and we are in flush-to-zero mode then set the
2178| input-denormal exception and return zero. Otherwise just return the value.
2179*----------------------------------------------------------------------------*/
2180float32 float32_squash_input_denormal(float32 a, float_status *status)
2181{
2182    if (status->flush_inputs_to_zero) {
2183        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2184            float_raise(float_flag_input_denormal, status);
2185            return make_float32(float32_val(a) & 0x80000000);
2186        }
2187    }
2188    return a;
2189}
2190
2191/*----------------------------------------------------------------------------
2192| Normalizes the subnormal single-precision floating-point value represented
2193| by the denormalized significand `aSig'.  The normalized exponent and
2194| significand are stored at the locations pointed to by `zExpPtr' and
2195| `zSigPtr', respectively.
2196*----------------------------------------------------------------------------*/
2197
2198static void
2199 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2200{
2201    int8_t shiftCount;
2202
2203    shiftCount = countLeadingZeros32( aSig ) - 8;
2204    *zSigPtr = aSig<<shiftCount;
2205    *zExpPtr = 1 - shiftCount;
2206
2207}
2208
2209/*----------------------------------------------------------------------------
2210| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2211| and significand `zSig', and returns the proper single-precision floating-
2212| point value corresponding to the abstract input.  Ordinarily, the abstract
2213| value is simply rounded and packed into the single-precision format, with
2214| the inexact exception raised if the abstract input cannot be represented
2215| exactly.  However, if the abstract value is too large, the overflow and
2216| inexact exceptions are raised and an infinity or maximal finite value is
2217| returned.  If the abstract value is too small, the input value is rounded to
2218| a subnormal number, and the underflow and inexact exceptions are raised if
2219| the abstract input cannot be represented exactly as a subnormal single-
2220| precision floating-point number.
2221|     The input significand `zSig' has its binary point between bits 30
2222| and 29, which is 7 bits to the left of the usual location.  This shifted
2223| significand must be normalized or smaller.  If `zSig' is not normalized,
2224| `zExp' must be 0; in that case, the result returned is a subnormal number,
2225| and it must not require rounding.  In the usual case that `zSig' is
2226| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2227| The handling of underflow and overflow follows the IEC/IEEE Standard for
2228| Binary Floating-Point Arithmetic.
2229*----------------------------------------------------------------------------*/
2230
2231static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2232                                   float_status *status)
2233{
2234    int8_t roundingMode;
2235    flag roundNearestEven;
2236    int8_t roundIncrement, roundBits;
2237    flag isTiny;
2238
2239    roundingMode = status->float_rounding_mode;
2240    roundNearestEven = ( roundingMode == float_round_nearest_even );
2241    switch (roundingMode) {
2242    case float_round_nearest_even:
2243    case float_round_ties_away:
2244        roundIncrement = 0x40;
2245        break;
2246    case float_round_to_zero:
2247        roundIncrement = 0;
2248        break;
2249    case float_round_up:
2250        roundIncrement = zSign ? 0 : 0x7f;
2251        break;
2252    case float_round_down:
2253        roundIncrement = zSign ? 0x7f : 0;
2254        break;
2255    default:
2256        abort();
2257        break;
2258    }
2259    roundBits = zSig & 0x7F;
2260    if ( 0xFD <= (uint16_t) zExp ) {
2261        if (    ( 0xFD < zExp )
2262             || (    ( zExp == 0xFD )
2263                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2264           ) {
2265            float_raise(float_flag_overflow | float_flag_inexact, status);
2266            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2267        }
2268        if ( zExp < 0 ) {
2269            if (status->flush_to_zero) {
2270                float_raise(float_flag_output_denormal, status);
2271                return packFloat32(zSign, 0, 0);
2272            }
2273            isTiny =
2274                (status->float_detect_tininess
2275                 == float_tininess_before_rounding)
2276                || ( zExp < -1 )
2277                || ( zSig + roundIncrement < 0x80000000 );
2278            shift32RightJamming( zSig, - zExp, &zSig );
2279            zExp = 0;
2280            roundBits = zSig & 0x7F;
2281            if (isTiny && roundBits) {
2282                float_raise(float_flag_underflow, status);
2283            }
2284        }
2285    }
2286    if (roundBits) {
2287        status->float_exception_flags |= float_flag_inexact;
2288    }
2289    zSig = ( zSig + roundIncrement )>>7;
2290    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2291    if ( zSig == 0 ) zExp = 0;
2292    return packFloat32( zSign, zExp, zSig );
2293
2294}
2295
2296/*----------------------------------------------------------------------------
2297| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298| and significand `zSig', and returns the proper single-precision floating-
2299| point value corresponding to the abstract input.  This routine is just like
2300| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2301| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302| floating-point exponent.
2303*----------------------------------------------------------------------------*/
2304
2305static float32
2306 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2307                              float_status *status)
2308{
2309    int8_t shiftCount;
2310
2311    shiftCount = countLeadingZeros32( zSig ) - 1;
2312    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2313                               status);
2314
2315}
2316
2317/*----------------------------------------------------------------------------
2318| If `a' is denormal and we are in flush-to-zero mode then set the
2319| input-denormal exception and return zero. Otherwise just return the value.
2320*----------------------------------------------------------------------------*/
2321float64 float64_squash_input_denormal(float64 a, float_status *status)
2322{
2323    if (status->flush_inputs_to_zero) {
2324        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2325            float_raise(float_flag_input_denormal, status);
2326            return make_float64(float64_val(a) & (1ULL << 63));
2327        }
2328    }
2329    return a;
2330}
2331
2332/*----------------------------------------------------------------------------
2333| Normalizes the subnormal double-precision floating-point value represented
2334| by the denormalized significand `aSig'.  The normalized exponent and
2335| significand are stored at the locations pointed to by `zExpPtr' and
2336| `zSigPtr', respectively.
2337*----------------------------------------------------------------------------*/
2338
2339static void
2340 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2341{
2342    int8_t shiftCount;
2343
2344    shiftCount = countLeadingZeros64( aSig ) - 11;
2345    *zSigPtr = aSig<<shiftCount;
2346    *zExpPtr = 1 - shiftCount;
2347
2348}
2349
2350/*----------------------------------------------------------------------------
2351| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2352| double-precision floating-point value, returning the result.  After being
2353| shifted into the proper positions, the three fields are simply added
2354| together to form the result.  This means that any integer portion of `zSig'
2355| will be added into the exponent.  Since a properly normalized significand
2356| will have an integer portion equal to 1, the `zExp' input should be 1 less
2357| than the desired result exponent whenever `zSig' is a complete, normalized
2358| significand.
2359*----------------------------------------------------------------------------*/
2360
2361static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2362{
2363
2364    return make_float64(
2365        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2366
2367}
2368
2369/*----------------------------------------------------------------------------
2370| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2371| and significand `zSig', and returns the proper double-precision floating-
2372| point value corresponding to the abstract input.  Ordinarily, the abstract
2373| value is simply rounded and packed into the double-precision format, with
2374| the inexact exception raised if the abstract input cannot be represented
2375| exactly.  However, if the abstract value is too large, the overflow and
2376| inexact exceptions are raised and an infinity or maximal finite value is
2377| returned.  If the abstract value is too small, the input value is rounded to
2378| a subnormal number, and the underflow and inexact exceptions are raised if
2379| the abstract input cannot be represented exactly as a subnormal double-
2380| precision floating-point number.
2381|     The input significand `zSig' has its binary point between bits 62
2382| and 61, which is 10 bits to the left of the usual location.  This shifted
2383| significand must be normalized or smaller.  If `zSig' is not normalized,
2384| `zExp' must be 0; in that case, the result returned is a subnormal number,
2385| and it must not require rounding.  In the usual case that `zSig' is
2386| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2387| The handling of underflow and overflow follows the IEC/IEEE Standard for
2388| Binary Floating-Point Arithmetic.
2389*----------------------------------------------------------------------------*/
2390
2391static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2392                                   float_status *status)
2393{
2394    int8_t roundingMode;
2395    flag roundNearestEven;
2396    int roundIncrement, roundBits;
2397    flag isTiny;
2398
2399    roundingMode = status->float_rounding_mode;
2400    roundNearestEven = ( roundingMode == float_round_nearest_even );
2401    switch (roundingMode) {
2402    case float_round_nearest_even:
2403    case float_round_ties_away:
2404        roundIncrement = 0x200;
2405        break;
2406    case float_round_to_zero:
2407        roundIncrement = 0;
2408        break;
2409    case float_round_up:
2410        roundIncrement = zSign ? 0 : 0x3ff;
2411        break;
2412    case float_round_down:
2413        roundIncrement = zSign ? 0x3ff : 0;
2414        break;
2415    case float_round_to_odd:
2416        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2417        break;
2418    default:
2419        abort();
2420    }
2421    roundBits = zSig & 0x3FF;
2422    if ( 0x7FD <= (uint16_t) zExp ) {
2423        if (    ( 0x7FD < zExp )
2424             || (    ( zExp == 0x7FD )
2425                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2426           ) {
2427            bool overflow_to_inf = roundingMode != float_round_to_odd &&
2428                                   roundIncrement != 0;
2429            float_raise(float_flag_overflow | float_flag_inexact, status);
2430            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2431        }
2432        if ( zExp < 0 ) {
2433            if (status->flush_to_zero) {
2434                float_raise(float_flag_output_denormal, status);
2435                return packFloat64(zSign, 0, 0);
2436            }
2437            isTiny =
2438                   (status->float_detect_tininess
2439                    == float_tininess_before_rounding)
2440                || ( zExp < -1 )
2441                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2442            shift64RightJamming( zSig, - zExp, &zSig );
2443            zExp = 0;
2444            roundBits = zSig & 0x3FF;
2445            if (isTiny && roundBits) {
2446                float_raise(float_flag_underflow, status);
2447            }
2448            if (roundingMode == float_round_to_odd) {
2449                /*
2450                 * For round-to-odd case, the roundIncrement depends on
2451                 * zSig which just changed.
2452                 */
2453                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2454            }
2455        }
2456    }
2457    if (roundBits) {
2458        status->float_exception_flags |= float_flag_inexact;
2459    }
2460    zSig = ( zSig + roundIncrement )>>10;
2461    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2462    if ( zSig == 0 ) zExp = 0;
2463    return packFloat64( zSign, zExp, zSig );
2464
2465}
2466
2467/*----------------------------------------------------------------------------
2468| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2469| and significand `zSig', and returns the proper double-precision floating-
2470| point value corresponding to the abstract input.  This routine is just like
2471| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2472| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2473| floating-point exponent.
2474*----------------------------------------------------------------------------*/
2475
2476static float64
2477 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2478                              float_status *status)
2479{
2480    int8_t shiftCount;
2481
2482    shiftCount = countLeadingZeros64( zSig ) - 1;
2483    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2484                               status);
2485
2486}
2487
2488/*----------------------------------------------------------------------------
2489| Normalizes the subnormal extended double-precision floating-point value
2490| represented by the denormalized significand `aSig'.  The normalized exponent
2491| and significand are stored at the locations pointed to by `zExpPtr' and
2492| `zSigPtr', respectively.
2493*----------------------------------------------------------------------------*/
2494
2495void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2496                                uint64_t *zSigPtr)
2497{
2498    int8_t shiftCount;
2499
2500    shiftCount = countLeadingZeros64( aSig );
2501    *zSigPtr = aSig<<shiftCount;
2502    *zExpPtr = 1 - shiftCount;
2503}
2504
2505/*----------------------------------------------------------------------------
2506| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2507| and extended significand formed by the concatenation of `zSig0' and `zSig1',
2508| and returns the proper extended double-precision floating-point value
2509| corresponding to the abstract input.  Ordinarily, the abstract value is
2510| rounded and packed into the extended double-precision format, with the
2511| inexact exception raised if the abstract input cannot be represented
2512| exactly.  However, if the abstract value is too large, the overflow and
2513| inexact exceptions are raised and an infinity or maximal finite value is
2514| returned.  If the abstract value is too small, the input value is rounded to
2515| a subnormal number, and the underflow and inexact exceptions are raised if
2516| the abstract input cannot be represented exactly as a subnormal extended
2517| double-precision floating-point number.
2518|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2519| number of bits as single or double precision, respectively.  Otherwise, the
2520| result is rounded to the full precision of the extended double-precision
2521| format.
2522|     The input significand must be normalized or smaller.  If the input
2523| significand is not normalized, `zExp' must be 0; in that case, the result
2524| returned is a subnormal number, and it must not require rounding.  The
2525| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2526| Floating-Point Arithmetic.
2527*----------------------------------------------------------------------------*/
2528
2529floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2530                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2531                              float_status *status)
2532{
2533    int8_t roundingMode;
2534    flag roundNearestEven, increment, isTiny;
2535    int64_t roundIncrement, roundMask, roundBits;
2536
2537    roundingMode = status->float_rounding_mode;
2538    roundNearestEven = ( roundingMode == float_round_nearest_even );
2539    if ( roundingPrecision == 80 ) goto precision80;
2540    if ( roundingPrecision == 64 ) {
2541        roundIncrement = LIT64( 0x0000000000000400 );
2542        roundMask = LIT64( 0x00000000000007FF );
2543    }
2544    else if ( roundingPrecision == 32 ) {
2545        roundIncrement = LIT64( 0x0000008000000000 );
2546        roundMask = LIT64( 0x000000FFFFFFFFFF );
2547    }
2548    else {
2549        goto precision80;
2550    }
2551    zSig0 |= ( zSig1 != 0 );
2552    switch (roundingMode) {
2553    case float_round_nearest_even:
2554    case float_round_ties_away:
2555        break;
2556    case float_round_to_zero:
2557        roundIncrement = 0;
2558        break;
2559    case float_round_up:
2560        roundIncrement = zSign ? 0 : roundMask;
2561        break;
2562    case float_round_down:
2563        roundIncrement = zSign ? roundMask : 0;
2564        break;
2565    default:
2566        abort();
2567    }
2568    roundBits = zSig0 & roundMask;
2569    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2570        if (    ( 0x7FFE < zExp )
2571             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2572           ) {
2573            goto overflow;
2574        }
2575        if ( zExp <= 0 ) {
2576            if (status->flush_to_zero) {
2577                float_raise(float_flag_output_denormal, status);
2578                return packFloatx80(zSign, 0, 0);
2579            }
2580            isTiny =
2581                   (status->float_detect_tininess
2582                    == float_tininess_before_rounding)
2583                || ( zExp < 0 )
2584                || ( zSig0 <= zSig0 + roundIncrement );
2585            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2586            zExp = 0;
2587            roundBits = zSig0 & roundMask;
2588            if (isTiny && roundBits) {
2589                float_raise(float_flag_underflow, status);
2590            }
2591            if (roundBits) {
2592                status->float_exception_flags |= float_flag_inexact;
2593            }
2594            zSig0 += roundIncrement;
2595            if ( (int64_t) zSig0 < 0 ) zExp = 1;
2596            roundIncrement = roundMask + 1;
2597            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2598                roundMask |= roundIncrement;
2599            }
2600            zSig0 &= ~ roundMask;
2601            return packFloatx80( zSign, zExp, zSig0 );
2602        }
2603    }
2604    if (roundBits) {
2605        status->float_exception_flags |= float_flag_inexact;
2606    }
2607    zSig0 += roundIncrement;
2608    if ( zSig0 < roundIncrement ) {
2609        ++zExp;
2610        zSig0 = LIT64( 0x8000000000000000 );
2611    }
2612    roundIncrement = roundMask + 1;
2613    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2614        roundMask |= roundIncrement;
2615    }
2616    zSig0 &= ~ roundMask;
2617    if ( zSig0 == 0 ) zExp = 0;
2618    return packFloatx80( zSign, zExp, zSig0 );
2619 precision80:
2620    switch (roundingMode) {
2621    case float_round_nearest_even:
2622    case float_round_ties_away:
2623        increment = ((int64_t)zSig1 < 0);
2624        break;
2625    case float_round_to_zero:
2626        increment = 0;
2627        break;
2628    case float_round_up:
2629        increment = !zSign && zSig1;
2630        break;
2631    case float_round_down:
2632        increment = zSign && zSig1;
2633        break;
2634    default:
2635        abort();
2636    }
2637    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2638        if (    ( 0x7FFE < zExp )
2639             || (    ( zExp == 0x7FFE )
2640                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2641                  && increment
2642                )
2643           ) {
2644            roundMask = 0;
2645 overflow:
2646            float_raise(float_flag_overflow | float_flag_inexact, status);
2647            if (    ( roundingMode == float_round_to_zero )
2648                 || ( zSign && ( roundingMode == float_round_up ) )
2649                 || ( ! zSign && ( roundingMode == float_round_down ) )
2650               ) {
2651                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2652            }
2653            return packFloatx80(zSign,
2654                                floatx80_infinity_high,
2655                                floatx80_infinity_low);
2656        }
2657        if ( zExp <= 0 ) {
2658            isTiny =
2659                   (status->float_detect_tininess
2660                    == float_tininess_before_rounding)
2661                || ( zExp < 0 )
2662                || ! increment
2663                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2664            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2665            zExp = 0;
2666            if (isTiny && zSig1) {
2667                float_raise(float_flag_underflow, status);
2668            }
2669            if (zSig1) {
2670                status->float_exception_flags |= float_flag_inexact;
2671            }
2672            switch (roundingMode) {
2673            case float_round_nearest_even:
2674            case float_round_ties_away:
2675                increment = ((int64_t)zSig1 < 0);
2676                break;
2677            case float_round_to_zero:
2678                increment = 0;
2679                break;
2680            case float_round_up:
2681                increment = !zSign && zSig1;
2682                break;
2683            case float_round_down:
2684                increment = zSign && zSig1;
2685                break;
2686            default:
2687                abort();
2688            }
2689            if ( increment ) {
2690                ++zSig0;
2691                zSig0 &=
2692                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2693                if ( (int64_t) zSig0 < 0 ) zExp = 1;
2694            }
2695            return packFloatx80( zSign, zExp, zSig0 );
2696        }
2697    }
2698    if (zSig1) {
2699        status->float_exception_flags |= float_flag_inexact;
2700    }
2701    if ( increment ) {
2702        ++zSig0;
2703        if ( zSig0 == 0 ) {
2704            ++zExp;
2705            zSig0 = LIT64( 0x8000000000000000 );
2706        }
2707        else {
2708            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2709        }
2710    }
2711    else {
2712        if ( zSig0 == 0 ) zExp = 0;
2713    }
2714    return packFloatx80( zSign, zExp, zSig0 );
2715
2716}
2717
2718/*----------------------------------------------------------------------------
2719| Takes an abstract floating-point value having sign `zSign', exponent
2720| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2721| and returns the proper extended double-precision floating-point value
2722| corresponding to the abstract input.  This routine is just like
2723| `roundAndPackFloatx80' except that the input significand does not have to be
2724| normalized.
2725*----------------------------------------------------------------------------*/
2726
2727floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2728                                       flag zSign, int32_t zExp,
2729                                       uint64_t zSig0, uint64_t zSig1,
2730                                       float_status *status)
2731{
2732    int8_t shiftCount;
2733
2734    if ( zSig0 == 0 ) {
2735        zSig0 = zSig1;
2736        zSig1 = 0;
2737        zExp -= 64;
2738    }
2739    shiftCount = countLeadingZeros64( zSig0 );
2740    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2741    zExp -= shiftCount;
2742    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2743                                zSig0, zSig1, status);
2744
2745}
2746
2747/*----------------------------------------------------------------------------
2748| Returns the least-significant 64 fraction bits of the quadruple-precision
2749| floating-point value `a'.
2750*----------------------------------------------------------------------------*/
2751
2752static inline uint64_t extractFloat128Frac1( float128 a )
2753{
2754
2755    return a.low;
2756
2757}
2758
2759/*----------------------------------------------------------------------------
2760| Returns the most-significant 48 fraction bits of the quadruple-precision
2761| floating-point value `a'.
2762*----------------------------------------------------------------------------*/
2763
2764static inline uint64_t extractFloat128Frac0( float128 a )
2765{
2766
2767    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2768
2769}
2770
2771/*----------------------------------------------------------------------------
2772| Returns the exponent bits of the quadruple-precision floating-point value
2773| `a'.
2774*----------------------------------------------------------------------------*/
2775
2776static inline int32_t extractFloat128Exp( float128 a )
2777{
2778
2779    return ( a.high>>48 ) & 0x7FFF;
2780
2781}
2782
2783/*----------------------------------------------------------------------------
2784| Returns the sign bit of the quadruple-precision floating-point value `a'.
2785*----------------------------------------------------------------------------*/
2786
2787static inline flag extractFloat128Sign( float128 a )
2788{
2789
2790    return a.high>>63;
2791
2792}
2793
2794/*----------------------------------------------------------------------------
2795| Normalizes the subnormal quadruple-precision floating-point value
2796| represented by the denormalized significand formed by the concatenation of
2797| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2798| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2799| significand are stored at the location pointed to by `zSig0Ptr', and the
2800| least significant 64 bits of the normalized significand are stored at the
2801| location pointed to by `zSig1Ptr'.
2802*----------------------------------------------------------------------------*/
2803
2804static void
2805 normalizeFloat128Subnormal(
2806     uint64_t aSig0,
2807     uint64_t aSig1,
2808     int32_t *zExpPtr,
2809     uint64_t *zSig0Ptr,
2810     uint64_t *zSig1Ptr
2811 )
2812{
2813    int8_t shiftCount;
2814
2815    if ( aSig0 == 0 ) {
2816        shiftCount = countLeadingZeros64( aSig1 ) - 15;
2817        if ( shiftCount < 0 ) {
2818            *zSig0Ptr = aSig1>>( - shiftCount );
2819            *zSig1Ptr = aSig1<<( shiftCount & 63 );
2820        }
2821        else {
2822            *zSig0Ptr = aSig1<<shiftCount;
2823            *zSig1Ptr = 0;
2824        }
2825        *zExpPtr = - shiftCount - 63;
2826    }
2827    else {
2828        shiftCount = countLeadingZeros64( aSig0 ) - 15;
2829        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2830        *zExpPtr = 1 - shiftCount;
2831    }
2832
2833}
2834
2835/*----------------------------------------------------------------------------
2836| Packs the sign `zSign', the exponent `zExp', and the significand formed
2837| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2838| floating-point value, returning the result.  After being shifted into the
2839| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2840| added together to form the most significant 32 bits of the result.  This
2841| means that any integer portion of `zSig0' will be added into the exponent.
2842| Since a properly normalized significand will have an integer portion equal
2843| to 1, the `zExp' input should be 1 less than the desired result exponent
2844| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2845| significand.
2846*----------------------------------------------------------------------------*/
2847
2848static inline float128
2849 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2850{
2851    float128 z;
2852
2853    z.low = zSig1;
2854    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2855    return z;
2856
2857}
2858
2859/*----------------------------------------------------------------------------
2860| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861| and extended significand formed by the concatenation of `zSig0', `zSig1',
2862| and `zSig2', and returns the proper quadruple-precision floating-point value
2863| corresponding to the abstract input.  Ordinarily, the abstract value is
2864| simply rounded and packed into the quadruple-precision format, with the
2865| inexact exception raised if the abstract input cannot be represented
2866| exactly.  However, if the abstract value is too large, the overflow and
2867| inexact exceptions are raised and an infinity or maximal finite value is
2868| returned.  If the abstract value is too small, the input value is rounded to
2869| a subnormal number, and the underflow and inexact exceptions are raised if
2870| the abstract input cannot be represented exactly as a subnormal quadruple-
2871| precision floating-point number.
2872|     The input significand must be normalized or smaller.  If the input
2873| significand is not normalized, `zExp' must be 0; in that case, the result
2874| returned is a subnormal number, and it must not require rounding.  In the
2875| usual case that the input significand is normalized, `zExp' must be 1 less
2876| than the ``true'' floating-point exponent.  The handling of underflow and
2877| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2878*----------------------------------------------------------------------------*/
2879
2880static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2881                                     uint64_t zSig0, uint64_t zSig1,
2882                                     uint64_t zSig2, float_status *status)
2883{
2884    int8_t roundingMode;
2885    flag roundNearestEven, increment, isTiny;
2886
2887    roundingMode = status->float_rounding_mode;
2888    roundNearestEven = ( roundingMode == float_round_nearest_even );
2889    switch (roundingMode) {
2890    case float_round_nearest_even:
2891    case float_round_ties_away:
2892        increment = ((int64_t)zSig2 < 0);
2893        break;
2894    case float_round_to_zero:
2895        increment = 0;
2896        break;
2897    case float_round_up:
2898        increment = !zSign && zSig2;
2899        break;
2900    case float_round_down:
2901        increment = zSign && zSig2;
2902        break;
2903    case float_round_to_odd:
2904        increment = !(zSig1 & 0x1) && zSig2;
2905        break;
2906    default:
2907        abort();
2908    }
2909    if ( 0x7FFD <= (uint32_t) zExp ) {
2910        if (    ( 0x7FFD < zExp )
2911             || (    ( zExp == 0x7FFD )
2912                  && eq128(
2913                         LIT64( 0x0001FFFFFFFFFFFF ),
2914                         LIT64( 0xFFFFFFFFFFFFFFFF ),
2915                         zSig0,
2916                         zSig1
2917                     )
2918                  && increment
2919                )
2920           ) {
2921            float_raise(float_flag_overflow | float_flag_inexact, status);
2922            if (    ( roundingMode == float_round_to_zero )
2923                 || ( zSign && ( roundingMode == float_round_up ) )
2924                 || ( ! zSign && ( roundingMode == float_round_down ) )
2925                 || (roundingMode == float_round_to_odd)
2926               ) {
2927                return
2928                    packFloat128(
2929                        zSign,
2930                        0x7FFE,
2931                        LIT64( 0x0000FFFFFFFFFFFF ),
2932                        LIT64( 0xFFFFFFFFFFFFFFFF )
2933                    );
2934            }
2935            return packFloat128( zSign, 0x7FFF, 0, 0 );
2936        }
2937        if ( zExp < 0 ) {
2938            if (status->flush_to_zero) {
2939                float_raise(float_flag_output_denormal, status);
2940                return packFloat128(zSign, 0, 0, 0);
2941            }
2942            isTiny =
2943                   (status->float_detect_tininess
2944                    == float_tininess_before_rounding)
2945                || ( zExp < -1 )
2946                || ! increment
2947                || lt128(
2948                       zSig0,
2949                       zSig1,
2950                       LIT64( 0x0001FFFFFFFFFFFF ),
2951                       LIT64( 0xFFFFFFFFFFFFFFFF )
2952                   );
2953            shift128ExtraRightJamming(
2954                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2955            zExp = 0;
2956            if (isTiny && zSig2) {
2957                float_raise(float_flag_underflow, status);
2958            }
2959            switch (roundingMode) {
2960            case float_round_nearest_even:
2961            case float_round_ties_away:
2962                increment = ((int64_t)zSig2 < 0);
2963                break;
2964            case float_round_to_zero:
2965                increment = 0;
2966                break;
2967            case float_round_up:
2968                increment = !zSign && zSig2;
2969                break;
2970            case float_round_down:
2971                increment = zSign && zSig2;
2972                break;
2973            case float_round_to_odd:
2974                increment = !(zSig1 & 0x1) && zSig2;
2975                break;
2976            default:
2977                abort();
2978            }
2979        }
2980    }
2981    if (zSig2) {
2982        status->float_exception_flags |= float_flag_inexact;
2983    }
2984    if ( increment ) {
2985        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2986        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2987    }
2988    else {
2989        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2990    }
2991    return packFloat128( zSign, zExp, zSig0, zSig1 );
2992
2993}
2994
2995/*----------------------------------------------------------------------------
2996| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997| and significand formed by the concatenation of `zSig0' and `zSig1', and
2998| returns the proper quadruple-precision floating-point value corresponding
2999| to the abstract input.  This routine is just like `roundAndPackFloat128'
3000| except that the input significand has fewer bits and does not have to be

3001| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3002| point exponent.
3003*----------------------------------------------------------------------------*/
3004
3005static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3006                                              uint64_t zSig0, uint64_t zSig1,
3007                                              float_status *status)
3008{
3009    int8_t shiftCount;
3010    uint64_t zSig2;
3011
3012    if ( zSig0 == 0 ) {
3013        zSig0 = zSig1;
3014        zSig1 = 0;
3015        zExp -= 64;
3016    }
3017    shiftCount = countLeadingZeros64( zSig0 ) - 15;
3018    if ( 0 <= shiftCount ) {
3019        zSig2 = 0;
3020        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3021    }
3022    else {
3023        shift128ExtraRightJamming(
3024            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3025    }
3026    zExp -= shiftCount;
3027    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3028
3029}
3030
3031
3032/*----------------------------------------------------------------------------
3033| Returns the result of converting the 32-bit two's complement integer `a'
3034| to the extended double-precision floating-point format.  The conversion
3035| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3036| Arithmetic.
3037*----------------------------------------------------------------------------*/
3038
3039floatx80 int32_to_floatx80(int32_t a, float_status *status)
3040{
3041    flag zSign;
3042    uint32_t absA;
3043    int8_t shiftCount;
3044    uint64_t zSig;
3045
3046    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3047    zSign = ( a < 0 );
3048    absA = zSign ? - a : a;
3049    shiftCount = countLeadingZeros32( absA ) + 32;
3050    zSig = absA;
3051    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3052
3053}
3054
3055/*----------------------------------------------------------------------------
3056| Returns the result of converting the 32-bit two's complement integer `a' to
3057| the quadruple-precision floating-point format.  The conversion is performed
3058| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3059*----------------------------------------------------------------------------*/
3060
3061float128 int32_to_float128(int32_t a, float_status *status)
3062{
3063    flag zSign;
3064    uint32_t absA;
3065    int8_t shiftCount;
3066    uint64_t zSig0;
3067
3068    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3069    zSign = ( a < 0 );
3070    absA = zSign ? - a : a;
3071    shiftCount = countLeadingZeros32( absA ) + 17;
3072    zSig0 = absA;
3073    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3074
3075}
3076
3077/*----------------------------------------------------------------------------
3078| Returns the result of converting the 64-bit two's complement integer `a'
3079| to the extended double-precision floating-point format.  The conversion
3080| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3081| Arithmetic.
3082*----------------------------------------------------------------------------*/
3083
3084floatx80 int64_to_floatx80(int64_t a, float_status *status)
3085{
3086    flag zSign;
3087    uint64_t absA;
3088    int8_t shiftCount;
3089
3090    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3091    zSign = ( a < 0 );
3092    absA = zSign ? - a : a;
3093    shiftCount = countLeadingZeros64( absA );
3094    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3095
3096}
3097
3098/*----------------------------------------------------------------------------
3099| Returns the result of converting the 64-bit two's complement integer `a' to
3100| the quadruple-precision floating-point format.  The conversion is performed
3101| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3102*----------------------------------------------------------------------------*/
3103
3104float128 int64_to_float128(int64_t a, float_status *status)
3105{
3106    flag zSign;
3107    uint64_t absA;
3108    int8_t shiftCount;
3109    int32_t zExp;
3110    uint64_t zSig0, zSig1;
3111
3112    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3113    zSign = ( a < 0 );
3114    absA = zSign ? - a : a;
3115    shiftCount = countLeadingZeros64( absA ) + 49;
3116    zExp = 0x406E - shiftCount;
3117    if ( 64 <= shiftCount ) {
3118        zSig1 = 0;
3119        zSig0 = absA;
3120        shiftCount -= 64;
3121    }
3122    else {
3123        zSig1 = absA;
3124        zSig0 = 0;
3125    }
3126    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3127    return packFloat128( zSign, zExp, zSig0, zSig1 );
3128
3129}
3130
3131/*----------------------------------------------------------------------------
3132| Returns the result of converting the 64-bit unsigned integer `a'
3133| to the quadruple-precision floating-point format.  The conversion is performed
3134| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3135*----------------------------------------------------------------------------*/
3136
3137float128 uint64_to_float128(uint64_t a, float_status *status)
3138{
3139    if (a == 0) {
3140        return float128_zero;
3141    }
3142    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3143}
3144
3145
3146
3147
3148/*----------------------------------------------------------------------------
3149| Returns the result of converting the single-precision floating-point value
3150| `a' to the double-precision floating-point format.  The conversion is
3151| performed according to the IEC/IEEE Standard for Binary Floating-Point
3152| Arithmetic.
3153*----------------------------------------------------------------------------*/
3154
3155float64 float32_to_float64(float32 a, float_status *status)
3156{
3157    flag aSign;
3158    int aExp;
3159    uint32_t aSig;
3160    a = float32_squash_input_denormal(a, status);
3161
3162    aSig = extractFloat32Frac( a );
3163    aExp = extractFloat32Exp( a );
3164    aSign = extractFloat32Sign( a );
3165    if ( aExp == 0xFF ) {
3166        if (aSig) {
3167            return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3168        }
3169        return packFloat64( aSign, 0x7FF, 0 );
3170    }
3171    if ( aExp == 0 ) {
3172        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3173        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3174        --aExp;
3175    }
3176    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3177
3178}
3179
3180/*----------------------------------------------------------------------------
3181| Returns the result of converting the single-precision floating-point value
3182| `a' to the extended double-precision floating-point format.  The conversion
3183| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3184| Arithmetic.
3185*----------------------------------------------------------------------------*/
3186
3187floatx80 float32_to_floatx80(float32 a, float_status *status)
3188{
3189    flag aSign;
3190    int aExp;
3191    uint32_t aSig;
3192
3193    a = float32_squash_input_denormal(a, status);
3194    aSig = extractFloat32Frac( a );
3195    aExp = extractFloat32Exp( a );
3196    aSign = extractFloat32Sign( a );
3197    if ( aExp == 0xFF ) {
3198        if (aSig) {
3199            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3200        }
3201        return packFloatx80(aSign,
3202                            floatx80_infinity_high,
3203                            floatx80_infinity_low);
3204    }
3205    if ( aExp == 0 ) {
3206        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3207        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3208    }
3209    aSig |= 0x00800000;
3210    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3211
3212}
3213
3214/*----------------------------------------------------------------------------
3215| Returns the result of converting the single-precision floating-point value
3216| `a' to the double-precision floating-point format.  The conversion is
3217| performed according to the IEC/IEEE Standard for Binary Floating-Point
3218| Arithmetic.
3219*----------------------------------------------------------------------------*/
3220
3221float128 float32_to_float128(float32 a, float_status *status)
3222{
3223    flag aSign;
3224    int aExp;
3225    uint32_t aSig;
3226
3227    a = float32_squash_input_denormal(a, status);
3228    aSig = extractFloat32Frac( a );
3229    aExp = extractFloat32Exp( a );
3230    aSign = extractFloat32Sign( a );
3231    if ( aExp == 0xFF ) {
3232        if (aSig) {
3233            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3234        }
3235        return packFloat128( aSign, 0x7FFF, 0, 0 );
3236    }
3237    if ( aExp == 0 ) {
3238        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3239        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3240        --aExp;
3241    }
3242    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3243
3244}
3245
3246/*----------------------------------------------------------------------------
3247| Returns the remainder of the single-precision floating-point value `a'
3248| with respect to the corresponding value `b'.  The operation is performed
3249| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3250*----------------------------------------------------------------------------*/
3251
3252float32 float32_rem(float32 a, float32 b, float_status *status)
3253{
3254    flag aSign, zSign;
3255    int aExp, bExp, expDiff;
3256    uint32_t aSig, bSig;
3257    uint32_t q;
3258    uint64_t aSig64, bSig64, q64;
3259    uint32_t alternateASig;
3260    int32_t sigMean;
3261    a = float32_squash_input_denormal(a, status);
3262    b = float32_squash_input_denormal(b, status);
3263
3264    aSig = extractFloat32Frac( a );
3265    aExp = extractFloat32Exp( a );
3266    aSign = extractFloat32Sign( a );
3267    bSig = extractFloat32Frac( b );
3268    bExp = extractFloat32Exp( b );
3269    if ( aExp == 0xFF ) {
3270        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3271            return propagateFloat32NaN(a, b, status);
3272        }
3273        float_raise(float_flag_invalid, status);
3274        return float32_default_nan(status);
3275    }
3276    if ( bExp == 0xFF ) {
3277        if (bSig) {
3278            return propagateFloat32NaN(a, b, status);
3279        }
3280        return a;
3281    }
3282    if ( bExp == 0 ) {
3283        if ( bSig == 0 ) {
3284            float_raise(float_flag_invalid, status);
3285            return float32_default_nan(status);
3286        }
3287        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3288    }
3289    if ( aExp == 0 ) {
3290        if ( aSig == 0 ) return a;
3291        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3292    }
3293    expDiff = aExp - bExp;
3294    aSig |= 0x00800000;
3295    bSig |= 0x00800000;
3296    if ( expDiff < 32 ) {
3297        aSig <<= 8;
3298        bSig <<= 8;
3299        if ( expDiff < 0 ) {
3300            if ( expDiff < -1 ) return a;
3301            aSig >>= 1;
3302        }
3303        q = ( bSig <= aSig );
3304        if ( q ) aSig -= bSig;
3305        if ( 0 < expDiff ) {
3306            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3307            q >>= 32 - expDiff;
3308            bSig >>= 2;
3309            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3310        }
3311        else {
3312            aSig >>= 2;
3313            bSig >>= 2;
3314        }
3315    }
3316    else {
3317        if ( bSig <= aSig ) aSig -= bSig;
3318        aSig64 = ( (uint64_t) aSig )<<40;
3319        bSig64 = ( (uint64_t) bSig )<<40;
3320        expDiff -= 64;
3321        while ( 0 < expDiff ) {
3322            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3323            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3324            aSig64 = - ( ( bSig * q64 )<<38 );
3325            expDiff -= 62;
3326        }
3327        expDiff += 64;
3328        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3329        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3330        q = q64>>( 64 - expDiff );
3331        bSig <<= 6;
3332        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3333    }
3334    do {
3335        alternateASig = aSig;
3336        ++q;
3337        aSig -= bSig;
3338    } while ( 0 <= (int32_t) aSig );
3339    sigMean = aSig + alternateASig;
3340    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3341        aSig = alternateASig;
3342    }
3343    zSign = ( (int32_t) aSig < 0 );
3344    if ( zSign ) aSig = - aSig;
3345    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3346}
3347
3348
3349
3350/*----------------------------------------------------------------------------
3351| Returns the binary exponential of the single-precision floating-point value
3352| `a'. The operation is performed according to the IEC/IEEE Standard for
3353| Binary Floating-Point Arithmetic.
3354|
3355| Uses the following identities:
3356|
3357| 1. -------------------------------------------------------------------------
3358|      x    x*ln(2)
3359|     2  = e
3360|
3361| 2. -------------------------------------------------------------------------
3362|                      2     3     4     5           n
3363|      x        x     x     x     x     x           x
3364|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3365|               1!    2!    3!    4!    5!          n!
3366*----------------------------------------------------------------------------*/
3367
3368static const float64 float32_exp2_coefficients[15] =
3369{
3370    const_float64( 0x3ff0000000000000ll ), /*  1 */
3371    const_float64( 0x3fe0000000000000ll ), /*  2 */
3372    const_float64( 0x3fc5555555555555ll ), /*  3 */
3373    const_float64( 0x3fa5555555555555ll ), /*  4 */
3374    const_float64( 0x3f81111111111111ll ), /*  5 */
3375    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3376    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3377    const_float64( 0x3efa01a01a01a01all ), /*  8 */
3378    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3379    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3380    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3381    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3382    const_float64( 0x3de6124613a86d09ll ), /* 13 */
3383    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3384    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3385};
3386
3387float32 float32_exp2(float32 a, float_status *status)
3388{
3389    flag aSign;
3390    int aExp;
3391    uint32_t aSig;
3392    float64 r, x, xn;
3393    int i;
3394    a = float32_squash_input_denormal(a, status);
3395
3396    aSig = extractFloat32Frac( a );
3397    aExp = extractFloat32Exp( a );
3398    aSign = extractFloat32Sign( a );
3399
3400    if ( aExp == 0xFF) {
3401        if (aSig) {
3402            return propagateFloat32NaN(a, float32_zero, status);
3403        }
3404        return (aSign) ? float32_zero : a;
3405    }
3406    if (aExp == 0) {
3407        if (aSig == 0) return float32_one;
3408    }
3409
3410    float_raise(float_flag_inexact, status);
3411
3412    /* ******************************* */
3413    /* using float64 for approximation */
3414    /* ******************************* */
3415    x = float32_to_float64(a, status);
3416    x = float64_mul(x, float64_ln2, status);
3417
3418    xn = x;
3419    r = float64_one;
3420    for (i = 0 ; i < 15 ; i++) {
3421        float64 f;
3422
3423        f = float64_mul(xn, float32_exp2_coefficients[i], status);
3424        r = float64_add(r, f, status);
3425
3426        xn = float64_mul(xn, x, status);
3427    }
3428
3429    return float64_to_float32(r, status);
3430}
3431
3432/*----------------------------------------------------------------------------
3433| Returns the binary log of the single-precision floating-point value `a'.
3434| The operation is performed according to the IEC/IEEE Standard for Binary
3435| Floating-Point Arithmetic.
3436*----------------------------------------------------------------------------*/
3437float32 float32_log2(float32 a, float_status *status)
3438{
3439    flag aSign, zSign;
3440    int aExp;
3441    uint32_t aSig, zSig, i;
3442
3443    a = float32_squash_input_denormal(a, status);
3444    aSig = extractFloat32Frac( a );
3445    aExp = extractFloat32Exp( a );
3446    aSign = extractFloat32Sign( a );
3447
3448    if ( aExp == 0 ) {
3449        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3450        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3451    }
3452    if ( aSign ) {
3453        float_raise(float_flag_invalid, status);
3454        return float32_default_nan(status);
3455    }
3456    if ( aExp == 0xFF ) {
3457        if (aSig) {
3458            return propagateFloat32NaN(a, float32_zero, status);
3459        }
3460        return a;
3461    }
3462
3463    aExp -= 0x7F;
3464    aSig |= 0x00800000;
3465    zSign = aExp < 0;
3466    zSig = aExp << 23;
3467
3468    for (i = 1 << 22; i > 0; i >>= 1) {
3469        aSig = ( (uint64_t)aSig * aSig ) >> 23;
3470        if ( aSig & 0x01000000 ) {
3471            aSig >>= 1;
3472            zSig |= i;
3473        }
3474    }
3475
3476    if ( zSign )
3477        zSig = -zSig;
3478
3479    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3480}
3481
3482/*----------------------------------------------------------------------------
3483| Returns 1 if the single-precision floating-point value `a' is equal to
3484| the corresponding value `b', and 0 otherwise.  The invalid exception is
3485| raised if either operand is a NaN.  Otherwise, the comparison is performed
3486| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3487*----------------------------------------------------------------------------*/
3488
3489int float32_eq(float32 a, float32 b, float_status *status)
3490{
3491    uint32_t av, bv;
3492    a = float32_squash_input_denormal(a, status);
3493    b = float32_squash_input_denormal(b, status);
3494
3495    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3496         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3497       ) {
3498        float_raise(float_flag_invalid, status);
3499        return 0;
3500    }
3501    av = float32_val(a);
3502    bv = float32_val(b);
3503    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3504}
3505
3506/*----------------------------------------------------------------------------
3507| Returns 1 if the single-precision floating-point value `a' is less than
3508| or equal to the corresponding value `b', and 0 otherwise.  The invalid
3509| exception is raised if either operand is a NaN.  The comparison is performed
3510| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3511*----------------------------------------------------------------------------*/
3512
3513int float32_le(float32 a, float32 b, float_status *status)
3514{
3515    flag aSign, bSign;
3516    uint32_t av, bv;
3517    a = float32_squash_input_denormal(a, status);
3518    b = float32_squash_input_denormal(b, status);
3519
3520    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3521         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3522       ) {
3523        float_raise(float_flag_invalid, status);
3524        return 0;
3525    }
3526    aSign = extractFloat32Sign( a );
3527    bSign = extractFloat32Sign( b );
3528    av = float32_val(a);
3529    bv = float32_val(b);
3530    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3531    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3532
3533}
3534
3535/*----------------------------------------------------------------------------
3536| Returns 1 if the single-precision floating-point value `a' is less than
3537| the corresponding value `b', and 0 otherwise.  The invalid exception is
3538| raised if either operand is a NaN.  The comparison is performed according
3539| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540*----------------------------------------------------------------------------*/
3541
3542int float32_lt(float32 a, float32 b, float_status *status)
3543{
3544    flag aSign, bSign;
3545    uint32_t av, bv;
3546    a = float32_squash_input_denormal(a, status);
3547    b = float32_squash_input_denormal(b, status);
3548
3549    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551       ) {
3552        float_raise(float_flag_invalid, status);
3553        return 0;
3554    }
3555    aSign = extractFloat32Sign( a );
3556    bSign = extractFloat32Sign( b );
3557    av = float32_val(a);
3558    bv = float32_val(b);
3559    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3560    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3561
3562}
3563
3564/*----------------------------------------------------------------------------
3565| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3566| be compared, and 0 otherwise.  The invalid exception is raised if either
3567| operand is a NaN.  The comparison is performed according to the IEC/IEEE
3568| Standard for Binary Floating-Point Arithmetic.
3569*----------------------------------------------------------------------------*/
3570
3571int float32_unordered(float32 a, float32 b, float_status *status)
3572{
3573    a = float32_squash_input_denormal(a, status);
3574    b = float32_squash_input_denormal(b, status);
3575
3576    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3577         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3578       ) {
3579        float_raise(float_flag_invalid, status);
3580        return 1;
3581    }
3582    return 0;
3583}
3584
3585/*----------------------------------------------------------------------------
3586| Returns 1 if the single-precision floating-point value `a' is equal to
3587| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3588| exception.  The comparison is performed according to the IEC/IEEE Standard
3589| for Binary Floating-Point Arithmetic.
3590*----------------------------------------------------------------------------*/
3591
3592int float32_eq_quiet(float32 a, float32 b, float_status *status)
3593{
3594    a = float32_squash_input_denormal(a, status);
3595    b = float32_squash_input_denormal(b, status);
3596
3597    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3598         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3599       ) {
3600        if (float32_is_signaling_nan(a, status)
3601         || float32_is_signaling_nan(b, status)) {
3602            float_raise(float_flag_invalid, status);
3603        }
3604        return 0;
3605    }
3606    return ( float32_val(a) == float32_val(b) ) ||
3607            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3608}
3609
3610/*----------------------------------------------------------------------------
3611| Returns 1 if the single-precision floating-point value `a' is less than or
3612| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3613| cause an exception.  Otherwise, the comparison is performed according to the
3614| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3615*----------------------------------------------------------------------------*/
3616
3617int float32_le_quiet(float32 a, float32 b, float_status *status)
3618{
3619    flag aSign, bSign;
3620    uint32_t av, bv;
3621    a = float32_squash_input_denormal(a, status);
3622    b = float32_squash_input_denormal(b, status);
3623
3624    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3625         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3626       ) {
3627        if (float32_is_signaling_nan(a, status)
3628         || float32_is_signaling_nan(b, status)) {
3629            float_raise(float_flag_invalid, status);
3630        }
3631        return 0;
3632    }
3633    aSign = extractFloat32Sign( a );
3634    bSign = extractFloat32Sign( b );
3635    av = float32_val(a);
3636    bv = float32_val(b);
3637    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3638    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3639
3640}
3641
3642/*----------------------------------------------------------------------------
3643| Returns 1 if the single-precision floating-point value `a' is less than
3644| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3645| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3646| Standard for Binary Floating-Point Arithmetic.
3647*----------------------------------------------------------------------------*/
3648
3649int float32_lt_quiet(float32 a, float32 b, float_status *status)
3650{
3651    flag aSign, bSign;
3652    uint32_t av, bv;
3653    a = float32_squash_input_denormal(a, status);
3654    b = float32_squash_input_denormal(b, status);
3655
3656    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3657         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3658       ) {
3659        if (float32_is_signaling_nan(a, status)
3660         || float32_is_signaling_nan(b, status)) {
3661            float_raise(float_flag_invalid, status);
3662        }
3663        return 0;
3664    }
3665    aSign = extractFloat32Sign( a );
3666    bSign = extractFloat32Sign( b );
3667    av = float32_val(a);
3668    bv = float32_val(b);
3669    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3670    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3671
3672}
3673
3674/*----------------------------------------------------------------------------
3675| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3676| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3677| comparison is performed according to the IEC/IEEE Standard for Binary
3678| Floating-Point Arithmetic.
3679*----------------------------------------------------------------------------*/
3680
3681int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3682{
3683    a = float32_squash_input_denormal(a, status);
3684    b = float32_squash_input_denormal(b, status);
3685
3686    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3687         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3688       ) {
3689        if (float32_is_signaling_nan(a, status)
3690         || float32_is_signaling_nan(b, status)) {
3691            float_raise(float_flag_invalid, status);
3692        }
3693        return 1;
3694    }
3695    return 0;
3696}
3697
3698
3699/*----------------------------------------------------------------------------
3700| Returns the result of converting the double-precision floating-point value
3701| `a' to the single-precision floating-point format.  The conversion is
3702| performed according to the IEC/IEEE Standard for Binary Floating-Point
3703| Arithmetic.
3704*----------------------------------------------------------------------------*/
3705
3706float32 float64_to_float32(float64 a, float_status *status)
3707{
3708    flag aSign;
3709    int aExp;
3710    uint64_t aSig;
3711    uint32_t zSig;
3712    a = float64_squash_input_denormal(a, status);
3713
3714    aSig = extractFloat64Frac( a );
3715    aExp = extractFloat64Exp( a );
3716    aSign = extractFloat64Sign( a );
3717    if ( aExp == 0x7FF ) {
3718        if (aSig) {
3719            return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3720        }
3721        return packFloat32( aSign, 0xFF, 0 );
3722    }
3723    shift64RightJamming( aSig, 22, &aSig );
3724    zSig = aSig;
3725    if ( aExp || zSig ) {
3726        zSig |= 0x40000000;
3727        aExp -= 0x381;
3728    }
3729    return roundAndPackFloat32(aSign, aExp, zSig, status);
3730
3731}
3732
3733
3734/*----------------------------------------------------------------------------
3735| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3736| half-precision floating-point value, returning the result.  After being
3737| shifted into the proper positions, the three fields are simply added
3738| together to form the result.  This means that any integer portion of `zSig'
3739| will be added into the exponent.  Since a properly normalized significand
3740| will have an integer portion equal to 1, the `zExp' input should be 1 less
3741| than the desired result exponent whenever `zSig' is a complete, normalized
3742| significand.
3743*----------------------------------------------------------------------------*/
3744static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3745{
3746    return make_float16(
3747        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3748}
3749
3750/*----------------------------------------------------------------------------
3751| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3752| and significand `zSig', and returns the proper half-precision floating-
3753| point value corresponding to the abstract input.  Ordinarily, the abstract
3754| value is simply rounded and packed into the half-precision format, with
3755| the inexact exception raised if the abstract input cannot be represented
3756| exactly.  However, if the abstract value is too large, the overflow and
3757| inexact exceptions are raised and an infinity or maximal finite value is
3758| returned.  If the abstract value is too small, the input value is rounded to
3759| a subnormal number, and the underflow and inexact exceptions are raised if
3760| the abstract input cannot be represented exactly as a subnormal half-
3761| precision floating-point number.
3762| The `ieee' flag indicates whether to use IEEE standard half precision, or
3763| ARM-style "alternative representation", which omits the NaN and Inf
3764| encodings in order to raise the maximum representable exponent by one.
3765|     The input significand `zSig' has its binary point between bits 22
3766| and 23, which is 13 bits to the left of the usual location.  This shifted
3767| significand must be normalized or smaller.  If `zSig' is not normalized,
3768| `zExp' must be 0; in that case, the result returned is a subnormal number,
3769| and it must not require rounding.  In the usual case that `zSig' is
3770| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3771| Note the slightly odd position of the binary point in zSig compared with the
3772| other roundAndPackFloat functions. This should probably be fixed if we
3773| need to implement more float16 routines than just conversion.
3774| The handling of underflow and overflow follows the IEC/IEEE Standard for
3775| Binary Floating-Point Arithmetic.
3776*----------------------------------------------------------------------------*/
3777
3778static float16 roundAndPackFloat16(flag zSign, int zExp,
3779                                   uint32_t zSig, flag ieee,
3780                                   float_status *status)
3781{
3782    int maxexp = ieee ? 29 : 30;
3783    uint32_t mask;
3784    uint32_t increment;
3785    bool rounding_bumps_exp;
3786    bool is_tiny = false;
3787
3788    /* Calculate the mask of bits of the mantissa which are not
3789     * representable in half-precision and will be lost.
3790     */
3791    if (zExp < 1) {
3792        /* Will be denormal in halfprec */
3793        mask = 0x00ffffff;
3794        if (zExp >= -11) {
3795            mask >>= 11 + zExp;
3796        }
3797    } else {
3798        /* Normal number in halfprec */
3799        mask = 0x00001fff;
3800    }
3801
3802    switch (status->float_rounding_mode) {
3803    case float_round_nearest_even:
3804        increment = (mask + 1) >> 1;
3805        if ((zSig & mask) == increment) {
3806            increment = zSig & (increment << 1);
3807        }
3808        break;
3809    case float_round_ties_away:
3810        increment = (mask + 1) >> 1;
3811        break;
3812    case float_round_up:
3813        increment = zSign ? 0 : mask;
3814        break;
3815    case float_round_down:
3816        increment = zSign ? mask : 0;
3817        break;
3818    default: /* round_to_zero */
3819        increment = 0;
3820        break;
3821    }
3822
3823    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3824
3825    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3826        if (ieee) {
3827            float_raise(float_flag_overflow | float_flag_inexact, status);
3828            return packFloat16(zSign, 0x1f, 0);
3829        } else {
3830            float_raise(float_flag_invalid, status);
3831            return packFloat16(zSign, 0x1f, 0x3ff);
3832        }
3833    }
3834
3835    if (zExp < 0) {
3836        /* Note that flush-to-zero does not affect half-precision results */
3837        is_tiny =
3838            (status->float_detect_tininess == float_tininess_before_rounding)
3839            || (zExp < -1)
3840            || (!rounding_bumps_exp);
3841    }
3842    if (zSig & mask) {
3843        float_raise(float_flag_inexact, status);
3844        if (is_tiny) {
3845            float_raise(float_flag_underflow, status);
3846        }
3847    }
3848
3849    zSig += increment;
3850    if (rounding_bumps_exp) {
3851        zSig >>= 1;
3852        zExp++;
3853    }
3854
3855    if (zExp < -10) {
3856        return packFloat16(zSign, 0, 0);
3857    }
3858    if (zExp < 0) {
3859        zSig >>= -zExp;
3860        zExp = 0;
3861    }
3862    return packFloat16(zSign, zExp, zSig >> 13);
3863}
3864
3865/*----------------------------------------------------------------------------
3866| If `a' is denormal and we are in flush-to-zero mode then set the
3867| input-denormal exception and return zero. Otherwise just return the value.
3868*----------------------------------------------------------------------------*/
3869float16 float16_squash_input_denormal(float16 a, float_status *status)
3870{
3871    if (status->flush_inputs_to_zero) {
3872        if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3873            float_raise(float_flag_input_denormal, status);
3874            return make_float16(float16_val(a) & 0x8000);
3875        }
3876    }
3877    return a;
3878}
3879
3880static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3881                                      uint32_t *zSigPtr)
3882{
3883    int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3884    *zSigPtr = aSig << shiftCount;
3885    *zExpPtr = 1 - shiftCount;
3886}
3887
3888/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3889   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3890
3891float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3892{
3893    flag aSign;
3894    int aExp;
3895    uint32_t aSig;
3896
3897    aSign = extractFloat16Sign(a);
3898    aExp = extractFloat16Exp(a);
3899    aSig = extractFloat16Frac(a);
3900
3901    if (aExp == 0x1f && ieee) {
3902        if (aSig) {
3903            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3904        }
3905        return packFloat32(aSign, 0xff, 0);
3906    }
3907    if (aExp == 0) {
3908        if (aSig == 0) {
3909            return packFloat32(aSign, 0, 0);
3910        }
3911
3912        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3913        aExp--;
3914    }
3915    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3916}
3917
3918float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3919{
3920    flag aSign;
3921    int aExp;
3922    uint32_t aSig;
3923
3924    a = float32_squash_input_denormal(a, status);
3925
3926    aSig = extractFloat32Frac( a );
3927    aExp = extractFloat32Exp( a );
3928    aSign = extractFloat32Sign( a );
3929    if ( aExp == 0xFF ) {
3930        if (aSig) {
3931            /* Input is a NaN */
3932            if (!ieee) {
3933                float_raise(float_flag_invalid, status);
3934                return packFloat16(aSign, 0, 0);
3935            }
3936            return commonNaNToFloat16(
3937                float32ToCommonNaN(a, status), status);
3938        }
3939        /* Infinity */
3940        if (!ieee) {
3941            float_raise(float_flag_invalid, status);
3942            return packFloat16(aSign, 0x1f, 0x3ff);
3943        }
3944        return packFloat16(aSign, 0x1f, 0);
3945    }
3946    if (aExp == 0 && aSig == 0) {
3947        return packFloat16(aSign, 0, 0);
3948    }
3949    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3950     * even if the input is denormal; however this is harmless because
3951     * the largest possible single-precision denormal is still smaller
3952     * than the smallest representable half-precision denormal, and so we
3953     * will end up ignoring aSig and returning via the "always return zero"
3954     * codepath.
3955     */
3956    aSig |= 0x00800000;
3957    aExp -= 0x71;
3958
3959    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3960}
3961
3962float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3963{
3964    flag aSign;
3965    int aExp;
3966    uint32_t aSig;
3967
3968    aSign = extractFloat16Sign(a);
3969    aExp = extractFloat16Exp(a);
3970    aSig = extractFloat16Frac(a);
3971
3972    if (aExp == 0x1f && ieee) {
3973        if (aSig) {
3974            return commonNaNToFloat64(
3975                float16ToCommonNaN(a, status), status);
3976        }
3977        return packFloat64(aSign, 0x7ff, 0);
3978    }
3979    if (aExp == 0) {
3980        if (aSig == 0) {
3981            return packFloat64(aSign, 0, 0);
3982        }
3983
3984        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3985        aExp--;
3986    }
3987    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3988}
3989
3990float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3991{
3992    flag aSign;
3993    int aExp;
3994    uint64_t aSig;
3995    uint32_t zSig;
3996
3997    a = float64_squash_input_denormal(a, status);
3998
3999    aSig = extractFloat64Frac(a);
4000    aExp = extractFloat64Exp(a);

4001    aSign = extractFloat64Sign(a);
4002    if (aExp == 0x7FF) {
4003        if (aSig) {
4004            /* Input is a NaN */
4005            if (!ieee) {
4006                float_raise(float_flag_invalid, status);
4007                return packFloat16(aSign, 0, 0);
4008            }
4009            return commonNaNToFloat16(
4010                float64ToCommonNaN(a, status), status);
4011        }
4012        /* Infinity */
4013        if (!ieee) {
4014            float_raise(float_flag_invalid, status);
4015            return packFloat16(aSign, 0x1f, 0x3ff);
4016        }
4017        return packFloat16(aSign, 0x1f, 0);
4018    }
4019    shift64RightJamming(aSig, 29, &aSig);
4020    zSig = aSig;
4021    if (aExp == 0 && zSig == 0) {
4022        return packFloat16(aSign, 0, 0);
4023    }
4024    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4025     * even if the input is denormal; however this is harmless because
4026     * the largest possible single-precision denormal is still smaller
4027     * than the smallest representable half-precision denormal, and so we
4028     * will end up ignoring aSig and returning via the "always return zero"
4029     * codepath.
4030     */
4031    zSig |= 0x00800000;
4032    aExp -= 0x3F1;
4033
4034    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4035}
4036
4037/*----------------------------------------------------------------------------
4038| Returns the result of converting the double-precision floating-point value
4039| `a' to the extended double-precision floating-point format.  The conversion
4040| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4041| Arithmetic.
4042*----------------------------------------------------------------------------*/
4043
4044floatx80 float64_to_floatx80(float64 a, float_status *status)
4045{
4046    flag aSign;
4047    int aExp;
4048    uint64_t aSig;
4049
4050    a = float64_squash_input_denormal(a, status);
4051    aSig = extractFloat64Frac( a );
4052    aExp = extractFloat64Exp( a );
4053    aSign = extractFloat64Sign( a );
4054    if ( aExp == 0x7FF ) {
4055        if (aSig) {
4056            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4057        }
4058        return packFloatx80(aSign,
4059                            floatx80_infinity_high,
4060                            floatx80_infinity_low);
4061    }
4062    if ( aExp == 0 ) {
4063        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4064        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4065    }
4066    return
4067        packFloatx80(
4068            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4069
4070}
4071
4072/*----------------------------------------------------------------------------
4073| Returns the result of converting the double-precision floating-point value
4074| `a' to the quadruple-precision floating-point format.  The conversion is
4075| performed according to the IEC/IEEE Standard for Binary Floating-Point
4076| Arithmetic.
4077*----------------------------------------------------------------------------*/
4078
4079float128 float64_to_float128(float64 a, float_status *status)
4080{
4081    flag aSign;
4082    int aExp;
4083    uint64_t aSig, zSig0, zSig1;
4084
4085    a = float64_squash_input_denormal(a, status);
4086    aSig = extractFloat64Frac( a );
4087    aExp = extractFloat64Exp( a );
4088    aSign = extractFloat64Sign( a );
4089    if ( aExp == 0x7FF ) {
4090        if (aSig) {
4091            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4092        }
4093        return packFloat128( aSign, 0x7FFF, 0, 0 );
4094    }
4095    if ( aExp == 0 ) {
4096        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4097        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4098        --aExp;
4099    }
4100    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4101    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4102
4103}
4104
4105
4106/*----------------------------------------------------------------------------
4107| Returns the remainder of the double-precision floating-point value `a'
4108| with respect to the corresponding value `b'.  The operation is performed
4109| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110*----------------------------------------------------------------------------*/
4111
4112float64 float64_rem(float64 a, float64 b, float_status *status)
4113{
4114    flag aSign, zSign;
4115    int aExp, bExp, expDiff;
4116    uint64_t aSig, bSig;
4117    uint64_t q, alternateASig;
4118    int64_t sigMean;
4119
4120    a = float64_squash_input_denormal(a, status);
4121    b = float64_squash_input_denormal(b, status);
4122    aSig = extractFloat64Frac( a );
4123    aExp = extractFloat64Exp( a );
4124    aSign = extractFloat64Sign( a );
4125    bSig = extractFloat64Frac( b );
4126    bExp = extractFloat64Exp( b );
4127    if ( aExp == 0x7FF ) {
4128        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4129            return propagateFloat64NaN(a, b, status);
4130        }
4131        float_raise(float_flag_invalid, status);
4132        return float64_default_nan(status);
4133    }
4134    if ( bExp == 0x7FF ) {
4135        if (bSig) {
4136            return propagateFloat64NaN(a, b, status);
4137        }
4138        return a;
4139    }
4140    if ( bExp == 0 ) {
4141        if ( bSig == 0 ) {
4142            float_raise(float_flag_invalid, status);
4143            return float64_default_nan(status);
4144        }
4145        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4146    }
4147    if ( aExp == 0 ) {
4148        if ( aSig == 0 ) return a;
4149        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4150    }
4151    expDiff = aExp - bExp;
4152    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4153    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4154    if ( expDiff < 0 ) {
4155        if ( expDiff < -1 ) return a;
4156        aSig >>= 1;
4157    }
4158    q = ( bSig <= aSig );
4159    if ( q ) aSig -= bSig;
4160    expDiff -= 64;
4161    while ( 0 < expDiff ) {
4162        q = estimateDiv128To64( aSig, 0, bSig );
4163        q = ( 2 < q ) ? q - 2 : 0;
4164        aSig = - ( ( bSig>>2 ) * q );
4165        expDiff -= 62;
4166    }
4167    expDiff += 64;
4168    if ( 0 < expDiff ) {
4169        q = estimateDiv128To64( aSig, 0, bSig );
4170        q = ( 2 < q ) ? q - 2 : 0;
4171        q >>= 64 - expDiff;
4172        bSig >>= 2;
4173        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4174    }
4175    else {
4176        aSig >>= 2;
4177        bSig >>= 2;
4178    }
4179    do {
4180        alternateASig = aSig;
4181        ++q;
4182        aSig -= bSig;
4183    } while ( 0 <= (int64_t) aSig );
4184    sigMean = aSig + alternateASig;
4185    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4186        aSig = alternateASig;
4187    }
4188    zSign = ( (int64_t) aSig < 0 );
4189    if ( zSign ) aSig = - aSig;
4190    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4191
4192}
4193
4194/*----------------------------------------------------------------------------
4195| Returns the binary log of the double-precision floating-point value `a'.
4196| The operation is performed according to the IEC/IEEE Standard for Binary
4197| Floating-Point Arithmetic.
4198*----------------------------------------------------------------------------*/
4199float64 float64_log2(float64 a, float_status *status)
4200{
4201    flag aSign, zSign;
4202    int aExp;
4203    uint64_t aSig, aSig0, aSig1, zSig, i;
4204    a = float64_squash_input_denormal(a, status);
4205
4206    aSig = extractFloat64Frac( a );
4207    aExp = extractFloat64Exp( a );
4208    aSign = extractFloat64Sign( a );
4209
4210    if ( aExp == 0 ) {
4211        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4212        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4213    }
4214    if ( aSign ) {
4215        float_raise(float_flag_invalid, status);
4216        return float64_default_nan(status);
4217    }
4218    if ( aExp == 0x7FF ) {
4219        if (aSig) {
4220            return propagateFloat64NaN(a, float64_zero, status);
4221        }
4222        return a;
4223    }
4224
4225    aExp -= 0x3FF;
4226    aSig |= LIT64( 0x0010000000000000 );
4227    zSign = aExp < 0;
4228    zSig = (uint64_t)aExp << 52;
4229    for (i = 1LL << 51; i > 0; i >>= 1) {
4230        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4231        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4232        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4233            aSig >>= 1;
4234            zSig |= i;
4235        }
4236    }
4237
4238    if ( zSign )
4239        zSig = -zSig;
4240    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4241}
4242
4243/*----------------------------------------------------------------------------
4244| Returns 1 if the double-precision floating-point value `a' is equal to the
4245| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4246| if either operand is a NaN.  Otherwise, the comparison is performed
4247| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4248*----------------------------------------------------------------------------*/
4249
4250int float64_eq(float64 a, float64 b, float_status *status)
4251{
4252    uint64_t av, bv;
4253    a = float64_squash_input_denormal(a, status);
4254    b = float64_squash_input_denormal(b, status);
4255
4256    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4257         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4258       ) {
4259        float_raise(float_flag_invalid, status);
4260        return 0;
4261    }
4262    av = float64_val(a);
4263    bv = float64_val(b);
4264    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4265
4266}
4267
4268/*----------------------------------------------------------------------------
4269| Returns 1 if the double-precision floating-point value `a' is less than or
4270| equal to the corresponding value `b', and 0 otherwise.  The invalid
4271| exception is raised if either operand is a NaN.  The comparison is performed
4272| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273*----------------------------------------------------------------------------*/
4274
4275int float64_le(float64 a, float64 b, float_status *status)
4276{
4277    flag aSign, bSign;
4278    uint64_t av, bv;
4279    a = float64_squash_input_denormal(a, status);
4280    b = float64_squash_input_denormal(b, status);
4281
4282    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4284       ) {
4285        float_raise(float_flag_invalid, status);
4286        return 0;
4287    }
4288    aSign = extractFloat64Sign( a );
4289    bSign = extractFloat64Sign( b );
4290    av = float64_val(a);
4291    bv = float64_val(b);
4292    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4293    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4294
4295}
4296
4297/*----------------------------------------------------------------------------
4298| Returns 1 if the double-precision floating-point value `a' is less than
4299| the corresponding value `b', and 0 otherwise.  The invalid exception is
4300| raised if either operand is a NaN.  The comparison is performed according
4301| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4302*----------------------------------------------------------------------------*/
4303
4304int float64_lt(float64 a, float64 b, float_status *status)
4305{
4306    flag aSign, bSign;
4307    uint64_t av, bv;
4308
4309    a = float64_squash_input_denormal(a, status);
4310    b = float64_squash_input_denormal(b, status);
4311    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4312         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4313       ) {
4314        float_raise(float_flag_invalid, status);
4315        return 0;
4316    }
4317    aSign = extractFloat64Sign( a );
4318    bSign = extractFloat64Sign( b );
4319    av = float64_val(a);
4320    bv = float64_val(b);
4321    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4322    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4323
4324}
4325
4326/*----------------------------------------------------------------------------
4327| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4328| be compared, and 0 otherwise.  The invalid exception is raised if either
4329| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4330| Standard for Binary Floating-Point Arithmetic.
4331*----------------------------------------------------------------------------*/
4332
4333int float64_unordered(float64 a, float64 b, float_status *status)
4334{
4335    a = float64_squash_input_denormal(a, status);
4336    b = float64_squash_input_denormal(b, status);
4337
4338    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4339         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4340       ) {
4341        float_raise(float_flag_invalid, status);
4342        return 1;
4343    }
4344    return 0;
4345}
4346
4347/*----------------------------------------------------------------------------
4348| Returns 1 if the double-precision floating-point value `a' is equal to the
4349| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4350| exception.The comparison is performed according to the IEC/IEEE Standard
4351| for Binary Floating-Point Arithmetic.
4352*----------------------------------------------------------------------------*/
4353
4354int float64_eq_quiet(float64 a, float64 b, float_status *status)
4355{
4356    uint64_t av, bv;
4357    a = float64_squash_input_denormal(a, status);
4358    b = float64_squash_input_denormal(b, status);
4359
4360    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4361         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4362       ) {
4363        if (float64_is_signaling_nan(a, status)
4364         || float64_is_signaling_nan(b, status)) {
4365            float_raise(float_flag_invalid, status);
4366        }
4367        return 0;
4368    }
4369    av = float64_val(a);
4370    bv = float64_val(b);
4371    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4372
4373}
4374
4375/*----------------------------------------------------------------------------
4376| Returns 1 if the double-precision floating-point value `a' is less than or
4377| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4378| cause an exception.  Otherwise, the comparison is performed according to the
4379| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380*----------------------------------------------------------------------------*/
4381
4382int float64_le_quiet(float64 a, float64 b, float_status *status)
4383{
4384    flag aSign, bSign;
4385    uint64_t av, bv;
4386    a = float64_squash_input_denormal(a, status);
4387    b = float64_squash_input_denormal(b, status);
4388
4389    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4391       ) {
4392        if (float64_is_signaling_nan(a, status)
4393         || float64_is_signaling_nan(b, status)) {
4394            float_raise(float_flag_invalid, status);
4395        }
4396        return 0;
4397    }
4398    aSign = extractFloat64Sign( a );
4399    bSign = extractFloat64Sign( b );
4400    av = float64_val(a);
4401    bv = float64_val(b);
4402    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4403    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4404
4405}
4406
4407/*----------------------------------------------------------------------------
4408| Returns 1 if the double-precision floating-point value `a' is less than
4409| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4410| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4411| Standard for Binary Floating-Point Arithmetic.
4412*----------------------------------------------------------------------------*/
4413
4414int float64_lt_quiet(float64 a, float64 b, float_status *status)
4415{
4416    flag aSign, bSign;
4417    uint64_t av, bv;
4418    a = float64_squash_input_denormal(a, status);
4419    b = float64_squash_input_denormal(b, status);
4420
4421    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4422         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4423       ) {
4424        if (float64_is_signaling_nan(a, status)
4425         || float64_is_signaling_nan(b, status)) {
4426            float_raise(float_flag_invalid, status);
4427        }
4428        return 0;
4429    }
4430    aSign = extractFloat64Sign( a );
4431    bSign = extractFloat64Sign( b );
4432    av = float64_val(a);
4433    bv = float64_val(b);
4434    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4435    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4436
4437}
4438
4439/*----------------------------------------------------------------------------
4440| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4441| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4442| comparison is performed according to the IEC/IEEE Standard for Binary
4443| Floating-Point Arithmetic.
4444*----------------------------------------------------------------------------*/
4445
4446int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4447{
4448    a = float64_squash_input_denormal(a, status);
4449    b = float64_squash_input_denormal(b, status);
4450
4451    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4452         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4453       ) {
4454        if (float64_is_signaling_nan(a, status)
4455         || float64_is_signaling_nan(b, status)) {
4456            float_raise(float_flag_invalid, status);
4457        }
4458        return 1;
4459    }
4460    return 0;
4461}
4462
4463/*----------------------------------------------------------------------------
4464| Returns the result of converting the extended double-precision floating-
4465| point value `a' to the 32-bit two's complement integer format.  The
4466| conversion is performed according to the IEC/IEEE Standard for Binary
4467| Floating-Point Arithmetic---which means in particular that the conversion
4468| is rounded according to the current rounding mode.  If `a' is a NaN, the
4469| largest positive integer is returned.  Otherwise, if the conversion
4470| overflows, the largest integer with the same sign as `a' is returned.
4471*----------------------------------------------------------------------------*/
4472
4473int32_t floatx80_to_int32(floatx80 a, float_status *status)
4474{
4475    flag aSign;
4476    int32_t aExp, shiftCount;
4477    uint64_t aSig;
4478
4479    if (floatx80_invalid_encoding(a)) {
4480        float_raise(float_flag_invalid, status);
4481        return 1 << 31;
4482    }
4483    aSig = extractFloatx80Frac( a );
4484    aExp = extractFloatx80Exp( a );
4485    aSign = extractFloatx80Sign( a );
4486    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4487    shiftCount = 0x4037 - aExp;
4488    if ( shiftCount <= 0 ) shiftCount = 1;
4489    shift64RightJamming( aSig, shiftCount, &aSig );
4490    return roundAndPackInt32(aSign, aSig, status);
4491
4492}
4493
4494/*----------------------------------------------------------------------------
4495| Returns the result of converting the extended double-precision floating-
4496| point value `a' to the 32-bit two's complement integer format.  The
4497| conversion is performed according to the IEC/IEEE Standard for Binary
4498| Floating-Point Arithmetic, except that the conversion is always rounded
4499| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4500| Otherwise, if the conversion overflows, the largest integer with the same
4501| sign as `a' is returned.
4502*----------------------------------------------------------------------------*/
4503
4504int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4505{
4506    flag aSign;
4507    int32_t aExp, shiftCount;
4508    uint64_t aSig, savedASig;
4509    int32_t z;
4510
4511    if (floatx80_invalid_encoding(a)) {
4512        float_raise(float_flag_invalid, status);
4513        return 1 << 31;
4514    }
4515    aSig = extractFloatx80Frac( a );
4516    aExp = extractFloatx80Exp( a );
4517    aSign = extractFloatx80Sign( a );
4518    if ( 0x401E < aExp ) {
4519        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4520        goto invalid;
4521    }
4522    else if ( aExp < 0x3FFF ) {
4523        if (aExp || aSig) {
4524            status->float_exception_flags |= float_flag_inexact;
4525        }
4526        return 0;
4527    }
4528    shiftCount = 0x403E - aExp;
4529    savedASig = aSig;
4530    aSig >>= shiftCount;
4531    z = aSig;
4532    if ( aSign ) z = - z;
4533    if ( ( z < 0 ) ^ aSign ) {
4534 invalid:
4535        float_raise(float_flag_invalid, status);
4536        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4537    }
4538    if ( ( aSig<<shiftCount ) != savedASig ) {
4539        status->float_exception_flags |= float_flag_inexact;
4540    }
4541    return z;
4542
4543}
4544
4545/*----------------------------------------------------------------------------
4546| Returns the result of converting the extended double-precision floating-
4547| point value `a' to the 64-bit two's complement integer format.  The
4548| conversion is performed according to the IEC/IEEE Standard for Binary
4549| Floating-Point Arithmetic---which means in particular that the conversion
4550| is rounded according to the current rounding mode.  If `a' is a NaN,
4551| the largest positive integer is returned.  Otherwise, if the conversion
4552| overflows, the largest integer with the same sign as `a' is returned.
4553*----------------------------------------------------------------------------*/
4554
4555int64_t floatx80_to_int64(floatx80 a, float_status *status)
4556{
4557    flag aSign;
4558    int32_t aExp, shiftCount;
4559    uint64_t aSig, aSigExtra;
4560
4561    if (floatx80_invalid_encoding(a)) {
4562        float_raise(float_flag_invalid, status);
4563        return 1ULL << 63;
4564    }
4565    aSig = extractFloatx80Frac( a );
4566    aExp = extractFloatx80Exp( a );
4567    aSign = extractFloatx80Sign( a );
4568    shiftCount = 0x403E - aExp;
4569    if ( shiftCount <= 0 ) {
4570        if ( shiftCount ) {
4571            float_raise(float_flag_invalid, status);
4572            if (!aSign || floatx80_is_any_nan(a)) {
4573                return LIT64( 0x7FFFFFFFFFFFFFFF );
4574            }
4575            return (int64_t) LIT64( 0x8000000000000000 );
4576        }
4577        aSigExtra = 0;
4578    }
4579    else {
4580        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4581    }
4582    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4583
4584}
4585
4586/*----------------------------------------------------------------------------
4587| Returns the result of converting the extended double-precision floating-
4588| point value `a' to the 64-bit two's complement integer format.  The
4589| conversion is performed according to the IEC/IEEE Standard for Binary
4590| Floating-Point Arithmetic, except that the conversion is always rounded
4591| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4592| Otherwise, if the conversion overflows, the largest integer with the same
4593| sign as `a' is returned.
4594*----------------------------------------------------------------------------*/
4595
4596int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4597{
4598    flag aSign;
4599    int32_t aExp, shiftCount;
4600    uint64_t aSig;
4601    int64_t z;
4602
4603    if (floatx80_invalid_encoding(a)) {
4604        float_raise(float_flag_invalid, status);
4605        return 1ULL << 63;
4606    }
4607    aSig = extractFloatx80Frac( a );
4608    aExp = extractFloatx80Exp( a );
4609    aSign = extractFloatx80Sign( a );
4610    shiftCount = aExp - 0x403E;
4611    if ( 0 <= shiftCount ) {
4612        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4613        if ( ( a.high != 0xC03E ) || aSig ) {
4614            float_raise(float_flag_invalid, status);
4615            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4616                return LIT64( 0x7FFFFFFFFFFFFFFF );
4617            }
4618        }
4619        return (int64_t) LIT64( 0x8000000000000000 );
4620    }
4621    else if ( aExp < 0x3FFF ) {
4622        if (aExp | aSig) {
4623            status->float_exception_flags |= float_flag_inexact;
4624        }
4625        return 0;
4626    }
4627    z = aSig>>( - shiftCount );
4628    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4629        status->float_exception_flags |= float_flag_inexact;
4630    }
4631    if ( aSign ) z = - z;
4632    return z;
4633
4634}
4635
4636/*----------------------------------------------------------------------------
4637| Returns the result of converting the extended double-precision floating-
4638| point value `a' to the single-precision floating-point format.  The
4639| conversion is performed according to the IEC/IEEE Standard for Binary
4640| Floating-Point Arithmetic.
4641*----------------------------------------------------------------------------*/
4642
4643float32 floatx80_to_float32(floatx80 a, float_status *status)
4644{
4645    flag aSign;
4646    int32_t aExp;
4647    uint64_t aSig;
4648
4649    if (floatx80_invalid_encoding(a)) {
4650        float_raise(float_flag_invalid, status);
4651        return float32_default_nan(status);
4652    }
4653    aSig = extractFloatx80Frac( a );
4654    aExp = extractFloatx80Exp( a );
4655    aSign = extractFloatx80Sign( a );
4656    if ( aExp == 0x7FFF ) {
4657        if ( (uint64_t) ( aSig<<1 ) ) {
4658            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4659        }
4660        return packFloat32( aSign, 0xFF, 0 );
4661    }
4662    shift64RightJamming( aSig, 33, &aSig );
4663    if ( aExp || aSig ) aExp -= 0x3F81;
4664    return roundAndPackFloat32(aSign, aExp, aSig, status);
4665
4666}
4667
4668/*----------------------------------------------------------------------------
4669| Returns the result of converting the extended double-precision floating-
4670| point value `a' to the double-precision floating-point format.  The
4671| conversion is performed according to the IEC/IEEE Standard for Binary
4672| Floating-Point Arithmetic.
4673*----------------------------------------------------------------------------*/
4674
4675float64 floatx80_to_float64(floatx80 a, float_status *status)
4676{
4677    flag aSign;
4678    int32_t aExp;
4679    uint64_t aSig, zSig;
4680
4681    if (floatx80_invalid_encoding(a)) {
4682        float_raise(float_flag_invalid, status);
4683        return float64_default_nan(status);
4684    }
4685    aSig = extractFloatx80Frac( a );
4686    aExp = extractFloatx80Exp( a );
4687    aSign = extractFloatx80Sign( a );
4688    if ( aExp == 0x7FFF ) {
4689        if ( (uint64_t) ( aSig<<1 ) ) {
4690            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4691        }
4692        return packFloat64( aSign, 0x7FF, 0 );
4693    }
4694    shift64RightJamming( aSig, 1, &zSig );
4695    if ( aExp || aSig ) aExp -= 0x3C01;
4696    return roundAndPackFloat64(aSign, aExp, zSig, status);
4697
4698}
4699
4700/*----------------------------------------------------------------------------
4701| Returns the result of converting the extended double-precision floating-
4702| point value `a' to the quadruple-precision floating-point format.  The
4703| conversion is performed according to the IEC/IEEE Standard for Binary
4704| Floating-Point Arithmetic.
4705*----------------------------------------------------------------------------*/
4706
4707float128 floatx80_to_float128(floatx80 a, float_status *status)
4708{
4709    flag aSign;
4710    int aExp;
4711    uint64_t aSig, zSig0, zSig1;
4712
4713    if (floatx80_invalid_encoding(a)) {
4714        float_raise(float_flag_invalid, status);
4715        return float128_default_nan(status);
4716    }
4717    aSig = extractFloatx80Frac( a );
4718    aExp = extractFloatx80Exp( a );
4719    aSign = extractFloatx80Sign( a );
4720    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4721        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4722    }
4723    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4724    return packFloat128( aSign, aExp, zSig0, zSig1 );
4725
4726}
4727
4728/*----------------------------------------------------------------------------
4729| Rounds the extended double-precision floating-point value `a'
4730| to the precision provided by floatx80_rounding_precision and returns the
4731| result as an extended double-precision floating-point value.
4732| The operation is performed according to the IEC/IEEE Standard for Binary
4733| Floating-Point Arithmetic.
4734*----------------------------------------------------------------------------*/
4735
4736floatx80 floatx80_round(floatx80 a, float_status *status)
4737{
4738    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4739                                extractFloatx80Sign(a),
4740                                extractFloatx80Exp(a),
4741                                extractFloatx80Frac(a), 0, status);
4742}
4743
4744/*----------------------------------------------------------------------------
4745| Rounds the extended double-precision floating-point value `a' to an integer,
4746| and returns the result as an extended quadruple-precision floating-point
4747| value.  The operation is performed according to the IEC/IEEE Standard for
4748| Binary Floating-Point Arithmetic.
4749*----------------------------------------------------------------------------*/
4750
4751floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4752{
4753    flag aSign;
4754    int32_t aExp;
4755    uint64_t lastBitMask, roundBitsMask;
4756    floatx80 z;
4757
4758    if (floatx80_invalid_encoding(a)) {
4759        float_raise(float_flag_invalid, status);
4760        return floatx80_default_nan(status);
4761    }
4762    aExp = extractFloatx80Exp( a );
4763    if ( 0x403E <= aExp ) {
4764        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4765            return propagateFloatx80NaN(a, a, status);
4766        }
4767        return a;
4768    }
4769    if ( aExp < 0x3FFF ) {
4770        if (    ( aExp == 0 )
4771             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4772            return a;
4773        }
4774        status->float_exception_flags |= float_flag_inexact;
4775        aSign = extractFloatx80Sign( a );
4776        switch (status->float_rounding_mode) {
4777         case float_round_nearest_even:
4778            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4779               ) {
4780                return
4781                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4782            }
4783            break;
4784        case float_round_ties_away:
4785            if (aExp == 0x3FFE) {
4786                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4787            }
4788            break;
4789         case float_round_down:
4790            return
4791                  aSign ?
4792                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4793                : packFloatx80( 0, 0, 0 );
4794         case float_round_up:
4795            return
4796                  aSign ? packFloatx80( 1, 0, 0 )
4797                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4798        }
4799        return packFloatx80( aSign, 0, 0 );
4800    }
4801    lastBitMask = 1;
4802    lastBitMask <<= 0x403E - aExp;
4803    roundBitsMask = lastBitMask - 1;
4804    z = a;
4805    switch (status->float_rounding_mode) {
4806    case float_round_nearest_even:
4807        z.low += lastBitMask>>1;
4808        if ((z.low & roundBitsMask) == 0) {
4809            z.low &= ~lastBitMask;
4810        }
4811        break;
4812    case float_round_ties_away:
4813        z.low += lastBitMask >> 1;
4814        break;
4815    case float_round_to_zero:
4816        break;
4817    case float_round_up:
4818        if (!extractFloatx80Sign(z)) {
4819            z.low += roundBitsMask;
4820        }
4821        break;
4822    case float_round_down:
4823        if (extractFloatx80Sign(z)) {
4824            z.low += roundBitsMask;
4825        }
4826        break;
4827    default:
4828        abort();
4829    }
4830    z.low &= ~ roundBitsMask;
4831    if ( z.low == 0 ) {
4832        ++z.high;
4833        z.low = LIT64( 0x8000000000000000 );
4834    }
4835    if (z.low != a.low) {
4836        status->float_exception_flags |= float_flag_inexact;
4837    }
4838    return z;
4839
4840}
4841
4842/*----------------------------------------------------------------------------
4843| Returns the result of adding the absolute values of the extended double-
4844| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4845| negated before being returned.  `zSign' is ignored if the result is a NaN.
4846| The addition is performed according to the IEC/IEEE Standard for Binary
4847| Floating-Point Arithmetic.
4848*----------------------------------------------------------------------------*/
4849
4850static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4851                                float_status *status)
4852{
4853    int32_t aExp, bExp, zExp;
4854    uint64_t aSig, bSig, zSig0, zSig1;
4855    int32_t expDiff;
4856
4857    aSig = extractFloatx80Frac( a );
4858    aExp = extractFloatx80Exp( a );
4859    bSig = extractFloatx80Frac( b );
4860    bExp = extractFloatx80Exp( b );
4861    expDiff = aExp - bExp;
4862    if ( 0 < expDiff ) {
4863        if ( aExp == 0x7FFF ) {
4864            if ((uint64_t)(aSig << 1)) {
4865                return propagateFloatx80NaN(a, b, status);
4866            }
4867            return a;
4868        }
4869        if ( bExp == 0 ) --expDiff;
4870        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4871        zExp = aExp;
4872    }
4873    else if ( expDiff < 0 ) {
4874        if ( bExp == 0x7FFF ) {
4875            if ((uint64_t)(bSig << 1)) {
4876                return propagateFloatx80NaN(a, b, status);
4877            }
4878            return packFloatx80(zSign,
4879                                floatx80_infinity_high,
4880                                floatx80_infinity_low);
4881        }
4882        if ( aExp == 0 ) ++expDiff;
4883        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4884        zExp = bExp;
4885    }
4886    else {
4887        if ( aExp == 0x7FFF ) {
4888            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4889                return propagateFloatx80NaN(a, b, status);
4890            }
4891            return a;
4892        }
4893        zSig1 = 0;
4894        zSig0 = aSig + bSig;
4895        if ( aExp == 0 ) {
4896            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4897            goto roundAndPack;
4898        }
4899        zExp = aExp;
4900        goto shiftRight1;
4901    }
4902    zSig0 = aSig + bSig;
4903    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4904 shiftRight1:
4905    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4906    zSig0 |= LIT64( 0x8000000000000000 );
4907    ++zExp;
4908 roundAndPack:
4909    return roundAndPackFloatx80(status->floatx80_rounding_precision,
4910                                zSign, zExp, zSig0, zSig1, status);
4911}
4912
4913/*----------------------------------------------------------------------------
4914| Returns the result of subtracting the absolute values of the extended
4915| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4916| difference is negated before being returned.  `zSign' is ignored if the
4917| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4918| Standard for Binary Floating-Point Arithmetic.
4919*----------------------------------------------------------------------------*/
4920
4921static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4922                                float_status *status)
4923{
4924    int32_t aExp, bExp, zExp;
4925    uint64_t aSig, bSig, zSig0, zSig1;
4926    int32_t expDiff;
4927
4928    aSig = extractFloatx80Frac( a );
4929    aExp = extractFloatx80Exp( a );
4930    bSig = extractFloatx80Frac( b );
4931    bExp = extractFloatx80Exp( b );
4932    expDiff = aExp - bExp;
4933    if ( 0 < expDiff ) goto aExpBigger;
4934    if ( expDiff < 0 ) goto bExpBigger;
4935    if ( aExp == 0x7FFF ) {
4936        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4937            return propagateFloatx80NaN(a, b, status);
4938        }
4939        float_raise(float_flag_invalid, status);
4940        return floatx80_default_nan(status);
4941    }
4942    if ( aExp == 0 ) {
4943        aExp = 1;
4944        bExp = 1;
4945    }
4946    zSig1 = 0;
4947    if ( bSig < aSig ) goto aBigger;
4948    if ( aSig < bSig ) goto bBigger;
4949    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4950 bExpBigger:
4951    if ( bExp == 0x7FFF ) {
4952        if ((uint64_t)(bSig << 1)) {
4953            return propagateFloatx80NaN(a, b, status);
4954        }
4955        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4956                            floatx80_infinity_low);
4957    }
4958    if ( aExp == 0 ) ++expDiff;
4959    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4960 bBigger:
4961    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4962    zExp = bExp;
4963    zSign ^= 1;
4964    goto normalizeRoundAndPack;
4965 aExpBigger:
4966    if ( aExp == 0x7FFF ) {
4967        if ((uint64_t)(aSig << 1)) {
4968            return propagateFloatx80NaN(a, b, status);
4969        }
4970        return a;
4971    }
4972    if ( bExp == 0 ) --expDiff;
4973    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4974 aBigger:
4975    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4976    zExp = aExp;
4977 normalizeRoundAndPack:
4978    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4979                                         zSign, zExp, zSig0, zSig1, status);
4980}
4981
4982/*----------------------------------------------------------------------------
4983| Returns the result of adding the extended double-precision floating-point
4984| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4985| Standard for Binary Floating-Point Arithmetic.
4986*----------------------------------------------------------------------------*/
4987
4988floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4989{
4990    flag aSign, bSign;
4991
4992    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4993        float_raise(float_flag_invalid, status);
4994        return floatx80_default_nan(status);
4995    }
4996    aSign = extractFloatx80Sign( a );
4997    bSign = extractFloatx80Sign( b );
4998    if ( aSign == bSign ) {
4999        return addFloatx80Sigs(a, b, aSign, status);
5000    }

5001    else {
5002        return subFloatx80Sigs(a, b, aSign, status);
5003    }
5004
5005}
5006
5007/*----------------------------------------------------------------------------
5008| Returns the result of subtracting the extended double-precision floating-
5009| point values `a' and `b'.  The operation is performed according to the
5010| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011*----------------------------------------------------------------------------*/
5012
5013floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5014{
5015    flag aSign, bSign;
5016
5017    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5018        float_raise(float_flag_invalid, status);
5019        return floatx80_default_nan(status);
5020    }
5021    aSign = extractFloatx80Sign( a );
5022    bSign = extractFloatx80Sign( b );
5023    if ( aSign == bSign ) {
5024        return subFloatx80Sigs(a, b, aSign, status);
5025    }
5026    else {
5027        return addFloatx80Sigs(a, b, aSign, status);
5028    }
5029
5030}
5031
5032/*----------------------------------------------------------------------------
5033| Returns the result of multiplying the extended double-precision floating-
5034| point values `a' and `b'.  The operation is performed according to the
5035| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5036*----------------------------------------------------------------------------*/
5037
5038floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5039{
5040    flag aSign, bSign, zSign;
5041    int32_t aExp, bExp, zExp;
5042    uint64_t aSig, bSig, zSig0, zSig1;
5043
5044    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5045        float_raise(float_flag_invalid, status);
5046        return floatx80_default_nan(status);
5047    }
5048    aSig = extractFloatx80Frac( a );
5049    aExp = extractFloatx80Exp( a );
5050    aSign = extractFloatx80Sign( a );
5051    bSig = extractFloatx80Frac( b );
5052    bExp = extractFloatx80Exp( b );
5053    bSign = extractFloatx80Sign( b );
5054    zSign = aSign ^ bSign;
5055    if ( aExp == 0x7FFF ) {
5056        if (    (uint64_t) ( aSig<<1 )
5057             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5058            return propagateFloatx80NaN(a, b, status);
5059        }
5060        if ( ( bExp | bSig ) == 0 ) goto invalid;
5061        return packFloatx80(zSign, floatx80_infinity_high,
5062                                   floatx80_infinity_low);
5063    }
5064    if ( bExp == 0x7FFF ) {
5065        if ((uint64_t)(bSig << 1)) {
5066            return propagateFloatx80NaN(a, b, status);
5067        }
5068        if ( ( aExp | aSig ) == 0 ) {
5069 invalid:
5070            float_raise(float_flag_invalid, status);
5071            return floatx80_default_nan(status);
5072        }
5073        return packFloatx80(zSign, floatx80_infinity_high,
5074                                   floatx80_infinity_low);
5075    }
5076    if ( aExp == 0 ) {
5077        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5078        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5079    }
5080    if ( bExp == 0 ) {
5081        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5082        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5083    }
5084    zExp = aExp + bExp - 0x3FFE;
5085    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5086    if ( 0 < (int64_t) zSig0 ) {
5087        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5088        --zExp;
5089    }
5090    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5091                                zSign, zExp, zSig0, zSig1, status);
5092}
5093
5094/*----------------------------------------------------------------------------
5095| Returns the result of dividing the extended double-precision floating-point
5096| value `a' by the corresponding value `b'.  The operation is performed
5097| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098*----------------------------------------------------------------------------*/
5099
5100floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5101{
5102    flag aSign, bSign, zSign;
5103    int32_t aExp, bExp, zExp;
5104    uint64_t aSig, bSig, zSig0, zSig1;
5105    uint64_t rem0, rem1, rem2, term0, term1, term2;
5106
5107    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5108        float_raise(float_flag_invalid, status);
5109        return floatx80_default_nan(status);
5110    }
5111    aSig = extractFloatx80Frac( a );
5112    aExp = extractFloatx80Exp( a );
5113    aSign = extractFloatx80Sign( a );
5114    bSig = extractFloatx80Frac( b );
5115    bExp = extractFloatx80Exp( b );
5116    bSign = extractFloatx80Sign( b );
5117    zSign = aSign ^ bSign;
5118    if ( aExp == 0x7FFF ) {
5119        if ((uint64_t)(aSig << 1)) {
5120            return propagateFloatx80NaN(a, b, status);
5121        }
5122        if ( bExp == 0x7FFF ) {
5123            if ((uint64_t)(bSig << 1)) {
5124                return propagateFloatx80NaN(a, b, status);
5125            }
5126            goto invalid;
5127        }
5128        return packFloatx80(zSign, floatx80_infinity_high,
5129                                   floatx80_infinity_low);
5130    }
5131    if ( bExp == 0x7FFF ) {
5132        if ((uint64_t)(bSig << 1)) {
5133            return propagateFloatx80NaN(a, b, status);
5134        }
5135        return packFloatx80( zSign, 0, 0 );
5136    }
5137    if ( bExp == 0 ) {
5138        if ( bSig == 0 ) {
5139            if ( ( aExp | aSig ) == 0 ) {
5140 invalid:
5141                float_raise(float_flag_invalid, status);
5142                return floatx80_default_nan(status);
5143            }
5144            float_raise(float_flag_divbyzero, status);
5145            return packFloatx80(zSign, floatx80_infinity_high,
5146                                       floatx80_infinity_low);
5147        }
5148        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5149    }
5150    if ( aExp == 0 ) {
5151        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5152        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5153    }
5154    zExp = aExp - bExp + 0x3FFE;
5155    rem1 = 0;
5156    if ( bSig <= aSig ) {
5157        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5158        ++zExp;
5159    }
5160    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5161    mul64To128( bSig, zSig0, &term0, &term1 );
5162    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5163    while ( (int64_t) rem0 < 0 ) {
5164        --zSig0;
5165        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5166    }
5167    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5168    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5169        mul64To128( bSig, zSig1, &term1, &term2 );
5170        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5171        while ( (int64_t) rem1 < 0 ) {
5172            --zSig1;
5173            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5174        }
5175        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5176    }
5177    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5178                                zSign, zExp, zSig0, zSig1, status);
5179}
5180
5181/*----------------------------------------------------------------------------
5182| Returns the remainder of the extended double-precision floating-point value
5183| `a' with respect to the corresponding value `b'.  The operation is performed
5184| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5185*----------------------------------------------------------------------------*/
5186
5187floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5188{
5189    flag aSign, zSign;
5190    int32_t aExp, bExp, expDiff;
5191    uint64_t aSig0, aSig1, bSig;
5192    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5193
5194    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5195        float_raise(float_flag_invalid, status);
5196        return floatx80_default_nan(status);
5197    }
5198    aSig0 = extractFloatx80Frac( a );
5199    aExp = extractFloatx80Exp( a );
5200    aSign = extractFloatx80Sign( a );
5201    bSig = extractFloatx80Frac( b );
5202    bExp = extractFloatx80Exp( b );
5203    if ( aExp == 0x7FFF ) {
5204        if (    (uint64_t) ( aSig0<<1 )
5205             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5206            return propagateFloatx80NaN(a, b, status);
5207        }
5208        goto invalid;
5209    }
5210    if ( bExp == 0x7FFF ) {
5211        if ((uint64_t)(bSig << 1)) {
5212            return propagateFloatx80NaN(a, b, status);
5213        }
5214        return a;
5215    }
5216    if ( bExp == 0 ) {
5217        if ( bSig == 0 ) {
5218 invalid:
5219            float_raise(float_flag_invalid, status);
5220            return floatx80_default_nan(status);
5221        }
5222        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5223    }
5224    if ( aExp == 0 ) {
5225        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5226        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5227    }
5228    bSig |= LIT64( 0x8000000000000000 );
5229    zSign = aSign;
5230    expDiff = aExp - bExp;
5231    aSig1 = 0;
5232    if ( expDiff < 0 ) {
5233        if ( expDiff < -1 ) return a;
5234        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5235        expDiff = 0;
5236    }
5237    q = ( bSig <= aSig0 );
5238    if ( q ) aSig0 -= bSig;
5239    expDiff -= 64;
5240    while ( 0 < expDiff ) {
5241        q = estimateDiv128To64( aSig0, aSig1, bSig );
5242        q = ( 2 < q ) ? q - 2 : 0;
5243        mul64To128( bSig, q, &term0, &term1 );
5244        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5246        expDiff -= 62;
5247    }
5248    expDiff += 64;
5249    if ( 0 < expDiff ) {
5250        q = estimateDiv128To64( aSig0, aSig1, bSig );
5251        q = ( 2 < q ) ? q - 2 : 0;
5252        q >>= 64 - expDiff;
5253        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5254        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5255        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5256        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5257            ++q;
5258            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5259        }
5260    }
5261    else {
5262        term1 = 0;
5263        term0 = bSig;
5264    }
5265    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5266    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5267         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5268              && ( q & 1 ) )
5269       ) {
5270        aSig0 = alternateASig0;
5271        aSig1 = alternateASig1;
5272        zSign = ! zSign;
5273    }
5274    return
5275        normalizeRoundAndPackFloatx80(
5276            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5277
5278}
5279
5280/*----------------------------------------------------------------------------
5281| Returns the square root of the extended double-precision floating-point
5282| value `a'.  The operation is performed according to the IEC/IEEE Standard
5283| for Binary Floating-Point Arithmetic.
5284*----------------------------------------------------------------------------*/
5285
5286floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5287{
5288    flag aSign;
5289    int32_t aExp, zExp;
5290    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5291    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5292
5293    if (floatx80_invalid_encoding(a)) {
5294        float_raise(float_flag_invalid, status);
5295        return floatx80_default_nan(status);
5296    }
5297    aSig0 = extractFloatx80Frac( a );
5298    aExp = extractFloatx80Exp( a );
5299    aSign = extractFloatx80Sign( a );
5300    if ( aExp == 0x7FFF ) {
5301        if ((uint64_t)(aSig0 << 1)) {
5302            return propagateFloatx80NaN(a, a, status);
5303        }
5304        if ( ! aSign ) return a;
5305        goto invalid;
5306    }
5307    if ( aSign ) {
5308        if ( ( aExp | aSig0 ) == 0 ) return a;
5309 invalid:
5310        float_raise(float_flag_invalid, status);
5311        return floatx80_default_nan(status);
5312    }
5313    if ( aExp == 0 ) {
5314        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5315        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5316    }
5317    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5318    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5319    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5320    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5321    doubleZSig0 = zSig0<<1;
5322    mul64To128( zSig0, zSig0, &term0, &term1 );
5323    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5324    while ( (int64_t) rem0 < 0 ) {
5325        --zSig0;
5326        doubleZSig0 -= 2;
5327        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5328    }
5329    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5330    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5331        if ( zSig1 == 0 ) zSig1 = 1;
5332        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5333        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5334        mul64To128( zSig1, zSig1, &term2, &term3 );
5335        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5336        while ( (int64_t) rem1 < 0 ) {
5337            --zSig1;
5338            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5339            term3 |= 1;
5340            term2 |= doubleZSig0;
5341            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5342        }
5343        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5344    }
5345    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5346    zSig0 |= doubleZSig0;
5347    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5348                                0, zExp, zSig0, zSig1, status);
5349}
5350
5351/*----------------------------------------------------------------------------
5352| Returns 1 if the extended double-precision floating-point value `a' is equal
5353| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5354| raised if either operand is a NaN.  Otherwise, the comparison is performed
5355| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5356*----------------------------------------------------------------------------*/
5357
5358int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5359{
5360
5361    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5362        || (extractFloatx80Exp(a) == 0x7FFF
5363            && (uint64_t) (extractFloatx80Frac(a) << 1))
5364        || (extractFloatx80Exp(b) == 0x7FFF
5365            && (uint64_t) (extractFloatx80Frac(b) << 1))
5366       ) {
5367        float_raise(float_flag_invalid, status);
5368        return 0;
5369    }
5370    return
5371           ( a.low == b.low )
5372        && (    ( a.high == b.high )
5373             || (    ( a.low == 0 )
5374                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5375           );
5376
5377}
5378
5379/*----------------------------------------------------------------------------
5380| Returns 1 if the extended double-precision floating-point value `a' is
5381| less than or equal to the corresponding value `b', and 0 otherwise.  The
5382| invalid exception is raised if either operand is a NaN.  The comparison is
5383| performed according to the IEC/IEEE Standard for Binary Floating-Point
5384| Arithmetic.
5385*----------------------------------------------------------------------------*/
5386
5387int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5388{
5389    flag aSign, bSign;
5390
5391    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5392        || (extractFloatx80Exp(a) == 0x7FFF
5393            && (uint64_t) (extractFloatx80Frac(a) << 1))
5394        || (extractFloatx80Exp(b) == 0x7FFF
5395            && (uint64_t) (extractFloatx80Frac(b) << 1))
5396       ) {
5397        float_raise(float_flag_invalid, status);
5398        return 0;
5399    }
5400    aSign = extractFloatx80Sign( a );
5401    bSign = extractFloatx80Sign( b );
5402    if ( aSign != bSign ) {
5403        return
5404               aSign
5405            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5406                 == 0 );
5407    }
5408    return
5409          aSign ? le128( b.high, b.low, a.high, a.low )
5410        : le128( a.high, a.low, b.high, b.low );
5411
5412}
5413
5414/*----------------------------------------------------------------------------
5415| Returns 1 if the extended double-precision floating-point value `a' is
5416| less than the corresponding value `b', and 0 otherwise.  The invalid
5417| exception is raised if either operand is a NaN.  The comparison is performed
5418| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5419*----------------------------------------------------------------------------*/
5420
5421int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5422{
5423    flag aSign, bSign;
5424
5425    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5426        || (extractFloatx80Exp(a) == 0x7FFF
5427            && (uint64_t) (extractFloatx80Frac(a) << 1))
5428        || (extractFloatx80Exp(b) == 0x7FFF
5429            && (uint64_t) (extractFloatx80Frac(b) << 1))
5430       ) {
5431        float_raise(float_flag_invalid, status);
5432        return 0;
5433    }
5434    aSign = extractFloatx80Sign( a );
5435    bSign = extractFloatx80Sign( b );
5436    if ( aSign != bSign ) {
5437        return
5438               aSign
5439            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5440                 != 0 );
5441    }
5442    return
5443          aSign ? lt128( b.high, b.low, a.high, a.low )
5444        : lt128( a.high, a.low, b.high, b.low );
5445
5446}
5447
5448/*----------------------------------------------------------------------------
5449| Returns 1 if the extended double-precision floating-point values `a' and `b'
5450| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5451| either operand is a NaN.   The comparison is performed according to the
5452| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453*----------------------------------------------------------------------------*/
5454int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5455{
5456    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5457        || (extractFloatx80Exp(a) == 0x7FFF
5458            && (uint64_t) (extractFloatx80Frac(a) << 1))
5459        || (extractFloatx80Exp(b) == 0x7FFF
5460            && (uint64_t) (extractFloatx80Frac(b) << 1))
5461       ) {
5462        float_raise(float_flag_invalid, status);
5463        return 1;
5464    }
5465    return 0;
5466}
5467
5468/*----------------------------------------------------------------------------
5469| Returns 1 if the extended double-precision floating-point value `a' is
5470| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5471| cause an exception.  The comparison is performed according to the IEC/IEEE
5472| Standard for Binary Floating-Point Arithmetic.
5473*----------------------------------------------------------------------------*/
5474
5475int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5476{
5477
5478    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5479        float_raise(float_flag_invalid, status);
5480        return 0;
5481    }
5482    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5483              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5485              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5486       ) {
5487        if (floatx80_is_signaling_nan(a, status)
5488         || floatx80_is_signaling_nan(b, status)) {
5489            float_raise(float_flag_invalid, status);
5490        }
5491        return 0;
5492    }
5493    return
5494           ( a.low == b.low )
5495        && (    ( a.high == b.high )
5496             || (    ( a.low == 0 )
5497                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5498           );
5499
5500}
5501
5502/*----------------------------------------------------------------------------
5503| Returns 1 if the extended double-precision floating-point value `a' is less
5504| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5505| do not cause an exception.  Otherwise, the comparison is performed according
5506| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5507*----------------------------------------------------------------------------*/
5508
5509int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5510{
5511    flag aSign, bSign;
5512
5513    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5514        float_raise(float_flag_invalid, status);
5515        return 0;
5516    }
5517    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5518              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5519         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5520              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5521       ) {
5522        if (floatx80_is_signaling_nan(a, status)
5523         || floatx80_is_signaling_nan(b, status)) {
5524            float_raise(float_flag_invalid, status);
5525        }
5526        return 0;
5527    }
5528    aSign = extractFloatx80Sign( a );
5529    bSign = extractFloatx80Sign( b );
5530    if ( aSign != bSign ) {
5531        return
5532               aSign
5533            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5534                 == 0 );
5535    }
5536    return
5537          aSign ? le128( b.high, b.low, a.high, a.low )
5538        : le128( a.high, a.low, b.high, b.low );
5539
5540}
5541
5542/*----------------------------------------------------------------------------
5543| Returns 1 if the extended double-precision floating-point value `a' is less
5544| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5545| an exception.  Otherwise, the comparison is performed according to the
5546| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5547*----------------------------------------------------------------------------*/
5548
5549int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5550{
5551    flag aSign, bSign;
5552
5553    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5554        float_raise(float_flag_invalid, status);
5555        return 0;
5556    }
5557    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5558              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5559         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5560              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5561       ) {
5562        if (floatx80_is_signaling_nan(a, status)
5563         || floatx80_is_signaling_nan(b, status)) {
5564            float_raise(float_flag_invalid, status);
5565        }
5566        return 0;
5567    }
5568    aSign = extractFloatx80Sign( a );
5569    bSign = extractFloatx80Sign( b );
5570    if ( aSign != bSign ) {
5571        return
5572               aSign
5573            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574                 != 0 );
5575    }
5576    return
5577          aSign ? lt128( b.high, b.low, a.high, a.low )
5578        : lt128( a.high, a.low, b.high, b.low );
5579
5580}
5581
5582/*----------------------------------------------------------------------------
5583| Returns 1 if the extended double-precision floating-point values `a' and `b'
5584| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5585| The comparison is performed according to the IEC/IEEE Standard for Binary
5586| Floating-Point Arithmetic.
5587*----------------------------------------------------------------------------*/
5588int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5589{
5590    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5591        float_raise(float_flag_invalid, status);
5592        return 1;
5593    }
5594    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5595              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5596         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5597              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5598       ) {
5599        if (floatx80_is_signaling_nan(a, status)
5600         || floatx80_is_signaling_nan(b, status)) {
5601            float_raise(float_flag_invalid, status);
5602        }
5603        return 1;
5604    }
5605    return 0;
5606}
5607
5608/*----------------------------------------------------------------------------
5609| Returns the result of converting the quadruple-precision floating-point
5610| value `a' to the 32-bit two's complement integer format.  The conversion
5611| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5612| Arithmetic---which means in particular that the conversion is rounded
5613| according to the current rounding mode.  If `a' is a NaN, the largest
5614| positive integer is returned.  Otherwise, if the conversion overflows, the
5615| largest integer with the same sign as `a' is returned.
5616*----------------------------------------------------------------------------*/
5617
5618int32_t float128_to_int32(float128 a, float_status *status)
5619{
5620    flag aSign;
5621    int32_t aExp, shiftCount;
5622    uint64_t aSig0, aSig1;
5623
5624    aSig1 = extractFloat128Frac1( a );
5625    aSig0 = extractFloat128Frac0( a );
5626    aExp = extractFloat128Exp( a );
5627    aSign = extractFloat128Sign( a );
5628    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5629    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5630    aSig0 |= ( aSig1 != 0 );
5631    shiftCount = 0x4028 - aExp;
5632    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5633    return roundAndPackInt32(aSign, aSig0, status);
5634
5635}
5636
5637/*----------------------------------------------------------------------------
5638| Returns the result of converting the quadruple-precision floating-point
5639| value `a' to the 32-bit two's complement integer format.  The conversion
5640| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5641| Arithmetic, except that the conversion is always rounded toward zero.  If
5642| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5643| conversion overflows, the largest integer with the same sign as `a' is
5644| returned.
5645*----------------------------------------------------------------------------*/
5646
5647int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5648{
5649    flag aSign;
5650    int32_t aExp, shiftCount;
5651    uint64_t aSig0, aSig1, savedASig;
5652    int32_t z;
5653
5654    aSig1 = extractFloat128Frac1( a );
5655    aSig0 = extractFloat128Frac0( a );
5656    aExp = extractFloat128Exp( a );
5657    aSign = extractFloat128Sign( a );
5658    aSig0 |= ( aSig1 != 0 );
5659    if ( 0x401E < aExp ) {
5660        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5661        goto invalid;
5662    }
5663    else if ( aExp < 0x3FFF ) {
5664        if (aExp || aSig0) {
5665            status->float_exception_flags |= float_flag_inexact;
5666        }
5667        return 0;
5668    }
5669    aSig0 |= LIT64( 0x0001000000000000 );
5670    shiftCount = 0x402F - aExp;
5671    savedASig = aSig0;
5672    aSig0 >>= shiftCount;
5673    z = aSig0;
5674    if ( aSign ) z = - z;
5675    if ( ( z < 0 ) ^ aSign ) {
5676 invalid:
5677        float_raise(float_flag_invalid, status);
5678        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5679    }
5680    if ( ( aSig0<<shiftCount ) != savedASig ) {
5681        status->float_exception_flags |= float_flag_inexact;
5682    }
5683    return z;
5684
5685}
5686
5687/*----------------------------------------------------------------------------
5688| Returns the result of converting the quadruple-precision floating-point
5689| value `a' to the 64-bit two's complement integer format.  The conversion
5690| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5691| Arithmetic---which means in particular that the conversion is rounded
5692| according to the current rounding mode.  If `a' is a NaN, the largest
5693| positive integer is returned.  Otherwise, if the conversion overflows, the
5694| largest integer with the same sign as `a' is returned.
5695*----------------------------------------------------------------------------*/
5696
5697int64_t float128_to_int64(float128 a, float_status *status)
5698{
5699    flag aSign;
5700    int32_t aExp, shiftCount;
5701    uint64_t aSig0, aSig1;
5702
5703    aSig1 = extractFloat128Frac1( a );
5704    aSig0 = extractFloat128Frac0( a );
5705    aExp = extractFloat128Exp( a );
5706    aSign = extractFloat128Sign( a );
5707    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5708    shiftCount = 0x402F - aExp;
5709    if ( shiftCount <= 0 ) {
5710        if ( 0x403E < aExp ) {
5711            float_raise(float_flag_invalid, status);
5712            if (    ! aSign
5713                 || (    ( aExp == 0x7FFF )
5714                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5715                    )
5716               ) {
5717                return LIT64( 0x7FFFFFFFFFFFFFFF );
5718            }
5719            return (int64_t) LIT64( 0x8000000000000000 );
5720        }
5721        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5722    }
5723    else {
5724        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5725    }
5726    return roundAndPackInt64(aSign, aSig0, aSig1, status);
5727
5728}
5729
5730/*----------------------------------------------------------------------------
5731| Returns the result of converting the quadruple-precision floating-point
5732| value `a' to the 64-bit two's complement integer format.  The conversion
5733| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5734| Arithmetic, except that the conversion is always rounded toward zero.
5735| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5736| the conversion overflows, the largest integer with the same sign as `a' is
5737| returned.
5738*----------------------------------------------------------------------------*/
5739
5740int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5741{
5742    flag aSign;
5743    int32_t aExp, shiftCount;
5744    uint64_t aSig0, aSig1;
5745    int64_t z;
5746
5747    aSig1 = extractFloat128Frac1( a );
5748    aSig0 = extractFloat128Frac0( a );
5749    aExp = extractFloat128Exp( a );
5750    aSign = extractFloat128Sign( a );
5751    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5752    shiftCount = aExp - 0x402F;
5753    if ( 0 < shiftCount ) {
5754        if ( 0x403E <= aExp ) {
5755            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5756            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5757                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5758                if (aSig1) {
5759                    status->float_exception_flags |= float_flag_inexact;
5760                }
5761            }
5762            else {
5763                float_raise(float_flag_invalid, status);
5764                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5765                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5766                }
5767            }
5768            return (int64_t) LIT64( 0x8000000000000000 );
5769        }
5770        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5771        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5772            status->float_exception_flags |= float_flag_inexact;
5773        }
5774    }
5775    else {
5776        if ( aExp < 0x3FFF ) {
5777            if ( aExp | aSig0 | aSig1 ) {
5778                status->float_exception_flags |= float_flag_inexact;
5779            }
5780            return 0;
5781        }
5782        z = aSig0>>( - shiftCount );
5783        if (    aSig1
5784             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5785            status->float_exception_flags |= float_flag_inexact;
5786        }
5787    }
5788    if ( aSign ) z = - z;
5789    return z;
5790
5791}
5792
5793/*----------------------------------------------------------------------------
5794| Returns the result of converting the quadruple-precision floating-point value
5795| `a' to the 64-bit unsigned integer format.  The conversion is
5796| performed according to the IEC/IEEE Standard for Binary Floating-Point
5797| Arithmetic---which means in particular that the conversion is rounded
5798| according to the current rounding mode.  If `a' is a NaN, the largest
5799| positive integer is returned.  If the conversion overflows, the
5800| largest unsigned integer is returned.  If 'a' is negative, the value is
5801| rounded and zero is returned; negative values that do not round to zero
5802| will raise the inexact exception.
5803*----------------------------------------------------------------------------*/
5804
5805uint64_t float128_to_uint64(float128 a, float_status *status)
5806{
5807    flag aSign;
5808    int aExp;
5809    int shiftCount;
5810    uint64_t aSig0, aSig1;
5811
5812    aSig0 = extractFloat128Frac0(a);
5813    aSig1 = extractFloat128Frac1(a);
5814    aExp = extractFloat128Exp(a);
5815    aSign = extractFloat128Sign(a);
5816    if (aSign && (aExp > 0x3FFE)) {
5817        float_raise(float_flag_invalid, status);
5818        if (float128_is_any_nan(a)) {
5819            return LIT64(0xFFFFFFFFFFFFFFFF);
5820        } else {
5821            return 0;
5822        }
5823    }
5824    if (aExp) {
5825        aSig0 |= LIT64(0x0001000000000000);
5826    }
5827    shiftCount = 0x402F - aExp;
5828    if (shiftCount <= 0) {
5829        if (0x403E < aExp) {
5830            float_raise(float_flag_invalid, status);
5831            return LIT64(0xFFFFFFFFFFFFFFFF);
5832        }
5833        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5834    } else {
5835        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5836    }
5837    return roundAndPackUint64(aSign, aSig0, aSig1, status);
5838}
5839
5840uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5841{
5842    uint64_t v;
5843    signed char current_rounding_mode = status->float_rounding_mode;
5844
5845    set_float_rounding_mode(float_round_to_zero, status);
5846    v = float128_to_uint64(a, status);
5847    set_float_rounding_mode(current_rounding_mode, status);
5848
5849    return v;
5850}
5851
5852/*----------------------------------------------------------------------------
5853| Returns the result of converting the quadruple-precision floating-point
5854| value `a' to the 32-bit unsigned integer format.  The conversion
5855| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856| Arithmetic except that the conversion is always rounded toward zero.
5857| If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5858| if the conversion overflows, the largest unsigned integer is returned.
5859| If 'a' is negative, the value is rounded and zero is returned; negative
5860| values that do not round to zero will raise the inexact exception.
5861*----------------------------------------------------------------------------*/
5862
5863uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5864{
5865    uint64_t v;
5866    uint32_t res;
5867    int old_exc_flags = get_float_exception_flags(status);
5868
5869    v = float128_to_uint64_round_to_zero(a, status);
5870    if (v > 0xffffffff) {
5871        res = 0xffffffff;
5872    } else {
5873        return v;
5874    }
5875    set_float_exception_flags(old_exc_flags, status);
5876    float_raise(float_flag_invalid, status);
5877    return res;
5878}
5879
5880/*----------------------------------------------------------------------------
5881| Returns the result of converting the quadruple-precision floating-point
5882| value `a' to the single-precision floating-point format.  The conversion
5883| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5884| Arithmetic.
5885*----------------------------------------------------------------------------*/
5886
5887float32 float128_to_float32(float128 a, float_status *status)
5888{
5889    flag aSign;
5890    int32_t aExp;
5891    uint64_t aSig0, aSig1;
5892    uint32_t zSig;
5893
5894    aSig1 = extractFloat128Frac1( a );
5895    aSig0 = extractFloat128Frac0( a );
5896    aExp = extractFloat128Exp( a );
5897    aSign = extractFloat128Sign( a );
5898    if ( aExp == 0x7FFF ) {
5899        if ( aSig0 | aSig1 ) {
5900            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5901        }
5902        return packFloat32( aSign, 0xFF, 0 );
5903    }
5904    aSig0 |= ( aSig1 != 0 );
5905    shift64RightJamming( aSig0, 18, &aSig0 );
5906    zSig = aSig0;
5907    if ( aExp || zSig ) {
5908        zSig |= 0x40000000;
5909        aExp -= 0x3F81;
5910    }
5911    return roundAndPackFloat32(aSign, aExp, zSig, status);
5912
5913}
5914
5915/*----------------------------------------------------------------------------
5916| Returns the result of converting the quadruple-precision floating-point
5917| value `a' to the double-precision floating-point format.  The conversion
5918| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5919| Arithmetic.
5920*----------------------------------------------------------------------------*/
5921
5922float64 float128_to_float64(float128 a, float_status *status)
5923{
5924    flag aSign;
5925    int32_t aExp;
5926    uint64_t aSig0, aSig1;
5927
5928    aSig1 = extractFloat128Frac1( a );
5929    aSig0 = extractFloat128Frac0( a );
5930    aExp = extractFloat128Exp( a );
5931    aSign = extractFloat128Sign( a );
5932    if ( aExp == 0x7FFF ) {
5933        if ( aSig0 | aSig1 ) {
5934            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5935        }
5936        return packFloat64( aSign, 0x7FF, 0 );
5937    }
5938    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5939    aSig0 |= ( aSig1 != 0 );
5940    if ( aExp || aSig0 ) {
5941        aSig0 |= LIT64( 0x4000000000000000 );
5942        aExp -= 0x3C01;
5943    }
5944    return roundAndPackFloat64(aSign, aExp, aSig0, status);
5945
5946}
5947
5948/*----------------------------------------------------------------------------
5949| Returns the result of converting the quadruple-precision floating-point
5950| value `a' to the extended double-precision floating-point format.  The
5951| conversion is performed according to the IEC/IEEE Standard for Binary
5952| Floating-Point Arithmetic.
5953*----------------------------------------------------------------------------*/
5954
5955floatx80 float128_to_floatx80(float128 a, float_status *status)
5956{
5957    flag aSign;
5958    int32_t aExp;
5959    uint64_t aSig0, aSig1;
5960
5961    aSig1 = extractFloat128Frac1( a );
5962    aSig0 = extractFloat128Frac0( a );
5963    aExp = extractFloat128Exp( a );
5964    aSign = extractFloat128Sign( a );
5965    if ( aExp == 0x7FFF ) {
5966        if ( aSig0 | aSig1 ) {
5967            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5968        }
5969        return packFloatx80(aSign, floatx80_infinity_high,
5970                                   floatx80_infinity_low);
5971    }
5972    if ( aExp == 0 ) {
5973        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5974        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5975    }
5976    else {
5977        aSig0 |= LIT64( 0x0001000000000000 );
5978    }
5979    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5980    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5981
5982}
5983
5984/*----------------------------------------------------------------------------
5985| Rounds the quadruple-precision floating-point value `a' to an integer, and
5986| returns the result as a quadruple-precision floating-point value.  The
5987| operation is performed according to the IEC/IEEE Standard for Binary
5988| Floating-Point Arithmetic.
5989*----------------------------------------------------------------------------*/
5990
5991float128 float128_round_to_int(float128 a, float_status *status)
5992{
5993    flag aSign;
5994    int32_t aExp;
5995    uint64_t lastBitMask, roundBitsMask;
5996    float128 z;
5997
5998    aExp = extractFloat128Exp( a );
5999    if ( 0x402F <= aExp ) {
6000        if ( 0x406F <= aExp ) {

6001            if (    ( aExp == 0x7FFF )
6002                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6003               ) {
6004                return propagateFloat128NaN(a, a, status);
6005            }
6006            return a;
6007        }
6008        lastBitMask = 1;
6009        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6010        roundBitsMask = lastBitMask - 1;
6011        z = a;
6012        switch (status->float_rounding_mode) {
6013        case float_round_nearest_even:
6014            if ( lastBitMask ) {
6015                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6016                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6017            }
6018            else {
6019                if ( (int64_t) z.low < 0 ) {
6020                    ++z.high;
6021                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6022                }
6023            }
6024            break;
6025        case float_round_ties_away:
6026            if (lastBitMask) {
6027                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6028            } else {
6029                if ((int64_t) z.low < 0) {
6030                    ++z.high;
6031                }
6032            }
6033            break;
6034        case float_round_to_zero:
6035            break;
6036        case float_round_up:
6037            if (!extractFloat128Sign(z)) {
6038                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6039            }
6040            break;
6041        case float_round_down:
6042            if (extractFloat128Sign(z)) {
6043                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6044            }
6045            break;
6046        default:
6047            abort();
6048        }
6049        z.low &= ~ roundBitsMask;
6050    }
6051    else {
6052        if ( aExp < 0x3FFF ) {
6053            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6054            status->float_exception_flags |= float_flag_inexact;
6055            aSign = extractFloat128Sign( a );
6056            switch (status->float_rounding_mode) {
6057             case float_round_nearest_even:
6058                if (    ( aExp == 0x3FFE )
6059                     && (   extractFloat128Frac0( a )
6060                          | extractFloat128Frac1( a ) )
6061                   ) {
6062                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6063                }
6064                break;
6065            case float_round_ties_away:
6066                if (aExp == 0x3FFE) {
6067                    return packFloat128(aSign, 0x3FFF, 0, 0);
6068                }
6069                break;
6070             case float_round_down:
6071                return
6072                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6073                    : packFloat128( 0, 0, 0, 0 );
6074             case float_round_up:
6075                return
6076                      aSign ? packFloat128( 1, 0, 0, 0 )
6077                    : packFloat128( 0, 0x3FFF, 0, 0 );
6078            }
6079            return packFloat128( aSign, 0, 0, 0 );
6080        }
6081        lastBitMask = 1;
6082        lastBitMask <<= 0x402F - aExp;
6083        roundBitsMask = lastBitMask - 1;
6084        z.low = 0;
6085        z.high = a.high;
6086        switch (status->float_rounding_mode) {
6087        case float_round_nearest_even:
6088            z.high += lastBitMask>>1;
6089            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6090                z.high &= ~ lastBitMask;
6091            }
6092            break;
6093        case float_round_ties_away:
6094            z.high += lastBitMask>>1;
6095            break;
6096        case float_round_to_zero:
6097            break;
6098        case float_round_up:
6099            if (!extractFloat128Sign(z)) {
6100                z.high |= ( a.low != 0 );
6101                z.high += roundBitsMask;
6102            }
6103            break;
6104        case float_round_down:
6105            if (extractFloat128Sign(z)) {
6106                z.high |= (a.low != 0);
6107                z.high += roundBitsMask;
6108            }
6109            break;
6110        default:
6111            abort();
6112        }
6113        z.high &= ~ roundBitsMask;
6114    }
6115    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6116        status->float_exception_flags |= float_flag_inexact;
6117    }
6118    return z;
6119
6120}
6121
6122/*----------------------------------------------------------------------------
6123| Returns the result of adding the absolute values of the quadruple-precision
6124| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6125| before being returned.  `zSign' is ignored if the result is a NaN.
6126| The addition is performed according to the IEC/IEEE Standard for Binary
6127| Floating-Point Arithmetic.
6128*----------------------------------------------------------------------------*/
6129
6130static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6131                                float_status *status)
6132{
6133    int32_t aExp, bExp, zExp;
6134    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6135    int32_t expDiff;
6136
6137    aSig1 = extractFloat128Frac1( a );
6138    aSig0 = extractFloat128Frac0( a );
6139    aExp = extractFloat128Exp( a );
6140    bSig1 = extractFloat128Frac1( b );
6141    bSig0 = extractFloat128Frac0( b );
6142    bExp = extractFloat128Exp( b );
6143    expDiff = aExp - bExp;
6144    if ( 0 < expDiff ) {
6145        if ( aExp == 0x7FFF ) {
6146            if (aSig0 | aSig1) {
6147                return propagateFloat128NaN(a, b, status);
6148            }
6149            return a;
6150        }
6151        if ( bExp == 0 ) {
6152            --expDiff;
6153        }
6154        else {
6155            bSig0 |= LIT64( 0x0001000000000000 );
6156        }
6157        shift128ExtraRightJamming(
6158            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6159        zExp = aExp;
6160    }
6161    else if ( expDiff < 0 ) {
6162        if ( bExp == 0x7FFF ) {
6163            if (bSig0 | bSig1) {
6164                return propagateFloat128NaN(a, b, status);
6165            }
6166            return packFloat128( zSign, 0x7FFF, 0, 0 );
6167        }
6168        if ( aExp == 0 ) {
6169            ++expDiff;
6170        }
6171        else {
6172            aSig0 |= LIT64( 0x0001000000000000 );
6173        }
6174        shift128ExtraRightJamming(
6175            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6176        zExp = bExp;
6177    }
6178    else {
6179        if ( aExp == 0x7FFF ) {
6180            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6181                return propagateFloat128NaN(a, b, status);
6182            }
6183            return a;
6184        }
6185        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6186        if ( aExp == 0 ) {
6187            if (status->flush_to_zero) {
6188                if (zSig0 | zSig1) {
6189                    float_raise(float_flag_output_denormal, status);
6190                }
6191                return packFloat128(zSign, 0, 0, 0);
6192            }
6193            return packFloat128( zSign, 0, zSig0, zSig1 );
6194        }
6195        zSig2 = 0;
6196        zSig0 |= LIT64( 0x0002000000000000 );
6197        zExp = aExp;
6198        goto shiftRight1;
6199    }
6200    aSig0 |= LIT64( 0x0001000000000000 );
6201    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6202    --zExp;
6203    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6204    ++zExp;
6205 shiftRight1:
6206    shift128ExtraRightJamming(
6207        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6208 roundAndPack:
6209    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6210
6211}
6212
6213/*----------------------------------------------------------------------------
6214| Returns the result of subtracting the absolute values of the quadruple-
6215| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6216| difference is negated before being returned.  `zSign' is ignored if the
6217| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6218| Standard for Binary Floating-Point Arithmetic.
6219*----------------------------------------------------------------------------*/
6220
6221static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6222                                float_status *status)
6223{
6224    int32_t aExp, bExp, zExp;
6225    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6226    int32_t expDiff;
6227
6228    aSig1 = extractFloat128Frac1( a );
6229    aSig0 = extractFloat128Frac0( a );
6230    aExp = extractFloat128Exp( a );
6231    bSig1 = extractFloat128Frac1( b );
6232    bSig0 = extractFloat128Frac0( b );
6233    bExp = extractFloat128Exp( b );
6234    expDiff = aExp - bExp;
6235    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6236    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6237    if ( 0 < expDiff ) goto aExpBigger;
6238    if ( expDiff < 0 ) goto bExpBigger;
6239    if ( aExp == 0x7FFF ) {
6240        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6241            return propagateFloat128NaN(a, b, status);
6242        }
6243        float_raise(float_flag_invalid, status);
6244        return float128_default_nan(status);
6245    }
6246    if ( aExp == 0 ) {
6247        aExp = 1;
6248        bExp = 1;
6249    }
6250    if ( bSig0 < aSig0 ) goto aBigger;
6251    if ( aSig0 < bSig0 ) goto bBigger;
6252    if ( bSig1 < aSig1 ) goto aBigger;
6253    if ( aSig1 < bSig1 ) goto bBigger;
6254    return packFloat128(status->float_rounding_mode == float_round_down,
6255                        0, 0, 0);
6256 bExpBigger:
6257    if ( bExp == 0x7FFF ) {
6258        if (bSig0 | bSig1) {
6259            return propagateFloat128NaN(a, b, status);
6260        }
6261        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6262    }
6263    if ( aExp == 0 ) {
6264        ++expDiff;
6265    }
6266    else {
6267        aSig0 |= LIT64( 0x4000000000000000 );
6268    }
6269    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6270    bSig0 |= LIT64( 0x4000000000000000 );
6271 bBigger:
6272    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6273    zExp = bExp;
6274    zSign ^= 1;
6275    goto normalizeRoundAndPack;
6276 aExpBigger:
6277    if ( aExp == 0x7FFF ) {
6278        if (aSig0 | aSig1) {
6279            return propagateFloat128NaN(a, b, status);
6280        }
6281        return a;
6282    }
6283    if ( bExp == 0 ) {
6284        --expDiff;
6285    }
6286    else {
6287        bSig0 |= LIT64( 0x4000000000000000 );
6288    }
6289    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6290    aSig0 |= LIT64( 0x4000000000000000 );
6291 aBigger:
6292    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6293    zExp = aExp;
6294 normalizeRoundAndPack:
6295    --zExp;
6296    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6297                                         status);
6298
6299}
6300
6301/*----------------------------------------------------------------------------
6302| Returns the result of adding the quadruple-precision floating-point values
6303| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6304| for Binary Floating-Point Arithmetic.
6305*----------------------------------------------------------------------------*/
6306
6307float128 float128_add(float128 a, float128 b, float_status *status)
6308{
6309    flag aSign, bSign;
6310
6311    aSign = extractFloat128Sign( a );
6312    bSign = extractFloat128Sign( b );
6313    if ( aSign == bSign ) {
6314        return addFloat128Sigs(a, b, aSign, status);
6315    }
6316    else {
6317        return subFloat128Sigs(a, b, aSign, status);
6318    }
6319
6320}
6321
6322/*----------------------------------------------------------------------------
6323| Returns the result of subtracting the quadruple-precision floating-point
6324| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6325| Standard for Binary Floating-Point Arithmetic.
6326*----------------------------------------------------------------------------*/
6327
6328float128 float128_sub(float128 a, float128 b, float_status *status)
6329{
6330    flag aSign, bSign;
6331
6332    aSign = extractFloat128Sign( a );
6333    bSign = extractFloat128Sign( b );
6334    if ( aSign == bSign ) {
6335        return subFloat128Sigs(a, b, aSign, status);
6336    }
6337    else {
6338        return addFloat128Sigs(a, b, aSign, status);
6339    }
6340
6341}
6342
6343/*----------------------------------------------------------------------------
6344| Returns the result of multiplying the quadruple-precision floating-point
6345| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6346| Standard for Binary Floating-Point Arithmetic.
6347*----------------------------------------------------------------------------*/
6348
6349float128 float128_mul(float128 a, float128 b, float_status *status)
6350{
6351    flag aSign, bSign, zSign;
6352    int32_t aExp, bExp, zExp;
6353    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6354
6355    aSig1 = extractFloat128Frac1( a );
6356    aSig0 = extractFloat128Frac0( a );
6357    aExp = extractFloat128Exp( a );
6358    aSign = extractFloat128Sign( a );
6359    bSig1 = extractFloat128Frac1( b );
6360    bSig0 = extractFloat128Frac0( b );
6361    bExp = extractFloat128Exp( b );
6362    bSign = extractFloat128Sign( b );
6363    zSign = aSign ^ bSign;
6364    if ( aExp == 0x7FFF ) {
6365        if (    ( aSig0 | aSig1 )
6366             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6367            return propagateFloat128NaN(a, b, status);
6368        }
6369        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6370        return packFloat128( zSign, 0x7FFF, 0, 0 );
6371    }
6372    if ( bExp == 0x7FFF ) {
6373        if (bSig0 | bSig1) {
6374            return propagateFloat128NaN(a, b, status);
6375        }
6376        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6377 invalid:
6378            float_raise(float_flag_invalid, status);
6379            return float128_default_nan(status);
6380        }
6381        return packFloat128( zSign, 0x7FFF, 0, 0 );
6382    }
6383    if ( aExp == 0 ) {
6384        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6385        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6386    }
6387    if ( bExp == 0 ) {
6388        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6389        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6390    }
6391    zExp = aExp + bExp - 0x4000;
6392    aSig0 |= LIT64( 0x0001000000000000 );
6393    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6394    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6395    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6396    zSig2 |= ( zSig3 != 0 );
6397    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6398        shift128ExtraRightJamming(
6399            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6400        ++zExp;
6401    }
6402    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6403
6404}
6405
6406/*----------------------------------------------------------------------------
6407| Returns the result of dividing the quadruple-precision floating-point value
6408| `a' by the corresponding value `b'.  The operation is performed according to
6409| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410*----------------------------------------------------------------------------*/
6411
6412float128 float128_div(float128 a, float128 b, float_status *status)
6413{
6414    flag aSign, bSign, zSign;
6415    int32_t aExp, bExp, zExp;
6416    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6417    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6418
6419    aSig1 = extractFloat128Frac1( a );
6420    aSig0 = extractFloat128Frac0( a );
6421    aExp = extractFloat128Exp( a );
6422    aSign = extractFloat128Sign( a );
6423    bSig1 = extractFloat128Frac1( b );
6424    bSig0 = extractFloat128Frac0( b );
6425    bExp = extractFloat128Exp( b );
6426    bSign = extractFloat128Sign( b );
6427    zSign = aSign ^ bSign;
6428    if ( aExp == 0x7FFF ) {
6429        if (aSig0 | aSig1) {
6430            return propagateFloat128NaN(a, b, status);
6431        }
6432        if ( bExp == 0x7FFF ) {
6433            if (bSig0 | bSig1) {
6434                return propagateFloat128NaN(a, b, status);
6435            }
6436            goto invalid;
6437        }
6438        return packFloat128( zSign, 0x7FFF, 0, 0 );
6439    }
6440    if ( bExp == 0x7FFF ) {
6441        if (bSig0 | bSig1) {
6442            return propagateFloat128NaN(a, b, status);
6443        }
6444        return packFloat128( zSign, 0, 0, 0 );
6445    }
6446    if ( bExp == 0 ) {
6447        if ( ( bSig0 | bSig1 ) == 0 ) {
6448            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6449 invalid:
6450                float_raise(float_flag_invalid, status);
6451                return float128_default_nan(status);
6452            }
6453            float_raise(float_flag_divbyzero, status);
6454            return packFloat128( zSign, 0x7FFF, 0, 0 );
6455        }
6456        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6457    }
6458    if ( aExp == 0 ) {
6459        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6460        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6461    }
6462    zExp = aExp - bExp + 0x3FFD;
6463    shortShift128Left(
6464        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6465    shortShift128Left(
6466        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6467    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6468        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6469        ++zExp;
6470    }
6471    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6472    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6473    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6474    while ( (int64_t) rem0 < 0 ) {
6475        --zSig0;
6476        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6477    }
6478    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6479    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6480        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6481        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6482        while ( (int64_t) rem1 < 0 ) {
6483            --zSig1;
6484            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6485        }
6486        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6487    }
6488    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6489    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6490
6491}
6492
6493/*----------------------------------------------------------------------------
6494| Returns the remainder of the quadruple-precision floating-point value `a'
6495| with respect to the corresponding value `b'.  The operation is performed
6496| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6497*----------------------------------------------------------------------------*/
6498
6499float128 float128_rem(float128 a, float128 b, float_status *status)
6500{
6501    flag aSign, zSign;
6502    int32_t aExp, bExp, expDiff;
6503    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6504    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6505    int64_t sigMean0;
6506
6507    aSig1 = extractFloat128Frac1( a );
6508    aSig0 = extractFloat128Frac0( a );
6509    aExp = extractFloat128Exp( a );
6510    aSign = extractFloat128Sign( a );
6511    bSig1 = extractFloat128Frac1( b );
6512    bSig0 = extractFloat128Frac0( b );
6513    bExp = extractFloat128Exp( b );
6514    if ( aExp == 0x7FFF ) {
6515        if (    ( aSig0 | aSig1 )
6516             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6517            return propagateFloat128NaN(a, b, status);
6518        }
6519        goto invalid;
6520    }
6521    if ( bExp == 0x7FFF ) {
6522        if (bSig0 | bSig1) {
6523            return propagateFloat128NaN(a, b, status);
6524        }
6525        return a;
6526    }
6527    if ( bExp == 0 ) {
6528        if ( ( bSig0 | bSig1 ) == 0 ) {
6529 invalid:
6530            float_raise(float_flag_invalid, status);
6531            return float128_default_nan(status);
6532        }
6533        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6534    }
6535    if ( aExp == 0 ) {
6536        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6537        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6538    }
6539    expDiff = aExp - bExp;
6540    if ( expDiff < -1 ) return a;
6541    shortShift128Left(
6542        aSig0 | LIT64( 0x0001000000000000 ),
6543        aSig1,
6544        15 - ( expDiff < 0 ),
6545        &aSig0,
6546        &aSig1
6547    );
6548    shortShift128Left(
6549        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6550    q = le128( bSig0, bSig1, aSig0, aSig1 );
6551    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6552    expDiff -= 64;
6553    while ( 0 < expDiff ) {
6554        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6555        q = ( 4 < q ) ? q - 4 : 0;
6556        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6557        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6558        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6559        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6560        expDiff -= 61;
6561    }
6562    if ( -64 < expDiff ) {
6563        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6564        q = ( 4 < q ) ? q - 4 : 0;
6565        q >>= - expDiff;
6566        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6567        expDiff += 52;
6568        if ( expDiff < 0 ) {
6569            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6570        }
6571        else {
6572            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6573        }
6574        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6575        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6576    }
6577    else {
6578        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6579        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6580    }
6581    do {
6582        alternateASig0 = aSig0;
6583        alternateASig1 = aSig1;
6584        ++q;
6585        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6586    } while ( 0 <= (int64_t) aSig0 );
6587    add128(
6588        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6589    if (    ( sigMean0 < 0 )
6590         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6591        aSig0 = alternateASig0;
6592        aSig1 = alternateASig1;
6593    }
6594    zSign = ( (int64_t) aSig0 < 0 );
6595    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6596    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6597                                         status);
6598}
6599
6600/*----------------------------------------------------------------------------
6601| Returns the square root of the quadruple-precision floating-point value `a'.
6602| The operation is performed according to the IEC/IEEE Standard for Binary
6603| Floating-Point Arithmetic.
6604*----------------------------------------------------------------------------*/
6605
6606float128 float128_sqrt(float128 a, float_status *status)
6607{
6608    flag aSign;
6609    int32_t aExp, zExp;
6610    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6612
6613    aSig1 = extractFloat128Frac1( a );
6614    aSig0 = extractFloat128Frac0( a );
6615    aExp = extractFloat128Exp( a );
6616    aSign = extractFloat128Sign( a );
6617    if ( aExp == 0x7FFF ) {
6618        if (aSig0 | aSig1) {
6619            return propagateFloat128NaN(a, a, status);
6620        }
6621        if ( ! aSign ) return a;
6622        goto invalid;
6623    }
6624    if ( aSign ) {
6625        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6626 invalid:
6627        float_raise(float_flag_invalid, status);
6628        return float128_default_nan(status);
6629    }
6630    if ( aExp == 0 ) {
6631        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6632        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6633    }
6634    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6635    aSig0 |= LIT64( 0x0001000000000000 );
6636    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6637    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6638    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6639    doubleZSig0 = zSig0<<1;
6640    mul64To128( zSig0, zSig0, &term0, &term1 );
6641    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6642    while ( (int64_t) rem0 < 0 ) {
6643        --zSig0;
6644        doubleZSig0 -= 2;
6645        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6646    }
6647    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6648    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6649        if ( zSig1 == 0 ) zSig1 = 1;
6650        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6651        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6652        mul64To128( zSig1, zSig1, &term2, &term3 );
6653        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6654        while ( (int64_t) rem1 < 0 ) {
6655            --zSig1;
6656            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6657            term3 |= 1;
6658            term2 |= doubleZSig0;
6659            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6660        }
6661        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6662    }
6663    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6664    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6665
6666}
6667
6668/*----------------------------------------------------------------------------
6669| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6670| the corresponding value `b', and 0 otherwise.  The invalid exception is
6671| raised if either operand is a NaN.  Otherwise, the comparison is performed
6672| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6673*----------------------------------------------------------------------------*/
6674
6675int float128_eq(float128 a, float128 b, float_status *status)
6676{
6677
6678    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6679              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6680         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6681              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6682       ) {
6683        float_raise(float_flag_invalid, status);
6684        return 0;
6685    }
6686    return
6687           ( a.low == b.low )
6688        && (    ( a.high == b.high )
6689             || (    ( a.low == 0 )
6690                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6691           );
6692
6693}
6694
6695/*----------------------------------------------------------------------------
6696| Returns 1 if the quadruple-precision floating-point value `a' is less than
6697| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6698| exception is raised if either operand is a NaN.  The comparison is performed
6699| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6700*----------------------------------------------------------------------------*/
6701
6702int float128_le(float128 a, float128 b, float_status *status)
6703{
6704    flag aSign, bSign;
6705
6706    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6707              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6708         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6709              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6710       ) {
6711        float_raise(float_flag_invalid, status);
6712        return 0;
6713    }
6714    aSign = extractFloat128Sign( a );
6715    bSign = extractFloat128Sign( b );
6716    if ( aSign != bSign ) {
6717        return
6718               aSign
6719            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6720                 == 0 );
6721    }
6722    return
6723          aSign ? le128( b.high, b.low, a.high, a.low )
6724        : le128( a.high, a.low, b.high, b.low );
6725
6726}
6727
6728/*----------------------------------------------------------------------------
6729| Returns 1 if the quadruple-precision floating-point value `a' is less than
6730| the corresponding value `b', and 0 otherwise.  The invalid exception is
6731| raised if either operand is a NaN.  The comparison is performed according
6732| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6733*----------------------------------------------------------------------------*/
6734
6735int float128_lt(float128 a, float128 b, float_status *status)
6736{
6737    flag aSign, bSign;
6738
6739    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6740              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6741         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6742              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6743       ) {
6744        float_raise(float_flag_invalid, status);
6745        return 0;
6746    }
6747    aSign = extractFloat128Sign( a );
6748    bSign = extractFloat128Sign( b );
6749    if ( aSign != bSign ) {
6750        return
6751               aSign
6752            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6753                 != 0 );
6754    }
6755    return
6756          aSign ? lt128( b.high, b.low, a.high, a.low )
6757        : lt128( a.high, a.low, b.high, b.low );
6758
6759}
6760
6761/*----------------------------------------------------------------------------
6762| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6763| be compared, and 0 otherwise.  The invalid exception is raised if either
6764| operand is a NaN. The comparison is performed according to the IEC/IEEE
6765| Standard for Binary Floating-Point Arithmetic.
6766*----------------------------------------------------------------------------*/
6767
6768int float128_unordered(float128 a, float128 b, float_status *status)
6769{
6770    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6771              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6772         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6773              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6774       ) {
6775        float_raise(float_flag_invalid, status);
6776        return 1;
6777    }
6778    return 0;
6779}
6780
6781/*----------------------------------------------------------------------------
6782| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6783| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6784| exception.  The comparison is performed according to the IEC/IEEE Standard
6785| for Binary Floating-Point Arithmetic.
6786*----------------------------------------------------------------------------*/
6787
6788int float128_eq_quiet(float128 a, float128 b, float_status *status)
6789{
6790
6791    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6792              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6793         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6794              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6795       ) {
6796        if (float128_is_signaling_nan(a, status)
6797         || float128_is_signaling_nan(b, status)) {
6798            float_raise(float_flag_invalid, status);
6799        }
6800        return 0;
6801    }
6802    return
6803           ( a.low == b.low )
6804        && (    ( a.high == b.high )
6805             || (    ( a.low == 0 )
6806                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6807           );
6808
6809}
6810
6811/*----------------------------------------------------------------------------
6812| Returns 1 if the quadruple-precision floating-point value `a' is less than
6813| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6814| cause an exception.  Otherwise, the comparison is performed according to the
6815| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816*----------------------------------------------------------------------------*/
6817
6818int float128_le_quiet(float128 a, float128 b, float_status *status)
6819{
6820    flag aSign, bSign;
6821
6822    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6823              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6824         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6825              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6826       ) {
6827        if (float128_is_signaling_nan(a, status)
6828         || float128_is_signaling_nan(b, status)) {
6829            float_raise(float_flag_invalid, status);
6830        }
6831        return 0;
6832    }
6833    aSign = extractFloat128Sign( a );
6834    bSign = extractFloat128Sign( b );
6835    if ( aSign != bSign ) {
6836        return
6837               aSign
6838            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6839                 == 0 );
6840    }
6841    return
6842          aSign ? le128( b.high, b.low, a.high, a.low )
6843        : le128( a.high, a.low, b.high, b.low );
6844
6845}
6846
6847/*----------------------------------------------------------------------------
6848| Returns 1 if the quadruple-precision floating-point value `a' is less than
6849| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6850| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6851| Standard for Binary Floating-Point Arithmetic.
6852*----------------------------------------------------------------------------*/
6853
6854int float128_lt_quiet(float128 a, float128 b, float_status *status)
6855{
6856    flag aSign, bSign;
6857
6858    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6859              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6860         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6861              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6862       ) {
6863        if (float128_is_signaling_nan(a, status)
6864         || float128_is_signaling_nan(b, status)) {
6865            float_raise(float_flag_invalid, status);
6866        }
6867        return 0;
6868    }
6869    aSign = extractFloat128Sign( a );
6870    bSign = extractFloat128Sign( b );
6871    if ( aSign != bSign ) {
6872        return
6873               aSign
6874            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6875                 != 0 );
6876    }
6877    return
6878          aSign ? lt128( b.high, b.low, a.high, a.low )
6879        : lt128( a.high, a.low, b.high, b.low );
6880
6881}
6882
6883/*----------------------------------------------------------------------------
6884| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6885| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6886| comparison is performed according to the IEC/IEEE Standard for Binary
6887| Floating-Point Arithmetic.
6888*----------------------------------------------------------------------------*/
6889
6890int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6891{
6892    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6893              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6894         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6895              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6896       ) {
6897        if (float128_is_signaling_nan(a, status)
6898         || float128_is_signaling_nan(b, status)) {
6899            float_raise(float_flag_invalid, status);
6900        }
6901        return 1;
6902    }
6903    return 0;
6904}
6905
6906static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6907                                            int is_quiet, float_status *status)
6908{
6909    flag aSign, bSign;
6910
6911    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6912        float_raise(float_flag_invalid, status);
6913        return float_relation_unordered;
6914    }
6915    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6916          ( extractFloatx80Frac( a )<<1 ) ) ||
6917        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6918          ( extractFloatx80Frac( b )<<1 ) )) {
6919        if (!is_quiet ||
6920            floatx80_is_signaling_nan(a, status) ||
6921            floatx80_is_signaling_nan(b, status)) {
6922            float_raise(float_flag_invalid, status);
6923        }
6924        return float_relation_unordered;
6925    }
6926    aSign = extractFloatx80Sign( a );
6927    bSign = extractFloatx80Sign( b );
6928    if ( aSign != bSign ) {
6929
6930        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6931             ( ( a.low | b.low ) == 0 ) ) {
6932            /* zero case */
6933            return float_relation_equal;
6934        } else {
6935            return 1 - (2 * aSign);
6936        }
6937    } else {
6938        if (a.low == b.low && a.high == b.high) {
6939            return float_relation_equal;
6940        } else {
6941            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6942        }
6943    }
6944}
6945
6946int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6947{
6948    return floatx80_compare_internal(a, b, 0, status);
6949}
6950
6951int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6952{
6953    return floatx80_compare_internal(a, b, 1, status);
6954}
6955
6956static inline int float128_compare_internal(float128 a, float128 b,
6957                                            int is_quiet, float_status *status)
6958{
6959    flag aSign, bSign;
6960
6961    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6962          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6963        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6964          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6965        if (!is_quiet ||
6966            float128_is_signaling_nan(a, status) ||
6967            float128_is_signaling_nan(b, status)) {
6968            float_raise(float_flag_invalid, status);
6969        }
6970        return float_relation_unordered;
6971    }
6972    aSign = extractFloat128Sign( a );
6973    bSign = extractFloat128Sign( b );
6974    if ( aSign != bSign ) {
6975        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6976            /* zero case */
6977            return float_relation_equal;
6978        } else {
6979            return 1 - (2 * aSign);
6980        }
6981    } else {
6982        if (a.low == b.low && a.high == b.high) {
6983            return float_relation_equal;
6984        } else {
6985            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6986        }
6987    }
6988}
6989
6990int float128_compare(float128 a, float128 b, float_status *status)
6991{
6992    return float128_compare_internal(a, b, 0, status);
6993}
6994
6995int float128_compare_quiet(float128 a, float128 b, float_status *status)
6996{
6997    return float128_compare_internal(a, b, 1, status);
6998}
6999
7000floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)

7001{
7002    flag aSign;
7003    int32_t aExp;
7004    uint64_t aSig;
7005
7006    if (floatx80_invalid_encoding(a)) {
7007        float_raise(float_flag_invalid, status);
7008        return floatx80_default_nan(status);
7009    }
7010    aSig = extractFloatx80Frac( a );
7011    aExp = extractFloatx80Exp( a );
7012    aSign = extractFloatx80Sign( a );
7013
7014    if ( aExp == 0x7FFF ) {
7015        if ( aSig<<1 ) {
7016            return propagateFloatx80NaN(a, a, status);
7017        }
7018        return a;
7019    }
7020
7021    if (aExp == 0) {
7022        if (aSig == 0) {
7023            return a;
7024        }
7025        aExp++;
7026    }
7027
7028    if (n > 0x10000) {
7029        n = 0x10000;
7030    } else if (n < -0x10000) {
7031        n = -0x10000;
7032    }
7033
7034    aExp += n;
7035    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7036                                         aSign, aExp, aSig, 0, status);
7037}
7038
7039float128 float128_scalbn(float128 a, int n, float_status *status)
7040{
7041    flag aSign;
7042    int32_t aExp;
7043    uint64_t aSig0, aSig1;
7044
7045    aSig1 = extractFloat128Frac1( a );
7046    aSig0 = extractFloat128Frac0( a );
7047    aExp = extractFloat128Exp( a );
7048    aSign = extractFloat128Sign( a );
7049    if ( aExp == 0x7FFF ) {
7050        if ( aSig0 | aSig1 ) {
7051            return propagateFloat128NaN(a, a, status);
7052        }
7053        return a;
7054    }
7055    if (aExp != 0) {
7056        aSig0 |= LIT64( 0x0001000000000000 );
7057    } else if (aSig0 == 0 && aSig1 == 0) {
7058        return a;
7059    } else {
7060        aExp++;
7061    }
7062
7063    if (n > 0x10000) {
7064        n = 0x10000;
7065    } else if (n < -0x10000) {
7066        n = -0x10000;
7067    }
7068
7069    aExp += n - 1;
7070    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7071                                         , status);
7072
7073}
7074