qemu/fpu/softfloat.c
<<
>>
Prefs
   1/*
   2 * QEMU float support
   3 *
   4 * The code in this source file is derived from release 2a of the SoftFloat
   5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6 * some later contributions) are provided under that license, as detailed below.
   7 * It has subsequently been modified by contributors to the QEMU Project,
   8 * so some portions are provided under:
   9 *  the SoftFloat-2a license
  10 *  the BSD license
  11 *  GPL-v2-or-later
  12 *
  13 * Any future contributions to this file after December 1st 2014 will be
  14 * taken to be licensed under the Softfloat-2a license unless specifically
  15 * indicated otherwise.
  16 */
  17
  18/*
  19===============================================================================
  20This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21Arithmetic Package, Release 2a.
  22
  23Written by John R. Hauser.  This work was made possible in part by the
  24International Computer Science Institute, located at Suite 600, 1947 Center
  25Street, Berkeley, California 94704.  Funding was partially provided by the
  26National Science Foundation under grant MIP-9311980.  The original version
  27of this code was written as part of a project to build a fixed-point vector
  28processor in collaboration with the University of California at Berkeley,
  29overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31arithmetic/SoftFloat.html'.
  32
  33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39Derivative works are acceptable, even for commercial purposes, so long as
  40(1) they include prominent notice that the work is derivative, and (2) they
  41include prominent notice akin to these four paragraphs for those parts of
  42this code that are retained.
  43
  44===============================================================================
  45*/
  46
  47/* BSD licensing:
  48 * Copyright (c) 2006, Fabrice Bellard
  49 * All rights reserved.
  50 *
  51 * Redistribution and use in source and binary forms, with or without
  52 * modification, are permitted provided that the following conditions are met:
  53 *
  54 * 1. Redistributions of source code must retain the above copyright notice,
  55 * this list of conditions and the following disclaimer.
  56 *
  57 * 2. Redistributions in binary form must reproduce the above copyright notice,
  58 * this list of conditions and the following disclaimer in the documentation
  59 * and/or other materials provided with the distribution.
  60 *
  61 * 3. Neither the name of the copyright holder nor the names of its contributors
  62 * may be used to endorse or promote products derived from this software without
  63 * specific prior written permission.
  64 *
  65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75 * THE POSSIBILITY OF SUCH DAMAGE.
  76 */
  77
  78/* Portions of this work are licensed under the terms of the GNU GPL,
  79 * version 2 or later. See the COPYING file in the top-level directory.
  80 */
  81
  82/* softfloat (and in particular the code in softfloat-specialize.h) is
  83 * target-dependent and needs the TARGET_* macros.
  84 */
  85#include "config.h"
  86
  87#include "fpu/softfloat.h"
  88
  89/* We only need stdlib for abort() */
  90#include <stdlib.h>
  91
  92/*----------------------------------------------------------------------------
  93| Primitive arithmetic functions, including multi-word arithmetic, and
  94| division and square root approximations.  (Can be specialized to target if
  95| desired.)
  96*----------------------------------------------------------------------------*/
  97#include "softfloat-macros.h"
  98
  99/*----------------------------------------------------------------------------
 100| Functions and definitions to determine:  (1) whether tininess for underflow
 101| is detected before or after rounding by default, (2) what (if anything)
 102| happens when exceptions are raised, (3) how signaling NaNs are distinguished
 103| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 104| are propagated from function inputs to output.  These details are target-
 105| specific.
 106*----------------------------------------------------------------------------*/
 107#include "softfloat-specialize.h"
 108
 109/*----------------------------------------------------------------------------
 110| Returns the fraction bits of the half-precision floating-point value `a'.
 111*----------------------------------------------------------------------------*/
 112
 113static inline uint32_t extractFloat16Frac(float16 a)
 114{
 115    return float16_val(a) & 0x3ff;
 116}
 117
 118/*----------------------------------------------------------------------------
 119| Returns the exponent bits of the half-precision floating-point value `a'.
 120*----------------------------------------------------------------------------*/
 121
 122static inline int_fast16_t extractFloat16Exp(float16 a)
 123{
 124    return (float16_val(a) >> 10) & 0x1f;
 125}
 126
 127/*----------------------------------------------------------------------------
  128| Returns the sign bit of the half-precision floating-point value `a'.
 129*----------------------------------------------------------------------------*/
 130
 131static inline flag extractFloat16Sign(float16 a)
 132{
 133    return float16_val(a)>>15;
 134}
 135
 136/*----------------------------------------------------------------------------
 137| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 138| and 7, and returns the properly rounded 32-bit integer corresponding to the
 139| input.  If `zSign' is 1, the input is negated before being converted to an
 140| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 141| is simply rounded to an integer, with the inexact exception raised if the
 142| input cannot be represented exactly as an integer.  However, if the fixed-
 143| point input is too large, the invalid exception is raised and the largest
 144| positive or negative integer is returned.
 145*----------------------------------------------------------------------------*/
 146
 147static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 148{
 149    int8 roundingMode;
 150    flag roundNearestEven;
 151    int8 roundIncrement, roundBits;
 152    int32_t z;
 153
 154    roundingMode = status->float_rounding_mode;
 155    roundNearestEven = ( roundingMode == float_round_nearest_even );
 156    switch (roundingMode) {
 157    case float_round_nearest_even:
 158    case float_round_ties_away:
 159        roundIncrement = 0x40;
 160        break;
 161    case float_round_to_zero:
 162        roundIncrement = 0;
 163        break;
 164    case float_round_up:
 165        roundIncrement = zSign ? 0 : 0x7f;
 166        break;
 167    case float_round_down:
 168        roundIncrement = zSign ? 0x7f : 0;
 169        break;
 170    default:
 171        abort();
 172    }
 173    roundBits = absZ & 0x7F;
 174    absZ = ( absZ + roundIncrement )>>7;
 175    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 176    z = absZ;
 177    if ( zSign ) z = - z;
 178    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 179        float_raise(float_flag_invalid, status);
 180        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 181    }
 182    if (roundBits) {
 183        status->float_exception_flags |= float_flag_inexact;
 184    }
 185    return z;
 186
 187}
 188
 189/*----------------------------------------------------------------------------
 190| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 191| `absZ1', with binary point between bits 63 and 64 (between the input words),
 192| and returns the properly rounded 64-bit integer corresponding to the input.
 193| If `zSign' is 1, the input is negated before being converted to an integer.
 194| Ordinarily, the fixed-point input is simply rounded to an integer, with
 195| the inexact exception raised if the input cannot be represented exactly as
 196| an integer.  However, if the fixed-point input is too large, the invalid
 197| exception is raised and the largest positive or negative integer is
 198| returned.
 199*----------------------------------------------------------------------------*/
 200
 201static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 202                               float_status *status)
 203{
 204    int8 roundingMode;
 205    flag roundNearestEven, increment;
 206    int64_t z;
 207
 208    roundingMode = status->float_rounding_mode;
 209    roundNearestEven = ( roundingMode == float_round_nearest_even );
 210    switch (roundingMode) {
 211    case float_round_nearest_even:
 212    case float_round_ties_away:
 213        increment = ((int64_t) absZ1 < 0);
 214        break;
 215    case float_round_to_zero:
 216        increment = 0;
 217        break;
 218    case float_round_up:
 219        increment = !zSign && absZ1;
 220        break;
 221    case float_round_down:
 222        increment = zSign && absZ1;
 223        break;
 224    default:
 225        abort();
 226    }
 227    if ( increment ) {
 228        ++absZ0;
 229        if ( absZ0 == 0 ) goto overflow;
 230        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 231    }
 232    z = absZ0;
 233    if ( zSign ) z = - z;
 234    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 235 overflow:
 236        float_raise(float_flag_invalid, status);
 237        return
 238              zSign ? (int64_t) LIT64( 0x8000000000000000 )
 239            : LIT64( 0x7FFFFFFFFFFFFFFF );
 240    }
 241    if (absZ1) {
 242        status->float_exception_flags |= float_flag_inexact;
 243    }
 244    return z;
 245
 246}
 247
 248/*----------------------------------------------------------------------------
 249| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 250| `absZ1', with binary point between bits 63 and 64 (between the input words),
 251| and returns the properly rounded 64-bit unsigned integer corresponding to the
 252| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 253| with the inexact exception raised if the input cannot be represented exactly
 254| as an integer.  However, if the fixed-point input is too large, the invalid
 255| exception is raised and the largest unsigned integer is returned.
 256*----------------------------------------------------------------------------*/
 257
 258static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
 259                                uint64_t absZ1, float_status *status)
 260{
 261    int8 roundingMode;
 262    flag roundNearestEven, increment;
 263
 264    roundingMode = status->float_rounding_mode;
 265    roundNearestEven = (roundingMode == float_round_nearest_even);
 266    switch (roundingMode) {
 267    case float_round_nearest_even:
 268    case float_round_ties_away:
 269        increment = ((int64_t)absZ1 < 0);
 270        break;
 271    case float_round_to_zero:
 272        increment = 0;
 273        break;
 274    case float_round_up:
 275        increment = !zSign && absZ1;
 276        break;
 277    case float_round_down:
 278        increment = zSign && absZ1;
 279        break;
 280    default:
 281        abort();
 282    }
 283    if (increment) {
 284        ++absZ0;
 285        if (absZ0 == 0) {
 286            float_raise(float_flag_invalid, status);
 287            return LIT64(0xFFFFFFFFFFFFFFFF);
 288        }
 289        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 290    }
 291
 292    if (zSign && absZ0) {
 293        float_raise(float_flag_invalid, status);
 294        return 0;
 295    }
 296
 297    if (absZ1) {
 298        status->float_exception_flags |= float_flag_inexact;
 299    }
 300    return absZ0;
 301}
 302
 303/*----------------------------------------------------------------------------
 304| Returns the fraction bits of the single-precision floating-point value `a'.
 305*----------------------------------------------------------------------------*/
 306
 307static inline uint32_t extractFloat32Frac( float32 a )
 308{
 309
 310    return float32_val(a) & 0x007FFFFF;
 311
 312}
 313
 314/*----------------------------------------------------------------------------
 315| Returns the exponent bits of the single-precision floating-point value `a'.
 316*----------------------------------------------------------------------------*/
 317
 318static inline int_fast16_t extractFloat32Exp(float32 a)
 319{
 320
 321    return ( float32_val(a)>>23 ) & 0xFF;
 322
 323}
 324
 325/*----------------------------------------------------------------------------
 326| Returns the sign bit of the single-precision floating-point value `a'.
 327*----------------------------------------------------------------------------*/
 328
 329static inline flag extractFloat32Sign( float32 a )
 330{
 331
 332    return float32_val(a)>>31;
 333
 334}
 335
 336/*----------------------------------------------------------------------------
 337| If `a' is denormal and we are in flush-to-zero mode then set the
 338| input-denormal exception and return zero. Otherwise just return the value.
 339*----------------------------------------------------------------------------*/
 340float32 float32_squash_input_denormal(float32 a, float_status *status)
 341{
 342    if (status->flush_inputs_to_zero) {
 343        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 344            float_raise(float_flag_input_denormal, status);
 345            return make_float32(float32_val(a) & 0x80000000);
 346        }
 347    }
 348    return a;
 349}
 350
 351/*----------------------------------------------------------------------------
 352| Normalizes the subnormal single-precision floating-point value represented
 353| by the denormalized significand `aSig'.  The normalized exponent and
 354| significand are stored at the locations pointed to by `zExpPtr' and
 355| `zSigPtr', respectively.
 356*----------------------------------------------------------------------------*/
 357
 358static void
 359 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 360{
 361    int8 shiftCount;
 362
 363    shiftCount = countLeadingZeros32( aSig ) - 8;
 364    *zSigPtr = aSig<<shiftCount;
 365    *zExpPtr = 1 - shiftCount;
 366
 367}
 368
 369/*----------------------------------------------------------------------------
 370| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 371| single-precision floating-point value, returning the result.  After being
 372| shifted into the proper positions, the three fields are simply added
 373| together to form the result.  This means that any integer portion of `zSig'
 374| will be added into the exponent.  Since a properly normalized significand
 375| will have an integer portion equal to 1, the `zExp' input should be 1 less
 376| than the desired result exponent whenever `zSig' is a complete, normalized
 377| significand.
 378*----------------------------------------------------------------------------*/
 379
 380static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 381{
 382
 383    return make_float32(
 384          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 385
 386}
 387
 388/*----------------------------------------------------------------------------
 389| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 390| and significand `zSig', and returns the proper single-precision floating-
 391| point value corresponding to the abstract input.  Ordinarily, the abstract
 392| value is simply rounded and packed into the single-precision format, with
 393| the inexact exception raised if the abstract input cannot be represented
 394| exactly.  However, if the abstract value is too large, the overflow and
 395| inexact exceptions are raised and an infinity or maximal finite value is
 396| returned.  If the abstract value is too small, the input value is rounded to
 397| a subnormal number, and the underflow and inexact exceptions are raised if
 398| the abstract input cannot be represented exactly as a subnormal single-
 399| precision floating-point number.
 400|     The input significand `zSig' has its binary point between bits 30
 401| and 29, which is 7 bits to the left of the usual location.  This shifted
 402| significand must be normalized or smaller.  If `zSig' is not normalized,
 403| `zExp' must be 0; in that case, the result returned is a subnormal number,
 404| and it must not require rounding.  In the usual case that `zSig' is
 405| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 406| The handling of underflow and overflow follows the IEC/IEEE Standard for
 407| Binary Floating-Point Arithmetic.
 408*----------------------------------------------------------------------------*/
 409
 410static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 411                                   float_status *status)
 412{
 413    int8 roundingMode;
 414    flag roundNearestEven;
 415    int8 roundIncrement, roundBits;
 416    flag isTiny;
 417
 418    roundingMode = status->float_rounding_mode;
 419    roundNearestEven = ( roundingMode == float_round_nearest_even );
 420    switch (roundingMode) {
 421    case float_round_nearest_even:
 422    case float_round_ties_away:
 423        roundIncrement = 0x40;
 424        break;
 425    case float_round_to_zero:
 426        roundIncrement = 0;
 427        break;
 428    case float_round_up:
 429        roundIncrement = zSign ? 0 : 0x7f;
 430        break;
 431    case float_round_down:
 432        roundIncrement = zSign ? 0x7f : 0;
 433        break;
 434    default:
 435        abort();
 436        break;
 437    }
 438    roundBits = zSig & 0x7F;
 439    if ( 0xFD <= (uint16_t) zExp ) {
 440        if (    ( 0xFD < zExp )
 441             || (    ( zExp == 0xFD )
 442                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 443           ) {
 444            float_raise(float_flag_overflow | float_flag_inexact, status);
 445            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 446        }
 447        if ( zExp < 0 ) {
 448            if (status->flush_to_zero) {
 449                float_raise(float_flag_output_denormal, status);
 450                return packFloat32(zSign, 0, 0);
 451            }
 452            isTiny =
 453                (status->float_detect_tininess
 454                 == float_tininess_before_rounding)
 455                || ( zExp < -1 )
 456                || ( zSig + roundIncrement < 0x80000000 );
 457            shift32RightJamming( zSig, - zExp, &zSig );
 458            zExp = 0;
 459            roundBits = zSig & 0x7F;
 460            if (isTiny && roundBits) {
 461                float_raise(float_flag_underflow, status);
 462            }
 463        }
 464    }
 465    if (roundBits) {
 466        status->float_exception_flags |= float_flag_inexact;
 467    }
 468    zSig = ( zSig + roundIncrement )>>7;
 469    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 470    if ( zSig == 0 ) zExp = 0;
 471    return packFloat32( zSign, zExp, zSig );
 472
 473}
 474
 475/*----------------------------------------------------------------------------
 476| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 477| and significand `zSig', and returns the proper single-precision floating-
 478| point value corresponding to the abstract input.  This routine is just like
 479| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 480| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 481| floating-point exponent.
 482*----------------------------------------------------------------------------*/
 483
 484static float32
 485 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
 486                              float_status *status)
 487{
 488    int8 shiftCount;
 489
 490    shiftCount = countLeadingZeros32( zSig ) - 1;
 491    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 492                               status);
 493
 494}
 495
 496/*----------------------------------------------------------------------------
 497| Returns the fraction bits of the double-precision floating-point value `a'.
 498*----------------------------------------------------------------------------*/
 499
 500static inline uint64_t extractFloat64Frac( float64 a )
 501{
 502
 503    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 504
 505}
 506
 507/*----------------------------------------------------------------------------
 508| Returns the exponent bits of the double-precision floating-point value `a'.
 509*----------------------------------------------------------------------------*/
 510
 511static inline int_fast16_t extractFloat64Exp(float64 a)
 512{
 513
 514    return ( float64_val(a)>>52 ) & 0x7FF;
 515
 516}
 517
 518/*----------------------------------------------------------------------------
 519| Returns the sign bit of the double-precision floating-point value `a'.
 520*----------------------------------------------------------------------------*/
 521
 522static inline flag extractFloat64Sign( float64 a )
 523{
 524
 525    return float64_val(a)>>63;
 526
 527}
 528
 529/*----------------------------------------------------------------------------
 530| If `a' is denormal and we are in flush-to-zero mode then set the
 531| input-denormal exception and return zero. Otherwise just return the value.
 532*----------------------------------------------------------------------------*/
 533float64 float64_squash_input_denormal(float64 a, float_status *status)
 534{
 535    if (status->flush_inputs_to_zero) {
 536        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 537            float_raise(float_flag_input_denormal, status);
 538            return make_float64(float64_val(a) & (1ULL << 63));
 539        }
 540    }
 541    return a;
 542}
 543
 544/*----------------------------------------------------------------------------
 545| Normalizes the subnormal double-precision floating-point value represented
 546| by the denormalized significand `aSig'.  The normalized exponent and
 547| significand are stored at the locations pointed to by `zExpPtr' and
 548| `zSigPtr', respectively.
 549*----------------------------------------------------------------------------*/
 550
 551static void
 552 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 553{
 554    int8 shiftCount;
 555
 556    shiftCount = countLeadingZeros64( aSig ) - 11;
 557    *zSigPtr = aSig<<shiftCount;
 558    *zExpPtr = 1 - shiftCount;
 559
 560}
 561
 562/*----------------------------------------------------------------------------
 563| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 564| double-precision floating-point value, returning the result.  After being
 565| shifted into the proper positions, the three fields are simply added
 566| together to form the result.  This means that any integer portion of `zSig'
 567| will be added into the exponent.  Since a properly normalized significand
 568| will have an integer portion equal to 1, the `zExp' input should be 1 less
 569| than the desired result exponent whenever `zSig' is a complete, normalized
 570| significand.
 571*----------------------------------------------------------------------------*/
 572
 573static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 574{
 575
 576    return make_float64(
 577        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 578
 579}
 580
 581/*----------------------------------------------------------------------------
 582| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 583| and significand `zSig', and returns the proper double-precision floating-
 584| point value corresponding to the abstract input.  Ordinarily, the abstract
 585| value is simply rounded and packed into the double-precision format, with
 586| the inexact exception raised if the abstract input cannot be represented
 587| exactly.  However, if the abstract value is too large, the overflow and
 588| inexact exceptions are raised and an infinity or maximal finite value is
 589| returned.  If the abstract value is too small, the input value is rounded to
 590| a subnormal number, and the underflow and inexact exceptions are raised if
 591| the abstract input cannot be represented exactly as a subnormal double-
 592| precision floating-point number.
 593|     The input significand `zSig' has its binary point between bits 62
 594| and 61, which is 10 bits to the left of the usual location.  This shifted
 595| significand must be normalized or smaller.  If `zSig' is not normalized,
 596| `zExp' must be 0; in that case, the result returned is a subnormal number,
 597| and it must not require rounding.  In the usual case that `zSig' is
 598| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 599| The handling of underflow and overflow follows the IEC/IEEE Standard for
 600| Binary Floating-Point Arithmetic.
 601*----------------------------------------------------------------------------*/
 602
 603static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 604                                   float_status *status)
 605{
 606    int8 roundingMode;
 607    flag roundNearestEven;
 608    int_fast16_t roundIncrement, roundBits;
 609    flag isTiny;
 610
 611    roundingMode = status->float_rounding_mode;
 612    roundNearestEven = ( roundingMode == float_round_nearest_even );
 613    switch (roundingMode) {
 614    case float_round_nearest_even:
 615    case float_round_ties_away:
 616        roundIncrement = 0x200;
 617        break;
 618    case float_round_to_zero:
 619        roundIncrement = 0;
 620        break;
 621    case float_round_up:
 622        roundIncrement = zSign ? 0 : 0x3ff;
 623        break;
 624    case float_round_down:
 625        roundIncrement = zSign ? 0x3ff : 0;
 626        break;
 627    default:
 628        abort();
 629    }
 630    roundBits = zSig & 0x3FF;
 631    if ( 0x7FD <= (uint16_t) zExp ) {
 632        if (    ( 0x7FD < zExp )
 633             || (    ( zExp == 0x7FD )
 634                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 635           ) {
 636            float_raise(float_flag_overflow | float_flag_inexact, status);
 637            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 638        }
 639        if ( zExp < 0 ) {
 640            if (status->flush_to_zero) {
 641                float_raise(float_flag_output_denormal, status);
 642                return packFloat64(zSign, 0, 0);
 643            }
 644            isTiny =
 645                   (status->float_detect_tininess
 646                    == float_tininess_before_rounding)
 647                || ( zExp < -1 )
 648                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 649            shift64RightJamming( zSig, - zExp, &zSig );
 650            zExp = 0;
 651            roundBits = zSig & 0x3FF;
 652            if (isTiny && roundBits) {
 653                float_raise(float_flag_underflow, status);
 654            }
 655        }
 656    }
 657    if (roundBits) {
 658        status->float_exception_flags |= float_flag_inexact;
 659    }
 660    zSig = ( zSig + roundIncrement )>>10;
 661    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 662    if ( zSig == 0 ) zExp = 0;
 663    return packFloat64( zSign, zExp, zSig );
 664
 665}
 666
 667/*----------------------------------------------------------------------------
 668| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 669| and significand `zSig', and returns the proper double-precision floating-
 670| point value corresponding to the abstract input.  This routine is just like
 671| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 672| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 673| floating-point exponent.
 674*----------------------------------------------------------------------------*/
 675
 676static float64
 677 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
 678                              float_status *status)
 679{
 680    int8 shiftCount;
 681
 682    shiftCount = countLeadingZeros64( zSig ) - 1;
 683    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 684                               status);
 685
 686}
 687
 688/*----------------------------------------------------------------------------
 689| Returns the fraction bits of the extended double-precision floating-point
 690| value `a'.
 691*----------------------------------------------------------------------------*/
 692
 693static inline uint64_t extractFloatx80Frac( floatx80 a )
 694{
 695
 696    return a.low;
 697
 698}
 699
 700/*----------------------------------------------------------------------------
 701| Returns the exponent bits of the extended double-precision floating-point
 702| value `a'.
 703*----------------------------------------------------------------------------*/
 704
 705static inline int32 extractFloatx80Exp( floatx80 a )
 706{
 707
 708    return a.high & 0x7FFF;
 709
 710}
 711
 712/*----------------------------------------------------------------------------
 713| Returns the sign bit of the extended double-precision floating-point value
 714| `a'.
 715*----------------------------------------------------------------------------*/
 716
 717static inline flag extractFloatx80Sign( floatx80 a )
 718{
 719
 720    return a.high>>15;
 721
 722}
 723
 724/*----------------------------------------------------------------------------
 725| Normalizes the subnormal extended double-precision floating-point value
 726| represented by the denormalized significand `aSig'.  The normalized exponent
 727| and significand are stored at the locations pointed to by `zExpPtr' and
 728| `zSigPtr', respectively.
 729*----------------------------------------------------------------------------*/
 730
 731static void
 732 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 733{
 734    int8 shiftCount;
 735
 736    shiftCount = countLeadingZeros64( aSig );
 737    *zSigPtr = aSig<<shiftCount;
 738    *zExpPtr = 1 - shiftCount;
 739
 740}
 741
 742/*----------------------------------------------------------------------------
 743| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 744| extended double-precision floating-point value, returning the result.
 745*----------------------------------------------------------------------------*/
 746
 747static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 748{
 749    floatx80 z;
 750
 751    z.low = zSig;
 752    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 753    return z;
 754
 755}
 756
 757/*----------------------------------------------------------------------------
 758| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 759| and extended significand formed by the concatenation of `zSig0' and `zSig1',
 760| and returns the proper extended double-precision floating-point value
 761| corresponding to the abstract input.  Ordinarily, the abstract value is
 762| rounded and packed into the extended double-precision format, with the
 763| inexact exception raised if the abstract input cannot be represented
 764| exactly.  However, if the abstract value is too large, the overflow and
 765| inexact exceptions are raised and an infinity or maximal finite value is
 766| returned.  If the abstract value is too small, the input value is rounded to
 767| a subnormal number, and the underflow and inexact exceptions are raised if
 768| the abstract input cannot be represented exactly as a subnormal extended
 769| double-precision floating-point number.
 770|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 771| number of bits as single or double precision, respectively.  Otherwise, the
 772| result is rounded to the full precision of the extended double-precision
 773| format.
 774|     The input significand must be normalized or smaller.  If the input
 775| significand is not normalized, `zExp' must be 0; in that case, the result
 776| returned is a subnormal number, and it must not require rounding.  The
 777| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 778| Floating-Point Arithmetic.
 779*----------------------------------------------------------------------------*/
 780
 781static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
 782                                     int32 zExp, uint64_t zSig0, uint64_t zSig1,
 783                                     float_status *status)
 784{
 785    int8 roundingMode;
 786    flag roundNearestEven, increment, isTiny;
 787    int64 roundIncrement, roundMask, roundBits;
 788
 789    roundingMode = status->float_rounding_mode;
 790    roundNearestEven = ( roundingMode == float_round_nearest_even );
 791    if ( roundingPrecision == 80 ) goto precision80;
 792    if ( roundingPrecision == 64 ) {
 793        roundIncrement = LIT64( 0x0000000000000400 );
 794        roundMask = LIT64( 0x00000000000007FF );
 795    }
 796    else if ( roundingPrecision == 32 ) {
 797        roundIncrement = LIT64( 0x0000008000000000 );
 798        roundMask = LIT64( 0x000000FFFFFFFFFF );
 799    }
 800    else {
 801        goto precision80;
 802    }
 803    zSig0 |= ( zSig1 != 0 );
 804    switch (roundingMode) {
 805    case float_round_nearest_even:
 806    case float_round_ties_away:
 807        break;
 808    case float_round_to_zero:
 809        roundIncrement = 0;
 810        break;
 811    case float_round_up:
 812        roundIncrement = zSign ? 0 : roundMask;
 813        break;
 814    case float_round_down:
 815        roundIncrement = zSign ? roundMask : 0;
 816        break;
 817    default:
 818        abort();
 819    }
 820    roundBits = zSig0 & roundMask;
 821    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 822        if (    ( 0x7FFE < zExp )
 823             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 824           ) {
 825            goto overflow;
 826        }
 827        if ( zExp <= 0 ) {
 828            if (status->flush_to_zero) {
 829                float_raise(float_flag_output_denormal, status);
 830                return packFloatx80(zSign, 0, 0);
 831            }
 832            isTiny =
 833                   (status->float_detect_tininess
 834                    == float_tininess_before_rounding)
 835                || ( zExp < 0 )
 836                || ( zSig0 <= zSig0 + roundIncrement );
 837            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 838            zExp = 0;
 839            roundBits = zSig0 & roundMask;
 840            if (isTiny && roundBits) {
 841                float_raise(float_flag_underflow, status);
 842            }
 843            if (roundBits) {
 844                status->float_exception_flags |= float_flag_inexact;
 845            }
 846            zSig0 += roundIncrement;
 847            if ( (int64_t) zSig0 < 0 ) zExp = 1;
 848            roundIncrement = roundMask + 1;
 849            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 850                roundMask |= roundIncrement;
 851            }
 852            zSig0 &= ~ roundMask;
 853            return packFloatx80( zSign, zExp, zSig0 );
 854        }
 855    }
 856    if (roundBits) {
 857        status->float_exception_flags |= float_flag_inexact;
 858    }
 859    zSig0 += roundIncrement;
 860    if ( zSig0 < roundIncrement ) {
 861        ++zExp;
 862        zSig0 = LIT64( 0x8000000000000000 );
 863    }
 864    roundIncrement = roundMask + 1;
 865    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 866        roundMask |= roundIncrement;
 867    }
 868    zSig0 &= ~ roundMask;
 869    if ( zSig0 == 0 ) zExp = 0;
 870    return packFloatx80( zSign, zExp, zSig0 );
 871 precision80:
 872    switch (roundingMode) {
 873    case float_round_nearest_even:
 874    case float_round_ties_away:
 875        increment = ((int64_t)zSig1 < 0);
 876        break;
 877    case float_round_to_zero:
 878        increment = 0;
 879        break;
 880    case float_round_up:
 881        increment = !zSign && zSig1;
 882        break;
 883    case float_round_down:
 884        increment = zSign && zSig1;
 885        break;
 886    default:
 887        abort();
 888    }
 889    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 890        if (    ( 0x7FFE < zExp )
 891             || (    ( zExp == 0x7FFE )
 892                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 893                  && increment
 894                )
 895           ) {
 896            roundMask = 0;
 897 overflow:
 898            float_raise(float_flag_overflow | float_flag_inexact, status);
 899            if (    ( roundingMode == float_round_to_zero )
 900                 || ( zSign && ( roundingMode == float_round_up ) )
 901                 || ( ! zSign && ( roundingMode == float_round_down ) )
 902               ) {
 903                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 904            }
 905            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 906        }
 907        if ( zExp <= 0 ) {
 908            isTiny =
 909                   (status->float_detect_tininess
 910                    == float_tininess_before_rounding)
 911                || ( zExp < 0 )
 912                || ! increment
 913                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 914            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 915            zExp = 0;
 916            if (isTiny && zSig1) {
 917                float_raise(float_flag_underflow, status);
 918            }
 919            if (zSig1) {
 920                status->float_exception_flags |= float_flag_inexact;
 921            }
 922            switch (roundingMode) {
 923            case float_round_nearest_even:
 924            case float_round_ties_away:
 925                increment = ((int64_t)zSig1 < 0);
 926                break;
 927            case float_round_to_zero:
 928                increment = 0;
 929                break;
 930            case float_round_up:
 931                increment = !zSign && zSig1;
 932                break;
 933            case float_round_down:
 934                increment = zSign && zSig1;
 935                break;
 936            default:
 937                abort();
 938            }
 939            if ( increment ) {
 940                ++zSig0;
 941                zSig0 &=
 942                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 943                if ( (int64_t) zSig0 < 0 ) zExp = 1;
 944            }
 945            return packFloatx80( zSign, zExp, zSig0 );
 946        }
 947    }
 948    if (zSig1) {
 949        status->float_exception_flags |= float_flag_inexact;
 950    }
 951    if ( increment ) {
 952        ++zSig0;
 953        if ( zSig0 == 0 ) {
 954            ++zExp;
 955            zSig0 = LIT64( 0x8000000000000000 );
 956        }
 957        else {
 958            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 959        }
 960    }
 961    else {
 962        if ( zSig0 == 0 ) zExp = 0;
 963    }
 964    return packFloatx80( zSign, zExp, zSig0 );
 965
 966}
 967
 968/*----------------------------------------------------------------------------
 969| Takes an abstract floating-point value having sign `zSign', exponent
 970| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 971| and returns the proper extended double-precision floating-point value
 972| corresponding to the abstract input.  This routine is just like
 973| `roundAndPackFloatx80' except that the input significand does not have to be
 974| normalized.
 975*----------------------------------------------------------------------------*/
 976
 977static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
 978                                              flag zSign, int32 zExp,
 979                                              uint64_t zSig0, uint64_t zSig1,
 980                                              float_status *status)
 981{
 982    int8 shiftCount;
 983
 984    if ( zSig0 == 0 ) {
 985        zSig0 = zSig1;
 986        zSig1 = 0;
 987        zExp -= 64;
 988    }
 989    shiftCount = countLeadingZeros64( zSig0 );
 990    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 991    zExp -= shiftCount;
 992    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
 993                                zSig0, zSig1, status);
 994
 995}
 996
 997/*----------------------------------------------------------------------------
 998| Returns the least-significant 64 fraction bits of the quadruple-precision
 999| floating-point value `a'.
1000*----------------------------------------------------------------------------*/
1001
1002static inline uint64_t extractFloat128Frac1( float128 a )
1003{
1004
1005    return a.low;
1006
1007}
1008
1009/*----------------------------------------------------------------------------
1010| Returns the most-significant 48 fraction bits of the quadruple-precision
1011| floating-point value `a'.
1012*----------------------------------------------------------------------------*/
1013
1014static inline uint64_t extractFloat128Frac0( float128 a )
1015{
1016
1017    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1018
1019}
1020
1021/*----------------------------------------------------------------------------
1022| Returns the exponent bits of the quadruple-precision floating-point value
1023| `a'.
1024*----------------------------------------------------------------------------*/
1025
1026static inline int32 extractFloat128Exp( float128 a )
1027{
1028
1029    return ( a.high>>48 ) & 0x7FFF;
1030
1031}
1032
1033/*----------------------------------------------------------------------------
1034| Returns the sign bit of the quadruple-precision floating-point value `a'.
1035*----------------------------------------------------------------------------*/
1036
1037static inline flag extractFloat128Sign( float128 a )
1038{
1039
1040    return a.high>>63;
1041
1042}
1043
1044/*----------------------------------------------------------------------------
1045| Normalizes the subnormal quadruple-precision floating-point value
1046| represented by the denormalized significand formed by the concatenation of
1047| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1048| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1049| significand are stored at the location pointed to by `zSig0Ptr', and the
1050| least significant 64 bits of the normalized significand are stored at the
1051| location pointed to by `zSig1Ptr'.
1052*----------------------------------------------------------------------------*/
1053
1054static void
1055 normalizeFloat128Subnormal(
1056     uint64_t aSig0,
1057     uint64_t aSig1,
1058     int32 *zExpPtr,
1059     uint64_t *zSig0Ptr,
1060     uint64_t *zSig1Ptr
1061 )
1062{
1063    int8 shiftCount;
1064
1065    if ( aSig0 == 0 ) {
1066        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1067        if ( shiftCount < 0 ) {
1068            *zSig0Ptr = aSig1>>( - shiftCount );
1069            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1070        }
1071        else {
1072            *zSig0Ptr = aSig1<<shiftCount;
1073            *zSig1Ptr = 0;
1074        }
1075        *zExpPtr = - shiftCount - 63;
1076    }
1077    else {
1078        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1079        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1080        *zExpPtr = 1 - shiftCount;
1081    }
1082
1083}
1084
1085/*----------------------------------------------------------------------------
1086| Packs the sign `zSign', the exponent `zExp', and the significand formed
1087| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1088| floating-point value, returning the result.  After being shifted into the
1089| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1090| added together to form the most significant 64 bits of the result.  This
1091| means that any integer portion of `zSig0' will be added into the exponent.
1092| Since a properly normalized significand will have an integer portion equal
1093| to 1, the `zExp' input should be 1 less than the desired result exponent
1094| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095| significand.
1096*----------------------------------------------------------------------------*/
1097
1098static inline float128
1099 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1100{
1101    float128 z;
1102
1103    z.low = zSig1;
1104    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1105    return z;
1106
1107}
1108
1109/*----------------------------------------------------------------------------
1110| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1111| and extended significand formed by the concatenation of `zSig0', `zSig1',
1112| and `zSig2', and returns the proper quadruple-precision floating-point value
1113| corresponding to the abstract input.  Ordinarily, the abstract value is
1114| simply rounded and packed into the quadruple-precision format, with the
1115| inexact exception raised if the abstract input cannot be represented
1116| exactly.  However, if the abstract value is too large, the overflow and
1117| inexact exceptions are raised and an infinity or maximal finite value is
1118| returned.  If the abstract value is too small, the input value is rounded to
1119| a subnormal number, and the underflow and inexact exceptions are raised if
1120| the abstract input cannot be represented exactly as a subnormal quadruple-
1121| precision floating-point number.
1122|     The input significand must be normalized or smaller.  If the input
1123| significand is not normalized, `zExp' must be 0; in that case, the result
1124| returned is a subnormal number, and it must not require rounding.  In the
1125| usual case that the input significand is normalized, `zExp' must be 1 less
1126| than the ``true'' floating-point exponent.  The handling of underflow and
1127| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1128*----------------------------------------------------------------------------*/
1129
1130static float128 roundAndPackFloat128(flag zSign, int32 zExp,
1131                                     uint64_t zSig0, uint64_t zSig1,
1132                                     uint64_t zSig2, float_status *status)
1133{
1134    int8 roundingMode;
1135    flag roundNearestEven, increment, isTiny;
1136
1137    roundingMode = status->float_rounding_mode;
1138    roundNearestEven = ( roundingMode == float_round_nearest_even );
1139    switch (roundingMode) {
1140    case float_round_nearest_even:
1141    case float_round_ties_away:
1142        increment = ((int64_t)zSig2 < 0);
1143        break;
1144    case float_round_to_zero:
1145        increment = 0;
1146        break;
1147    case float_round_up:
1148        increment = !zSign && zSig2;
1149        break;
1150    case float_round_down:
1151        increment = zSign && zSig2;
1152        break;
1153    default:
1154        abort();
1155    }
1156    if ( 0x7FFD <= (uint32_t) zExp ) {
1157        if (    ( 0x7FFD < zExp )
1158             || (    ( zExp == 0x7FFD )
1159                  && eq128(
1160                         LIT64( 0x0001FFFFFFFFFFFF ),
1161                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1162                         zSig0,
1163                         zSig1
1164                     )
1165                  && increment
1166                )
1167           ) {
1168            float_raise(float_flag_overflow | float_flag_inexact, status);
1169            if (    ( roundingMode == float_round_to_zero )
1170                 || ( zSign && ( roundingMode == float_round_up ) )
1171                 || ( ! zSign && ( roundingMode == float_round_down ) )
1172               ) {
1173                return
1174                    packFloat128(
1175                        zSign,
1176                        0x7FFE,
1177                        LIT64( 0x0000FFFFFFFFFFFF ),
1178                        LIT64( 0xFFFFFFFFFFFFFFFF )
1179                    );
1180            }
1181            return packFloat128( zSign, 0x7FFF, 0, 0 );
1182        }
1183        if ( zExp < 0 ) {
1184            if (status->flush_to_zero) {
1185                float_raise(float_flag_output_denormal, status);
1186                return packFloat128(zSign, 0, 0, 0);
1187            }
1188            isTiny =
1189                   (status->float_detect_tininess
1190                    == float_tininess_before_rounding)
1191                || ( zExp < -1 )
1192                || ! increment
1193                || lt128(
1194                       zSig0,
1195                       zSig1,
1196                       LIT64( 0x0001FFFFFFFFFFFF ),
1197                       LIT64( 0xFFFFFFFFFFFFFFFF )
1198                   );
1199            shift128ExtraRightJamming(
1200                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1201            zExp = 0;
1202            if (isTiny && zSig2) {
1203                float_raise(float_flag_underflow, status);
1204            }
1205            switch (roundingMode) {
1206            case float_round_nearest_even:
1207            case float_round_ties_away:
1208                increment = ((int64_t)zSig2 < 0);
1209                break;
1210            case float_round_to_zero:
1211                increment = 0;
1212                break;
1213            case float_round_up:
1214                increment = !zSign && zSig2;
1215                break;
1216            case float_round_down:
1217                increment = zSign && zSig2;
1218                break;
1219            default:
1220                abort();
1221            }
1222        }
1223    }
1224    if (zSig2) {
1225        status->float_exception_flags |= float_flag_inexact;
1226    }
1227    if ( increment ) {
1228        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1229        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1230    }
1231    else {
1232        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1233    }
1234    return packFloat128( zSign, zExp, zSig0, zSig1 );
1235
1236}
1237
1238/*----------------------------------------------------------------------------
1239| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1240| and significand formed by the concatenation of `zSig0' and `zSig1', and
1241| returns the proper quadruple-precision floating-point value corresponding
1242| to the abstract input.  This routine is just like `roundAndPackFloat128'
1243| except that the input significand has fewer bits and does not have to be
1244| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1245| point exponent.
1246*----------------------------------------------------------------------------*/
1247
1248static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1249                                              uint64_t zSig0, uint64_t zSig1,
1250                                              float_status *status)
1251{
1252    int8 shiftCount;
1253    uint64_t zSig2;
1254
1255    if ( zSig0 == 0 ) {
1256        zSig0 = zSig1;
1257        zSig1 = 0;
1258        zExp -= 64;
1259    }
1260    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1261    if ( 0 <= shiftCount ) {
1262        zSig2 = 0;
1263        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1264    }
1265    else {
1266        shift128ExtraRightJamming(
1267            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1268    }
1269    zExp -= shiftCount;
1270    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1271
1272}
1273
1274/*----------------------------------------------------------------------------
1275| Returns the result of converting the 32-bit two's complement integer `a'
1276| to the single-precision floating-point format.  The conversion is performed
1277| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1278*----------------------------------------------------------------------------*/
1279
1280float32 int32_to_float32(int32_t a, float_status *status)
1281{
1282    flag zSign;
1283
1284    if ( a == 0 ) return float32_zero;
1285    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286    zSign = ( a < 0 );
1287    return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1288}
1289
1290/*----------------------------------------------------------------------------
1291| Returns the result of converting the 32-bit two's complement integer `a'
1292| to the double-precision floating-point format.  The conversion is performed
1293| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294*----------------------------------------------------------------------------*/
1295
1296float64 int32_to_float64(int32_t a, float_status *status)
1297{
1298    flag zSign;
1299    uint32 absA;
1300    int8 shiftCount;
1301    uint64_t zSig;
1302
1303    if ( a == 0 ) return float64_zero;
1304    zSign = ( a < 0 );
1305    absA = zSign ? - a : a;
1306    shiftCount = countLeadingZeros32( absA ) + 21;
1307    zSig = absA;
1308    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1309
1310}
1311
1312/*----------------------------------------------------------------------------
1313| Returns the result of converting the 32-bit two's complement integer `a'
1314| to the extended double-precision floating-point format.  The conversion
1315| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316| Arithmetic.
1317*----------------------------------------------------------------------------*/
1318
1319floatx80 int32_to_floatx80(int32_t a, float_status *status)
1320{
1321    flag zSign;
1322    uint32 absA;
1323    int8 shiftCount;
1324    uint64_t zSig;
1325
1326    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1327    zSign = ( a < 0 );
1328    absA = zSign ? - a : a;
1329    shiftCount = countLeadingZeros32( absA ) + 32;
1330    zSig = absA;
1331    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1332
1333}
1334
1335/*----------------------------------------------------------------------------
1336| Returns the result of converting the 32-bit two's complement integer `a' to
1337| the quadruple-precision floating-point format.  The conversion is performed
1338| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339*----------------------------------------------------------------------------*/
1340
1341float128 int32_to_float128(int32_t a, float_status *status)
1342{
1343    flag zSign;
1344    uint32 absA;
1345    int8 shiftCount;
1346    uint64_t zSig0;
1347
1348    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349    zSign = ( a < 0 );
1350    absA = zSign ? - a : a;
1351    shiftCount = countLeadingZeros32( absA ) + 17;
1352    zSig0 = absA;
1353    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1354
1355}
1356
1357/*----------------------------------------------------------------------------
1358| Returns the result of converting the 64-bit two's complement integer `a'
1359| to the single-precision floating-point format.  The conversion is performed
1360| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1361*----------------------------------------------------------------------------*/
1362
1363float32 int64_to_float32(int64_t a, float_status *status)
1364{
1365    flag zSign;
1366    uint64 absA;
1367    int8 shiftCount;
1368
1369    if ( a == 0 ) return float32_zero;
1370    zSign = ( a < 0 );
1371    absA = zSign ? - a : a;
1372    shiftCount = countLeadingZeros64( absA ) - 40;
1373    if ( 0 <= shiftCount ) {
1374        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1375    }
1376    else {
1377        shiftCount += 7;
1378        if ( shiftCount < 0 ) {
1379            shift64RightJamming( absA, - shiftCount, &absA );
1380        }
1381        else {
1382            absA <<= shiftCount;
1383        }
1384        return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1385    }
1386
1387}
1388
1389/*----------------------------------------------------------------------------
1390| Returns the result of converting the 64-bit two's complement integer `a'
1391| to the double-precision floating-point format.  The conversion is performed
1392| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1393*----------------------------------------------------------------------------*/
1394
1395float64 int64_to_float64(int64_t a, float_status *status)
1396{
1397    flag zSign;
1398
1399    if ( a == 0 ) return float64_zero;
1400    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1401        return packFloat64( 1, 0x43E, 0 );
1402    }
1403    zSign = ( a < 0 );
1404    return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1405}
1406
1407/*----------------------------------------------------------------------------
1408| Returns the result of converting the 64-bit two's complement integer `a'
1409| to the extended double-precision floating-point format.  The conversion
1410| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411| Arithmetic.
1412*----------------------------------------------------------------------------*/
1413
1414floatx80 int64_to_floatx80(int64_t a, float_status *status)
1415{
1416    flag zSign;
1417    uint64 absA;
1418    int8 shiftCount;
1419
1420    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1421    zSign = ( a < 0 );
1422    absA = zSign ? - a : a;
1423    shiftCount = countLeadingZeros64( absA );
1424    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1425
1426}
1427
1428/*----------------------------------------------------------------------------
1429| Returns the result of converting the 64-bit two's complement integer `a' to
1430| the quadruple-precision floating-point format.  The conversion is performed
1431| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432*----------------------------------------------------------------------------*/
1433
1434float128 int64_to_float128(int64_t a, float_status *status)
1435{
1436    flag zSign;
1437    uint64 absA;
1438    int8 shiftCount;
1439    int32 zExp;
1440    uint64_t zSig0, zSig1;
1441
1442    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1443    zSign = ( a < 0 );
1444    absA = zSign ? - a : a;
1445    shiftCount = countLeadingZeros64( absA ) + 49;
1446    zExp = 0x406E - shiftCount;
1447    if ( 64 <= shiftCount ) {
1448        zSig1 = 0;
1449        zSig0 = absA;
1450        shiftCount -= 64;
1451    }
1452    else {
1453        zSig1 = absA;
1454        zSig0 = 0;
1455    }
1456    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1457    return packFloat128( zSign, zExp, zSig0, zSig1 );
1458
1459}
1460
1461/*----------------------------------------------------------------------------
1462| Returns the result of converting the 64-bit unsigned integer `a'
1463| to the single-precision floating-point format.  The conversion is performed
1464| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1465*----------------------------------------------------------------------------*/
1466
1467float32 uint64_to_float32(uint64_t a, float_status *status)
1468{
1469    int shiftcount;
1470
1471    if (a == 0) {
1472        return float32_zero;
1473    }
1474
1475    /* Determine (left) shift needed to put first set bit into bit posn 23
1476     * (since packFloat32() expects the binary point between bits 23 and 22);
1477     * this is the fast case for smallish numbers.
1478     */
1479    shiftcount = countLeadingZeros64(a) - 40;
1480    if (shiftcount >= 0) {
1481        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1482    }
1483    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1484     * expects the binary point between bits 30 and 29, hence the + 7.
1485     */
1486    shiftcount += 7;
1487    if (shiftcount < 0) {
1488        shift64RightJamming(a, -shiftcount, &a);
1489    } else {
1490        a <<= shiftcount;
1491    }
1492
1493    return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1494}
1495
1496/*----------------------------------------------------------------------------
1497| Returns the result of converting the 64-bit unsigned integer `a'
1498| to the double-precision floating-point format.  The conversion is performed
1499| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1500*----------------------------------------------------------------------------*/
1501
1502float64 uint64_to_float64(uint64_t a, float_status *status)
1503{
1504    int exp = 0x43C;
1505    int shiftcount;
1506
1507    if (a == 0) {
1508        return float64_zero;
1509    }
1510
1511    shiftcount = countLeadingZeros64(a) - 1;
1512    if (shiftcount < 0) {
1513        shift64RightJamming(a, -shiftcount, &a);
1514    } else {
1515        a <<= shiftcount;
1516    }
1517    return roundAndPackFloat64(0, exp - shiftcount, a, status);
1518}
1519
1520/*----------------------------------------------------------------------------
1521| Returns the result of converting the 64-bit unsigned integer `a'
1522| to the quadruple-precision floating-point format.  The conversion is performed
1523| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524*----------------------------------------------------------------------------*/
1525
1526float128 uint64_to_float128(uint64_t a, float_status *status)
1527{
1528    if (a == 0) {
1529        return float128_zero;
1530    }
1531    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1532}
1533
1534/*----------------------------------------------------------------------------
1535| Returns the result of converting the single-precision floating-point value
1536| `a' to the 32-bit two's complement integer format.  The conversion is
1537| performed according to the IEC/IEEE Standard for Binary Floating-Point
1538| Arithmetic---which means in particular that the conversion is rounded
1539| according to the current rounding mode.  If `a' is a NaN, the largest
1540| positive integer is returned.  Otherwise, if the conversion overflows, the
1541| largest integer with the same sign as `a' is returned.
1542*----------------------------------------------------------------------------*/
1543
1544int32 float32_to_int32(float32 a, float_status *status)
1545{
1546    flag aSign;
1547    int_fast16_t aExp, shiftCount;
1548    uint32_t aSig;
1549    uint64_t aSig64;
1550
1551    a = float32_squash_input_denormal(a, status);
1552    aSig = extractFloat32Frac( a );
1553    aExp = extractFloat32Exp( a );
1554    aSign = extractFloat32Sign( a );
1555    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556    if ( aExp ) aSig |= 0x00800000;
1557    shiftCount = 0xAF - aExp;
1558    aSig64 = aSig;
1559    aSig64 <<= 32;
1560    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561    return roundAndPackInt32(aSign, aSig64, status);
1562
1563}
1564
1565/*----------------------------------------------------------------------------
1566| Returns the result of converting the single-precision floating-point value
1567| `a' to the 32-bit two's complement integer format.  The conversion is
1568| performed according to the IEC/IEEE Standard for Binary Floating-Point
1569| Arithmetic, except that the conversion is always rounded toward zero.
1570| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571| the conversion overflows, the largest integer with the same sign as `a' is
1572| returned.
1573*----------------------------------------------------------------------------*/
1574
1575int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
1576{
1577    flag aSign;
1578    int_fast16_t aExp, shiftCount;
1579    uint32_t aSig;
1580    int32_t z;
1581    a = float32_squash_input_denormal(a, status);
1582
1583    aSig = extractFloat32Frac( a );
1584    aExp = extractFloat32Exp( a );
1585    aSign = extractFloat32Sign( a );
1586    shiftCount = aExp - 0x9E;
1587    if ( 0 <= shiftCount ) {
1588        if ( float32_val(a) != 0xCF000000 ) {
1589            float_raise(float_flag_invalid, status);
1590            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1591        }
1592        return (int32_t) 0x80000000;
1593    }
1594    else if ( aExp <= 0x7E ) {
1595        if (aExp | aSig) {
1596            status->float_exception_flags |= float_flag_inexact;
1597        }
1598        return 0;
1599    }
1600    aSig = ( aSig | 0x00800000 )<<8;
1601    z = aSig>>( - shiftCount );
1602    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1603        status->float_exception_flags |= float_flag_inexact;
1604    }
1605    if ( aSign ) z = - z;
1606    return z;
1607
1608}
1609
1610/*----------------------------------------------------------------------------
1611| Returns the result of converting the single-precision floating-point value
1612| `a' to the 16-bit two's complement integer format.  The conversion is
1613| performed according to the IEC/IEEE Standard for Binary Floating-Point
1614| Arithmetic, except that the conversion is always rounded toward zero.
1615| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1616| the conversion overflows, the largest integer with the same sign as `a' is
1617| returned.
1618*----------------------------------------------------------------------------*/
1619
1620int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1621{
1622    flag aSign;
1623    int_fast16_t aExp, shiftCount;
1624    uint32_t aSig;
1625    int32 z;
1626
1627    aSig = extractFloat32Frac( a );
1628    aExp = extractFloat32Exp( a );
1629    aSign = extractFloat32Sign( a );
1630    shiftCount = aExp - 0x8E;
1631    if ( 0 <= shiftCount ) {
1632        if ( float32_val(a) != 0xC7000000 ) {
1633            float_raise(float_flag_invalid, status);
1634            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1635                return 0x7FFF;
1636            }
1637        }
1638        return (int32_t) 0xffff8000;
1639    }
1640    else if ( aExp <= 0x7E ) {
1641        if ( aExp | aSig ) {
1642            status->float_exception_flags |= float_flag_inexact;
1643        }
1644        return 0;
1645    }
1646    shiftCount -= 0x10;
1647    aSig = ( aSig | 0x00800000 )<<8;
1648    z = aSig>>( - shiftCount );
1649    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1650        status->float_exception_flags |= float_flag_inexact;
1651    }
1652    if ( aSign ) {
1653        z = - z;
1654    }
1655    return z;
1656
1657}
1658
1659/*----------------------------------------------------------------------------
1660| Returns the result of converting the single-precision floating-point value
1661| `a' to the 64-bit two's complement integer format.  The conversion is
1662| performed according to the IEC/IEEE Standard for Binary Floating-Point
1663| Arithmetic---which means in particular that the conversion is rounded
1664| according to the current rounding mode.  If `a' is a NaN, the largest
1665| positive integer is returned.  Otherwise, if the conversion overflows, the
1666| largest integer with the same sign as `a' is returned.
1667*----------------------------------------------------------------------------*/
1668
1669int64 float32_to_int64(float32 a, float_status *status)
1670{
1671    flag aSign;
1672    int_fast16_t aExp, shiftCount;
1673    uint32_t aSig;
1674    uint64_t aSig64, aSigExtra;
1675    a = float32_squash_input_denormal(a, status);
1676
1677    aSig = extractFloat32Frac( a );
1678    aExp = extractFloat32Exp( a );
1679    aSign = extractFloat32Sign( a );
1680    shiftCount = 0xBE - aExp;
1681    if ( shiftCount < 0 ) {
1682        float_raise(float_flag_invalid, status);
1683        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1684            return LIT64( 0x7FFFFFFFFFFFFFFF );
1685        }
1686        return (int64_t) LIT64( 0x8000000000000000 );
1687    }
1688    if ( aExp ) aSig |= 0x00800000;
1689    aSig64 = aSig;
1690    aSig64 <<= 40;
1691    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1692    return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1693
1694}
1695
1696/*----------------------------------------------------------------------------
1697| Returns the result of converting the single-precision floating-point value
1698| `a' to the 64-bit unsigned integer format.  The conversion is
1699| performed according to the IEC/IEEE Standard for Binary Floating-Point
1700| Arithmetic---which means in particular that the conversion is rounded
1701| according to the current rounding mode.  If `a' is a NaN, the largest
1702| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1703| largest unsigned integer is returned.  If the 'a' is negative, the result
1704| is rounded and zero is returned; values that do not round to zero will
1705| raise the inexact exception flag.
1706*----------------------------------------------------------------------------*/
1707
1708uint64 float32_to_uint64(float32 a, float_status *status)
1709{
1710    flag aSign;
1711    int_fast16_t aExp, shiftCount;
1712    uint32_t aSig;
1713    uint64_t aSig64, aSigExtra;
1714    a = float32_squash_input_denormal(a, status);
1715
1716    aSig = extractFloat32Frac(a);
1717    aExp = extractFloat32Exp(a);
1718    aSign = extractFloat32Sign(a);
1719    if ((aSign) && (aExp > 126)) {
1720        float_raise(float_flag_invalid, status);
1721        if (float32_is_any_nan(a)) {
1722            return LIT64(0xFFFFFFFFFFFFFFFF);
1723        } else {
1724            return 0;
1725        }
1726    }
1727    shiftCount = 0xBE - aExp;
1728    if (aExp) {
1729        aSig |= 0x00800000;
1730    }
1731    if (shiftCount < 0) {
1732        float_raise(float_flag_invalid, status);
1733        return LIT64(0xFFFFFFFFFFFFFFFF);
1734    }
1735
1736    aSig64 = aSig;
1737    aSig64 <<= 40;
1738    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1739    return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1740}
1741
1742/*----------------------------------------------------------------------------
1743| Returns the result of converting the single-precision floating-point value
1744| `a' to the 64-bit unsigned integer format.  The conversion is
1745| performed according to the IEC/IEEE Standard for Binary Floating-Point
1746| Arithmetic, except that the conversion is always rounded toward zero.  If
1747| `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1748| conversion overflows, the largest unsigned integer is returned.  If the
1749| 'a' is negative, the result is rounded and zero is returned; values that do
1750| not round to zero will raise the inexact flag.
1751*----------------------------------------------------------------------------*/
1752
1753uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
1754{
1755    signed char current_rounding_mode = status->float_rounding_mode;
1756    set_float_rounding_mode(float_round_to_zero, status);
1757    int64_t v = float32_to_uint64(a, status);
1758    set_float_rounding_mode(current_rounding_mode, status);
1759    return v;
1760}
1761
1762/*----------------------------------------------------------------------------
1763| Returns the result of converting the single-precision floating-point value
1764| `a' to the 64-bit two's complement integer format.  The conversion is
1765| performed according to the IEC/IEEE Standard for Binary Floating-Point
1766| Arithmetic, except that the conversion is always rounded toward zero.  If
1767| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1768| conversion overflows, the largest integer with the same sign as `a' is
1769| returned.
1770*----------------------------------------------------------------------------*/
1771
1772int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
1773{
1774    flag aSign;
1775    int_fast16_t aExp, shiftCount;
1776    uint32_t aSig;
1777    uint64_t aSig64;
1778    int64 z;
1779    a = float32_squash_input_denormal(a, status);
1780
1781    aSig = extractFloat32Frac( a );
1782    aExp = extractFloat32Exp( a );
1783    aSign = extractFloat32Sign( a );
1784    shiftCount = aExp - 0xBE;
1785    if ( 0 <= shiftCount ) {
1786        if ( float32_val(a) != 0xDF000000 ) {
1787            float_raise(float_flag_invalid, status);
1788            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1789                return LIT64( 0x7FFFFFFFFFFFFFFF );
1790            }
1791        }
1792        return (int64_t) LIT64( 0x8000000000000000 );
1793    }
1794    else if ( aExp <= 0x7E ) {
1795        if (aExp | aSig) {
1796            status->float_exception_flags |= float_flag_inexact;
1797        }
1798        return 0;
1799    }
1800    aSig64 = aSig | 0x00800000;
1801    aSig64 <<= 40;
1802    z = aSig64>>( - shiftCount );
1803    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1804        status->float_exception_flags |= float_flag_inexact;
1805    }
1806    if ( aSign ) z = - z;
1807    return z;
1808
1809}
1810
1811/*----------------------------------------------------------------------------
1812| Returns the result of converting the single-precision floating-point value
1813| `a' to the double-precision floating-point format.  The conversion is
1814| performed according to the IEC/IEEE Standard for Binary Floating-Point
1815| Arithmetic.
1816*----------------------------------------------------------------------------*/
1817
1818float64 float32_to_float64(float32 a, float_status *status)
1819{
1820    flag aSign;
1821    int_fast16_t aExp;
1822    uint32_t aSig;
1823    a = float32_squash_input_denormal(a, status);
1824
1825    aSig = extractFloat32Frac( a );
1826    aExp = extractFloat32Exp( a );
1827    aSign = extractFloat32Sign( a );
1828    if ( aExp == 0xFF ) {
1829        if (aSig) {
1830            return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1831        }
1832        return packFloat64( aSign, 0x7FF, 0 );
1833    }
1834    if ( aExp == 0 ) {
1835        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1836        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1837        --aExp;
1838    }
1839    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1840
1841}
1842
1843/*----------------------------------------------------------------------------
1844| Returns the result of converting the single-precision floating-point value
1845| `a' to the extended double-precision floating-point format.  The conversion
1846| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1847| Arithmetic.
1848*----------------------------------------------------------------------------*/
1849
1850floatx80 float32_to_floatx80(float32 a, float_status *status)
1851{
1852    flag aSign;
1853    int_fast16_t aExp;
1854    uint32_t aSig;
1855
1856    a = float32_squash_input_denormal(a, status);
1857    aSig = extractFloat32Frac( a );
1858    aExp = extractFloat32Exp( a );
1859    aSign = extractFloat32Sign( a );
1860    if ( aExp == 0xFF ) {
1861        if (aSig) {
1862            return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1863        }
1864        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1865    }
1866    if ( aExp == 0 ) {
1867        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1868        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1869    }
1870    aSig |= 0x00800000;
1871    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1872
1873}
1874
1875/*----------------------------------------------------------------------------
1876| Returns the result of converting the single-precision floating-point value
1877| `a' to the double-precision floating-point format.  The conversion is
1878| performed according to the IEC/IEEE Standard for Binary Floating-Point
1879| Arithmetic.
1880*----------------------------------------------------------------------------*/
1881
1882float128 float32_to_float128(float32 a, float_status *status)
1883{
1884    flag aSign;
1885    int_fast16_t aExp;
1886    uint32_t aSig;
1887
1888    a = float32_squash_input_denormal(a, status);
1889    aSig = extractFloat32Frac( a );
1890    aExp = extractFloat32Exp( a );
1891    aSign = extractFloat32Sign( a );
1892    if ( aExp == 0xFF ) {
1893        if (aSig) {
1894            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1895        }
1896        return packFloat128( aSign, 0x7FFF, 0, 0 );
1897    }
1898    if ( aExp == 0 ) {
1899        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1900        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1901        --aExp;
1902    }
1903    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1904
1905}
1906
1907/*----------------------------------------------------------------------------
1908| Rounds the single-precision floating-point value `a' to an integer, and
1909| returns the result as a single-precision floating-point value.  The
1910| operation is performed according to the IEC/IEEE Standard for Binary
1911| Floating-Point Arithmetic.
1912*----------------------------------------------------------------------------*/
1913
1914float32 float32_round_to_int(float32 a, float_status *status)
1915{
1916    flag aSign;
1917    int_fast16_t aExp;
1918    uint32_t lastBitMask, roundBitsMask;
1919    uint32_t z;
1920    a = float32_squash_input_denormal(a, status);
1921
1922    aExp = extractFloat32Exp( a );
1923    if ( 0x96 <= aExp ) {
1924        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1925            return propagateFloat32NaN(a, a, status);
1926        }
1927        return a;
1928    }
1929    if ( aExp <= 0x7E ) {
1930        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1931        status->float_exception_flags |= float_flag_inexact;
1932        aSign = extractFloat32Sign( a );
1933        switch (status->float_rounding_mode) {
1934         case float_round_nearest_even:
1935            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1936                return packFloat32( aSign, 0x7F, 0 );
1937            }
1938            break;
1939        case float_round_ties_away:
1940            if (aExp == 0x7E) {
1941                return packFloat32(aSign, 0x7F, 0);
1942            }
1943            break;
1944         case float_round_down:
1945            return make_float32(aSign ? 0xBF800000 : 0);
1946         case float_round_up:
1947            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1948        }
1949        return packFloat32( aSign, 0, 0 );
1950    }
1951    lastBitMask = 1;
1952    lastBitMask <<= 0x96 - aExp;
1953    roundBitsMask = lastBitMask - 1;
1954    z = float32_val(a);
1955    switch (status->float_rounding_mode) {
1956    case float_round_nearest_even:
1957        z += lastBitMask>>1;
1958        if ((z & roundBitsMask) == 0) {
1959            z &= ~lastBitMask;
1960        }
1961        break;
1962    case float_round_ties_away:
1963        z += lastBitMask >> 1;
1964        break;
1965    case float_round_to_zero:
1966        break;
1967    case float_round_up:
1968        if (!extractFloat32Sign(make_float32(z))) {
1969            z += roundBitsMask;
1970        }
1971        break;
1972    case float_round_down:
1973        if (extractFloat32Sign(make_float32(z))) {
1974            z += roundBitsMask;
1975        }
1976        break;
1977    default:
1978        abort();
1979    }
1980    z &= ~ roundBitsMask;
1981    if (z != float32_val(a)) {
1982        status->float_exception_flags |= float_flag_inexact;
1983    }
1984    return make_float32(z);
1985
1986}
1987
1988/*----------------------------------------------------------------------------
1989| Returns the result of adding the absolute values of the single-precision
1990| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1991| before being returned.  `zSign' is ignored if the result is a NaN.
1992| The addition is performed according to the IEC/IEEE Standard for Binary
1993| Floating-Point Arithmetic.
1994*----------------------------------------------------------------------------*/
1995
1996static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1997                              float_status *status)
1998{
1999    int_fast16_t aExp, bExp, zExp;
2000    uint32_t aSig, bSig, zSig;
2001    int_fast16_t expDiff;
2002
2003    aSig = extractFloat32Frac( a );
2004    aExp = extractFloat32Exp( a );
2005    bSig = extractFloat32Frac( b );
2006    bExp = extractFloat32Exp( b );
2007    expDiff = aExp - bExp;
2008    aSig <<= 6;
2009    bSig <<= 6;
2010    if ( 0 < expDiff ) {
2011        if ( aExp == 0xFF ) {
2012            if (aSig) {
2013                return propagateFloat32NaN(a, b, status);
2014            }
2015            return a;
2016        }
2017        if ( bExp == 0 ) {
2018            --expDiff;
2019        }
2020        else {
2021            bSig |= 0x20000000;
2022        }
2023        shift32RightJamming( bSig, expDiff, &bSig );
2024        zExp = aExp;
2025    }
2026    else if ( expDiff < 0 ) {
2027        if ( bExp == 0xFF ) {
2028            if (bSig) {
2029                return propagateFloat32NaN(a, b, status);
2030            }
2031            return packFloat32( zSign, 0xFF, 0 );
2032        }
2033        if ( aExp == 0 ) {
2034            ++expDiff;
2035        }
2036        else {
2037            aSig |= 0x20000000;
2038        }
2039        shift32RightJamming( aSig, - expDiff, &aSig );
2040        zExp = bExp;
2041    }
2042    else {
2043        if ( aExp == 0xFF ) {
2044            if (aSig | bSig) {
2045                return propagateFloat32NaN(a, b, status);
2046            }
2047            return a;
2048        }
2049        if ( aExp == 0 ) {
2050            if (status->flush_to_zero) {
2051                if (aSig | bSig) {
2052                    float_raise(float_flag_output_denormal, status);
2053                }
2054                return packFloat32(zSign, 0, 0);
2055            }
2056            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2057        }
2058        zSig = 0x40000000 + aSig + bSig;
2059        zExp = aExp;
2060        goto roundAndPack;
2061    }
2062    aSig |= 0x20000000;
2063    zSig = ( aSig + bSig )<<1;
2064    --zExp;
2065    if ( (int32_t) zSig < 0 ) {
2066        zSig = aSig + bSig;
2067        ++zExp;
2068    }
2069 roundAndPack:
2070    return roundAndPackFloat32(zSign, zExp, zSig, status);
2071
2072}
2073
2074/*----------------------------------------------------------------------------
2075| Returns the result of subtracting the absolute values of the single-
2076| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2077| difference is negated before being returned.  `zSign' is ignored if the
2078| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2079| Standard for Binary Floating-Point Arithmetic.
2080*----------------------------------------------------------------------------*/
2081
2082static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2083                              float_status *status)
2084{
2085    int_fast16_t aExp, bExp, zExp;
2086    uint32_t aSig, bSig, zSig;
2087    int_fast16_t expDiff;
2088
2089    aSig = extractFloat32Frac( a );
2090    aExp = extractFloat32Exp( a );
2091    bSig = extractFloat32Frac( b );
2092    bExp = extractFloat32Exp( b );
2093    expDiff = aExp - bExp;
2094    aSig <<= 7;
2095    bSig <<= 7;
2096    if ( 0 < expDiff ) goto aExpBigger;
2097    if ( expDiff < 0 ) goto bExpBigger;
2098    if ( aExp == 0xFF ) {
2099        if (aSig | bSig) {
2100            return propagateFloat32NaN(a, b, status);
2101        }
2102        float_raise(float_flag_invalid, status);
2103        return float32_default_nan;
2104    }
2105    if ( aExp == 0 ) {
2106        aExp = 1;
2107        bExp = 1;
2108    }
2109    if ( bSig < aSig ) goto aBigger;
2110    if ( aSig < bSig ) goto bBigger;
2111    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2112 bExpBigger:
2113    if ( bExp == 0xFF ) {
2114        if (bSig) {
2115            return propagateFloat32NaN(a, b, status);
2116        }
2117        return packFloat32( zSign ^ 1, 0xFF, 0 );
2118    }
2119    if ( aExp == 0 ) {
2120        ++expDiff;
2121    }
2122    else {
2123        aSig |= 0x40000000;
2124    }
2125    shift32RightJamming( aSig, - expDiff, &aSig );
2126    bSig |= 0x40000000;
2127 bBigger:
2128    zSig = bSig - aSig;
2129    zExp = bExp;
2130    zSign ^= 1;
2131    goto normalizeRoundAndPack;
2132 aExpBigger:
2133    if ( aExp == 0xFF ) {
2134        if (aSig) {
2135            return propagateFloat32NaN(a, b, status);
2136        }
2137        return a;
2138    }
2139    if ( bExp == 0 ) {
2140        --expDiff;
2141    }
2142    else {
2143        bSig |= 0x40000000;
2144    }
2145    shift32RightJamming( bSig, expDiff, &bSig );
2146    aSig |= 0x40000000;
2147 aBigger:
2148    zSig = aSig - bSig;
2149    zExp = aExp;
2150 normalizeRoundAndPack:
2151    --zExp;
2152    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2153
2154}
2155
2156/*----------------------------------------------------------------------------
2157| Returns the result of adding the single-precision floating-point values `a'
2158| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2159| Binary Floating-Point Arithmetic.
2160*----------------------------------------------------------------------------*/
2161
2162float32 float32_add(float32 a, float32 b, float_status *status)
2163{
2164    flag aSign, bSign;
2165    a = float32_squash_input_denormal(a, status);
2166    b = float32_squash_input_denormal(b, status);
2167
2168    aSign = extractFloat32Sign( a );
2169    bSign = extractFloat32Sign( b );
2170    if ( aSign == bSign ) {
2171        return addFloat32Sigs(a, b, aSign, status);
2172    }
2173    else {
2174        return subFloat32Sigs(a, b, aSign, status);
2175    }
2176
2177}
2178
2179/*----------------------------------------------------------------------------
2180| Returns the result of subtracting the single-precision floating-point values
2181| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2182| for Binary Floating-Point Arithmetic.
2183*----------------------------------------------------------------------------*/
2184
2185float32 float32_sub(float32 a, float32 b, float_status *status)
2186{
2187    flag aSign, bSign;
2188    a = float32_squash_input_denormal(a, status);
2189    b = float32_squash_input_denormal(b, status);
2190
2191    aSign = extractFloat32Sign( a );
2192    bSign = extractFloat32Sign( b );
2193    if ( aSign == bSign ) {
2194        return subFloat32Sigs(a, b, aSign, status);
2195    }
2196    else {
2197        return addFloat32Sigs(a, b, aSign, status);
2198    }
2199
2200}
2201
2202/*----------------------------------------------------------------------------
2203| Returns the result of multiplying the single-precision floating-point values
2204| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2205| for Binary Floating-Point Arithmetic.
2206*----------------------------------------------------------------------------*/
2207
2208float32 float32_mul(float32 a, float32 b, float_status *status)
2209{
2210    flag aSign, bSign, zSign;
2211    int_fast16_t aExp, bExp, zExp;
2212    uint32_t aSig, bSig;
2213    uint64_t zSig64;
2214    uint32_t zSig;
2215
2216    a = float32_squash_input_denormal(a, status);
2217    b = float32_squash_input_denormal(b, status);
2218
2219    aSig = extractFloat32Frac( a );
2220    aExp = extractFloat32Exp( a );
2221    aSign = extractFloat32Sign( a );
2222    bSig = extractFloat32Frac( b );
2223    bExp = extractFloat32Exp( b );
2224    bSign = extractFloat32Sign( b );
2225    zSign = aSign ^ bSign;
2226    if ( aExp == 0xFF ) {
2227        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2228            return propagateFloat32NaN(a, b, status);
2229        }
2230        if ( ( bExp | bSig ) == 0 ) {
2231            float_raise(float_flag_invalid, status);
2232            return float32_default_nan;
2233        }
2234        return packFloat32( zSign, 0xFF, 0 );
2235    }
2236    if ( bExp == 0xFF ) {
2237        if (bSig) {
2238            return propagateFloat32NaN(a, b, status);
2239        }
2240        if ( ( aExp | aSig ) == 0 ) {
2241            float_raise(float_flag_invalid, status);
2242            return float32_default_nan;
2243        }
2244        return packFloat32( zSign, 0xFF, 0 );
2245    }
2246    if ( aExp == 0 ) {
2247        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2248        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2249    }
2250    if ( bExp == 0 ) {
2251        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2252        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2253    }
2254    zExp = aExp + bExp - 0x7F;
2255    aSig = ( aSig | 0x00800000 )<<7;
2256    bSig = ( bSig | 0x00800000 )<<8;
2257    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2258    zSig = zSig64;
2259    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2260        zSig <<= 1;
2261        --zExp;
2262    }
2263    return roundAndPackFloat32(zSign, zExp, zSig, status);
2264
2265}
2266
2267/*----------------------------------------------------------------------------
2268| Returns the result of dividing the single-precision floating-point value `a'
2269| by the corresponding value `b'.  The operation is performed according to the
2270| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2271*----------------------------------------------------------------------------*/
2272
2273float32 float32_div(float32 a, float32 b, float_status *status)
2274{
2275    flag aSign, bSign, zSign;
2276    int_fast16_t aExp, bExp, zExp;
2277    uint32_t aSig, bSig, zSig;
2278    a = float32_squash_input_denormal(a, status);
2279    b = float32_squash_input_denormal(b, status);
2280
2281    aSig = extractFloat32Frac( a );
2282    aExp = extractFloat32Exp( a );
2283    aSign = extractFloat32Sign( a );
2284    bSig = extractFloat32Frac( b );
2285    bExp = extractFloat32Exp( b );
2286    bSign = extractFloat32Sign( b );
2287    zSign = aSign ^ bSign;
2288    if ( aExp == 0xFF ) {
2289        if (aSig) {
2290            return propagateFloat32NaN(a, b, status);
2291        }
2292        if ( bExp == 0xFF ) {
2293            if (bSig) {
2294                return propagateFloat32NaN(a, b, status);
2295            }
2296            float_raise(float_flag_invalid, status);
2297            return float32_default_nan;
2298        }
2299        return packFloat32( zSign, 0xFF, 0 );
2300    }
2301    if ( bExp == 0xFF ) {
2302        if (bSig) {
2303            return propagateFloat32NaN(a, b, status);
2304        }
2305        return packFloat32( zSign, 0, 0 );
2306    }
2307    if ( bExp == 0 ) {
2308        if ( bSig == 0 ) {
2309            if ( ( aExp | aSig ) == 0 ) {
2310                float_raise(float_flag_invalid, status);
2311                return float32_default_nan;
2312            }
2313            float_raise(float_flag_divbyzero, status);
2314            return packFloat32( zSign, 0xFF, 0 );
2315        }
2316        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2317    }
2318    if ( aExp == 0 ) {
2319        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2320        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2321    }
2322    zExp = aExp - bExp + 0x7D;
2323    aSig = ( aSig | 0x00800000 )<<7;
2324    bSig = ( bSig | 0x00800000 )<<8;
2325    if ( bSig <= ( aSig + aSig ) ) {
2326        aSig >>= 1;
2327        ++zExp;
2328    }
2329    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2330    if ( ( zSig & 0x3F ) == 0 ) {
2331        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2332    }
2333    return roundAndPackFloat32(zSign, zExp, zSig, status);
2334
2335}
2336
2337/*----------------------------------------------------------------------------
2338| Returns the remainder of the single-precision floating-point value `a'
2339| with respect to the corresponding value `b'.  The operation is performed
2340| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2341*----------------------------------------------------------------------------*/
2342
2343float32 float32_rem(float32 a, float32 b, float_status *status)
2344{
2345    flag aSign, zSign;
2346    int_fast16_t aExp, bExp, expDiff;
2347    uint32_t aSig, bSig;
2348    uint32_t q;
2349    uint64_t aSig64, bSig64, q64;
2350    uint32_t alternateASig;
2351    int32_t sigMean;
2352    a = float32_squash_input_denormal(a, status);
2353    b = float32_squash_input_denormal(b, status);
2354
2355    aSig = extractFloat32Frac( a );
2356    aExp = extractFloat32Exp( a );
2357    aSign = extractFloat32Sign( a );
2358    bSig = extractFloat32Frac( b );
2359    bExp = extractFloat32Exp( b );
2360    if ( aExp == 0xFF ) {
2361        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2362            return propagateFloat32NaN(a, b, status);
2363        }
2364        float_raise(float_flag_invalid, status);
2365        return float32_default_nan;
2366    }
2367    if ( bExp == 0xFF ) {
2368        if (bSig) {
2369            return propagateFloat32NaN(a, b, status);
2370        }
2371        return a;
2372    }
2373    if ( bExp == 0 ) {
2374        if ( bSig == 0 ) {
2375            float_raise(float_flag_invalid, status);
2376            return float32_default_nan;
2377        }
2378        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2379    }
2380    if ( aExp == 0 ) {
2381        if ( aSig == 0 ) return a;
2382        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2383    }
2384    expDiff = aExp - bExp;
2385    aSig |= 0x00800000;
2386    bSig |= 0x00800000;
2387    if ( expDiff < 32 ) {
2388        aSig <<= 8;
2389        bSig <<= 8;
2390        if ( expDiff < 0 ) {
2391            if ( expDiff < -1 ) return a;
2392            aSig >>= 1;
2393        }
2394        q = ( bSig <= aSig );
2395        if ( q ) aSig -= bSig;
2396        if ( 0 < expDiff ) {
2397            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2398            q >>= 32 - expDiff;
2399            bSig >>= 2;
2400            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2401        }
2402        else {
2403            aSig >>= 2;
2404            bSig >>= 2;
2405        }
2406    }
2407    else {
2408        if ( bSig <= aSig ) aSig -= bSig;
2409        aSig64 = ( (uint64_t) aSig )<<40;
2410        bSig64 = ( (uint64_t) bSig )<<40;
2411        expDiff -= 64;
2412        while ( 0 < expDiff ) {
2413            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2414            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2415            aSig64 = - ( ( bSig * q64 )<<38 );
2416            expDiff -= 62;
2417        }
2418        expDiff += 64;
2419        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2420        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2421        q = q64>>( 64 - expDiff );
2422        bSig <<= 6;
2423        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2424    }
2425    do {
2426        alternateASig = aSig;
2427        ++q;
2428        aSig -= bSig;
2429    } while ( 0 <= (int32_t) aSig );
2430    sigMean = aSig + alternateASig;
2431    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2432        aSig = alternateASig;
2433    }
2434    zSign = ( (int32_t) aSig < 0 );
2435    if ( zSign ) aSig = - aSig;
2436    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2437}
2438
2439/*----------------------------------------------------------------------------
2440| Returns the result of multiplying the single-precision floating-point values
2441| `a' and `b' then adding 'c', with no intermediate rounding step after the
2442| multiplication.  The operation is performed according to the IEC/IEEE
2443| Standard for Binary Floating-Point Arithmetic 754-2008.
2444| The flags argument allows the caller to select negation of the
2445| addend, the intermediate product, or the final result. (The difference
2446| between this and having the caller do a separate negation is that negating
2447| externally will flip the sign bit on NaNs.)
2448*----------------------------------------------------------------------------*/
2449
2450float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2451                       float_status *status)
2452{
2453    flag aSign, bSign, cSign, zSign;
2454    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2455    uint32_t aSig, bSig, cSig;
2456    flag pInf, pZero, pSign;
2457    uint64_t pSig64, cSig64, zSig64;
2458    uint32_t pSig;
2459    int shiftcount;
2460    flag signflip, infzero;
2461
2462    a = float32_squash_input_denormal(a, status);
2463    b = float32_squash_input_denormal(b, status);
2464    c = float32_squash_input_denormal(c, status);
2465    aSig = extractFloat32Frac(a);
2466    aExp = extractFloat32Exp(a);
2467    aSign = extractFloat32Sign(a);
2468    bSig = extractFloat32Frac(b);
2469    bExp = extractFloat32Exp(b);
2470    bSign = extractFloat32Sign(b);
2471    cSig = extractFloat32Frac(c);
2472    cExp = extractFloat32Exp(c);
2473    cSign = extractFloat32Sign(c);
2474
2475    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2476               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2477
2478    /* It is implementation-defined whether the cases of (0,inf,qnan)
2479     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2480     * they return if they do), so we have to hand this information
2481     * off to the target-specific pick-a-NaN routine.
2482     */
2483    if (((aExp == 0xff) && aSig) ||
2484        ((bExp == 0xff) && bSig) ||
2485        ((cExp == 0xff) && cSig)) {
2486        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2487    }
2488
2489    if (infzero) {
2490        float_raise(float_flag_invalid, status);
2491        return float32_default_nan;
2492    }
2493
2494    if (flags & float_muladd_negate_c) {
2495        cSign ^= 1;
2496    }
2497
2498    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2499
2500    /* Work out the sign and type of the product */
2501    pSign = aSign ^ bSign;
2502    if (flags & float_muladd_negate_product) {
2503        pSign ^= 1;
2504    }
2505    pInf = (aExp == 0xff) || (bExp == 0xff);
2506    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2507
2508    if (cExp == 0xff) {
2509        if (pInf && (pSign ^ cSign)) {
2510            /* addition of opposite-signed infinities => InvalidOperation */
2511            float_raise(float_flag_invalid, status);
2512            return float32_default_nan;
2513        }
2514        /* Otherwise generate an infinity of the same sign */
2515        return packFloat32(cSign ^ signflip, 0xff, 0);
2516    }
2517
2518    if (pInf) {
2519        return packFloat32(pSign ^ signflip, 0xff, 0);
2520    }
2521
2522    if (pZero) {
2523        if (cExp == 0) {
2524            if (cSig == 0) {
2525                /* Adding two exact zeroes */
2526                if (pSign == cSign) {
2527                    zSign = pSign;
2528                } else if (status->float_rounding_mode == float_round_down) {
2529                    zSign = 1;
2530                } else {
2531                    zSign = 0;
2532                }
2533                return packFloat32(zSign ^ signflip, 0, 0);
2534            }
2535            /* Exact zero plus a denorm */
2536            if (status->flush_to_zero) {
2537                float_raise(float_flag_output_denormal, status);
2538                return packFloat32(cSign ^ signflip, 0, 0);
2539            }
2540        }
2541        /* Zero plus something non-zero : just return the something */
2542        if (flags & float_muladd_halve_result) {
2543            if (cExp == 0) {
2544                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2545            }
2546            /* Subtract one to halve, and one again because roundAndPackFloat32
2547             * wants one less than the true exponent.
2548             */
2549            cExp -= 2;
2550            cSig = (cSig | 0x00800000) << 7;
2551            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2552        }
2553        return packFloat32(cSign ^ signflip, cExp, cSig);
2554    }
2555
2556    if (aExp == 0) {
2557        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2558    }
2559    if (bExp == 0) {
2560        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2561    }
2562
2563    /* Calculate the actual result a * b + c */
2564
2565    /* Multiply first; this is easy. */
2566    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2567     * because we want the true exponent, not the "one-less-than"
2568     * flavour that roundAndPackFloat32() takes.
2569     */
2570    pExp = aExp + bExp - 0x7e;
2571    aSig = (aSig | 0x00800000) << 7;
2572    bSig = (bSig | 0x00800000) << 8;
2573    pSig64 = (uint64_t)aSig * bSig;
2574    if ((int64_t)(pSig64 << 1) >= 0) {
2575        pSig64 <<= 1;
2576        pExp--;
2577    }
2578
2579    zSign = pSign ^ signflip;
2580
2581    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2582     * position 62.
2583     */
2584    if (cExp == 0) {
2585        if (!cSig) {
2586            /* Throw out the special case of c being an exact zero now */
2587            shift64RightJamming(pSig64, 32, &pSig64);
2588            pSig = pSig64;
2589            if (flags & float_muladd_halve_result) {
2590                pExp--;
2591            }
2592            return roundAndPackFloat32(zSign, pExp - 1,
2593                                       pSig, status);
2594        }
2595        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2596    }
2597
2598    cSig64 = (uint64_t)cSig << (62 - 23);
2599    cSig64 |= LIT64(0x4000000000000000);
2600    expDiff = pExp - cExp;
2601
2602    if (pSign == cSign) {
2603        /* Addition */
2604        if (expDiff > 0) {
2605            /* scale c to match p */
2606            shift64RightJamming(cSig64, expDiff, &cSig64);
2607            zExp = pExp;
2608        } else if (expDiff < 0) {
2609            /* scale p to match c */
2610            shift64RightJamming(pSig64, -expDiff, &pSig64);
2611            zExp = cExp;
2612        } else {
2613            /* no scaling needed */
2614            zExp = cExp;
2615        }
2616        /* Add significands and make sure explicit bit ends up in posn 62 */
2617        zSig64 = pSig64 + cSig64;
2618        if ((int64_t)zSig64 < 0) {
2619            shift64RightJamming(zSig64, 1, &zSig64);
2620        } else {
2621            zExp--;
2622        }
2623    } else {
2624        /* Subtraction */
2625        if (expDiff > 0) {
2626            shift64RightJamming(cSig64, expDiff, &cSig64);
2627            zSig64 = pSig64 - cSig64;
2628            zExp = pExp;
2629        } else if (expDiff < 0) {
2630            shift64RightJamming(pSig64, -expDiff, &pSig64);
2631            zSig64 = cSig64 - pSig64;
2632            zExp = cExp;
2633            zSign ^= 1;
2634        } else {
2635            zExp = pExp;
2636            if (cSig64 < pSig64) {
2637                zSig64 = pSig64 - cSig64;
2638            } else if (pSig64 < cSig64) {
2639                zSig64 = cSig64 - pSig64;
2640                zSign ^= 1;
2641            } else {
2642                /* Exact zero */
2643                zSign = signflip;
2644                if (status->float_rounding_mode == float_round_down) {
2645                    zSign ^= 1;
2646                }
2647                return packFloat32(zSign, 0, 0);
2648            }
2649        }
2650        --zExp;
2651        /* Normalize to put the explicit bit back into bit 62. */
2652        shiftcount = countLeadingZeros64(zSig64) - 1;
2653        zSig64 <<= shiftcount;
2654        zExp -= shiftcount;
2655    }
2656    if (flags & float_muladd_halve_result) {
2657        zExp--;
2658    }
2659
2660    shift64RightJamming(zSig64, 32, &zSig64);
2661    return roundAndPackFloat32(zSign, zExp, zSig64, status);
2662}
2663
2664
2665/*----------------------------------------------------------------------------
2666| Returns the square root of the single-precision floating-point value `a'.
2667| The operation is performed according to the IEC/IEEE Standard for Binary
2668| Floating-Point Arithmetic.
2669*----------------------------------------------------------------------------*/
2670
2671float32 float32_sqrt(float32 a, float_status *status)
2672{
2673    flag aSign;
2674    int_fast16_t aExp, zExp;
2675    uint32_t aSig, zSig;
2676    uint64_t rem, term;
2677    a = float32_squash_input_denormal(a, status);
2678
2679    aSig = extractFloat32Frac( a );
2680    aExp = extractFloat32Exp( a );
2681    aSign = extractFloat32Sign( a );
2682    if ( aExp == 0xFF ) {
2683        if (aSig) {
2684            return propagateFloat32NaN(a, float32_zero, status);
2685        }
2686        if ( ! aSign ) return a;
2687        float_raise(float_flag_invalid, status);
2688        return float32_default_nan;
2689    }
2690    if ( aSign ) {
2691        if ( ( aExp | aSig ) == 0 ) return a;
2692        float_raise(float_flag_invalid, status);
2693        return float32_default_nan;
2694    }
2695    if ( aExp == 0 ) {
2696        if ( aSig == 0 ) return float32_zero;
2697        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2698    }
2699    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2700    aSig = ( aSig | 0x00800000 )<<8;
2701    zSig = estimateSqrt32( aExp, aSig ) + 2;
2702    if ( ( zSig & 0x7F ) <= 5 ) {
2703        if ( zSig < 2 ) {
2704            zSig = 0x7FFFFFFF;
2705            goto roundAndPack;
2706        }
2707        aSig >>= aExp & 1;
2708        term = ( (uint64_t) zSig ) * zSig;
2709        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2710        while ( (int64_t) rem < 0 ) {
2711            --zSig;
2712            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2713        }
2714        zSig |= ( rem != 0 );
2715    }
2716    shift32RightJamming( zSig, 1, &zSig );
2717 roundAndPack:
2718    return roundAndPackFloat32(0, zExp, zSig, status);
2719
2720}
2721
2722/*----------------------------------------------------------------------------
2723| Returns the binary exponential of the single-precision floating-point value
2724| `a'. The operation is performed according to the IEC/IEEE Standard for
2725| Binary Floating-Point Arithmetic.
2726|
2727| Uses the following identities:
2728|
2729| 1. -------------------------------------------------------------------------
2730|      x    x*ln(2)
2731|     2  = e
2732|
2733| 2. -------------------------------------------------------------------------
2734|                      2     3     4     5           n
2735|      x        x     x     x     x     x           x
2736|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2737|               1!    2!    3!    4!    5!          n!
2738*----------------------------------------------------------------------------*/
2739
2740static const float64 float32_exp2_coefficients[15] =
2741{
2742    const_float64( 0x3ff0000000000000ll ), /*  1 */
2743    const_float64( 0x3fe0000000000000ll ), /*  2 */
2744    const_float64( 0x3fc5555555555555ll ), /*  3 */
2745    const_float64( 0x3fa5555555555555ll ), /*  4 */
2746    const_float64( 0x3f81111111111111ll ), /*  5 */
2747    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2748    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2749    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2750    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2751    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2752    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2753    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2754    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2755    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2756    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2757};
2758
2759float32 float32_exp2(float32 a, float_status *status)
2760{
2761    flag aSign;
2762    int_fast16_t aExp;
2763    uint32_t aSig;
2764    float64 r, x, xn;
2765    int i;
2766    a = float32_squash_input_denormal(a, status);
2767
2768    aSig = extractFloat32Frac( a );
2769    aExp = extractFloat32Exp( a );
2770    aSign = extractFloat32Sign( a );
2771
2772    if ( aExp == 0xFF) {
2773        if (aSig) {
2774            return propagateFloat32NaN(a, float32_zero, status);
2775        }
2776        return (aSign) ? float32_zero : a;
2777    }
2778    if (aExp == 0) {
2779        if (aSig == 0) return float32_one;
2780    }
2781
2782    float_raise(float_flag_inexact, status);
2783
2784    /* ******************************* */
2785    /* using float64 for approximation */
2786    /* ******************************* */
2787    x = float32_to_float64(a, status);
2788    x = float64_mul(x, float64_ln2, status);
2789
2790    xn = x;
2791    r = float64_one;
2792    for (i = 0 ; i < 15 ; i++) {
2793        float64 f;
2794
2795        f = float64_mul(xn, float32_exp2_coefficients[i], status);
2796        r = float64_add(r, f, status);
2797
2798        xn = float64_mul(xn, x, status);
2799    }
2800
2801    return float64_to_float32(r, status);
2802}
2803
2804/*----------------------------------------------------------------------------
2805| Returns the binary log of the single-precision floating-point value `a'.
2806| The operation is performed according to the IEC/IEEE Standard for Binary
2807| Floating-Point Arithmetic.
2808*----------------------------------------------------------------------------*/
2809float32 float32_log2(float32 a, float_status *status)
2810{
2811    flag aSign, zSign;
2812    int_fast16_t aExp;
2813    uint32_t aSig, zSig, i;
2814
2815    a = float32_squash_input_denormal(a, status);
2816    aSig = extractFloat32Frac( a );
2817    aExp = extractFloat32Exp( a );
2818    aSign = extractFloat32Sign( a );
2819
2820    if ( aExp == 0 ) {
2821        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2822        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2823    }
2824    if ( aSign ) {
2825        float_raise(float_flag_invalid, status);
2826        return float32_default_nan;
2827    }
2828    if ( aExp == 0xFF ) {
2829        if (aSig) {
2830            return propagateFloat32NaN(a, float32_zero, status);
2831        }
2832        return a;
2833    }
2834
2835    aExp -= 0x7F;
2836    aSig |= 0x00800000;
2837    zSign = aExp < 0;
2838    zSig = aExp << 23;
2839
2840    for (i = 1 << 22; i > 0; i >>= 1) {
2841        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2842        if ( aSig & 0x01000000 ) {
2843            aSig >>= 1;
2844            zSig |= i;
2845        }
2846    }
2847
2848    if ( zSign )
2849        zSig = -zSig;
2850
2851    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2852}
2853
2854/*----------------------------------------------------------------------------
2855| Returns 1 if the single-precision floating-point value `a' is equal to
2856| the corresponding value `b', and 0 otherwise.  The invalid exception is
2857| raised if either operand is a NaN.  Otherwise, the comparison is performed
2858| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2859*----------------------------------------------------------------------------*/
2860
2861int float32_eq(float32 a, float32 b, float_status *status)
2862{
2863    uint32_t av, bv;
2864    a = float32_squash_input_denormal(a, status);
2865    b = float32_squash_input_denormal(b, status);
2866
2867    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2868         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2869       ) {
2870        float_raise(float_flag_invalid, status);
2871        return 0;
2872    }
2873    av = float32_val(a);
2874    bv = float32_val(b);
2875    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2876}
2877
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is less than
2880| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2881| exception is raised if either operand is a NaN.  The comparison is performed
2882| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883*----------------------------------------------------------------------------*/
2884
2885int float32_le(float32 a, float32 b, float_status *status)
2886{
2887    flag aSign, bSign;
2888    uint32_t av, bv;
2889    a = float32_squash_input_denormal(a, status);
2890    b = float32_squash_input_denormal(b, status);
2891
2892    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2894       ) {
2895        float_raise(float_flag_invalid, status);
2896        return 0;
2897    }
2898    aSign = extractFloat32Sign( a );
2899    bSign = extractFloat32Sign( b );
2900    av = float32_val(a);
2901    bv = float32_val(b);
2902    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2903    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2904
2905}
2906
2907/*----------------------------------------------------------------------------
2908| Returns 1 if the single-precision floating-point value `a' is less than
2909| the corresponding value `b', and 0 otherwise.  The invalid exception is
2910| raised if either operand is a NaN.  The comparison is performed according
2911| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2912*----------------------------------------------------------------------------*/
2913
2914int float32_lt(float32 a, float32 b, float_status *status)
2915{
2916    flag aSign, bSign;
2917    uint32_t av, bv;
2918    a = float32_squash_input_denormal(a, status);
2919    b = float32_squash_input_denormal(b, status);
2920
2921    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2922         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2923       ) {
2924        float_raise(float_flag_invalid, status);
2925        return 0;
2926    }
2927    aSign = extractFloat32Sign( a );
2928    bSign = extractFloat32Sign( b );
2929    av = float32_val(a);
2930    bv = float32_val(b);
2931    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2932    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2933
2934}
2935
2936/*----------------------------------------------------------------------------
2937| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2938| be compared, and 0 otherwise.  The invalid exception is raised if either
2939| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2940| Standard for Binary Floating-Point Arithmetic.
2941*----------------------------------------------------------------------------*/
2942
2943int float32_unordered(float32 a, float32 b, float_status *status)
2944{
2945    a = float32_squash_input_denormal(a, status);
2946    b = float32_squash_input_denormal(b, status);
2947
2948    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2949         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2950       ) {
2951        float_raise(float_flag_invalid, status);
2952        return 1;
2953    }
2954    return 0;
2955}
2956
2957/*----------------------------------------------------------------------------
2958| Returns 1 if the single-precision floating-point value `a' is equal to
2959| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2960| exception.  The comparison is performed according to the IEC/IEEE Standard
2961| for Binary Floating-Point Arithmetic.
2962*----------------------------------------------------------------------------*/
2963
2964int float32_eq_quiet(float32 a, float32 b, float_status *status)
2965{
2966    a = float32_squash_input_denormal(a, status);
2967    b = float32_squash_input_denormal(b, status);
2968
2969    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2970         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2971       ) {
2972        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2973            float_raise(float_flag_invalid, status);
2974        }
2975        return 0;
2976    }
2977    return ( float32_val(a) == float32_val(b) ) ||
2978            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2979}
2980
2981/*----------------------------------------------------------------------------
2982| Returns 1 if the single-precision floating-point value `a' is less than or
2983| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2984| cause an exception.  Otherwise, the comparison is performed according to the
2985| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2986*----------------------------------------------------------------------------*/
2987
2988int float32_le_quiet(float32 a, float32 b, float_status *status)
2989{
2990    flag aSign, bSign;
2991    uint32_t av, bv;
2992    a = float32_squash_input_denormal(a, status);
2993    b = float32_squash_input_denormal(b, status);
2994
2995    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2996         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2997       ) {
2998        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2999            float_raise(float_flag_invalid, status);
3000        }
3001        return 0;
3002    }
3003    aSign = extractFloat32Sign( a );
3004    bSign = extractFloat32Sign( b );
3005    av = float32_val(a);
3006    bv = float32_val(b);
3007    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3008    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3009
3010}
3011
3012/*----------------------------------------------------------------------------
3013| Returns 1 if the single-precision floating-point value `a' is less than
3014| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3015| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3016| Standard for Binary Floating-Point Arithmetic.
3017*----------------------------------------------------------------------------*/
3018
3019int float32_lt_quiet(float32 a, float32 b, float_status *status)
3020{
3021    flag aSign, bSign;
3022    uint32_t av, bv;
3023    a = float32_squash_input_denormal(a, status);
3024    b = float32_squash_input_denormal(b, status);
3025
3026    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3028       ) {
3029        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3030            float_raise(float_flag_invalid, status);
3031        }
3032        return 0;
3033    }
3034    aSign = extractFloat32Sign( a );
3035    bSign = extractFloat32Sign( b );
3036    av = float32_val(a);
3037    bv = float32_val(b);
3038    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3039    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3040
3041}
3042
3043/*----------------------------------------------------------------------------
3044| Returns 1 if the single-precision floating-point values `a' and `b' cannot
3045| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3046| comparison is performed according to the IEC/IEEE Standard for Binary
3047| Floating-Point Arithmetic.
3048*----------------------------------------------------------------------------*/
3049
3050int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3051{
3052    a = float32_squash_input_denormal(a, status);
3053    b = float32_squash_input_denormal(b, status);
3054
3055    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3056         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3057       ) {
3058        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3059            float_raise(float_flag_invalid, status);
3060        }
3061        return 1;
3062    }
3063    return 0;
3064}
3065
3066/*----------------------------------------------------------------------------
3067| Returns the result of converting the double-precision floating-point value
3068| `a' to the 32-bit two's complement integer format.  The conversion is
3069| performed according to the IEC/IEEE Standard for Binary Floating-Point
3070| Arithmetic---which means in particular that the conversion is rounded
3071| according to the current rounding mode.  If `a' is a NaN, the largest
3072| positive integer is returned.  Otherwise, if the conversion overflows, the
3073| largest integer with the same sign as `a' is returned.
3074*----------------------------------------------------------------------------*/
3075
3076int32 float64_to_int32(float64 a, float_status *status)
3077{
3078    flag aSign;
3079    int_fast16_t aExp, shiftCount;
3080    uint64_t aSig;
3081    a = float64_squash_input_denormal(a, status);
3082
3083    aSig = extractFloat64Frac( a );
3084    aExp = extractFloat64Exp( a );
3085    aSign = extractFloat64Sign( a );
3086    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3087    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088    shiftCount = 0x42C - aExp;
3089    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3090    return roundAndPackInt32(aSign, aSig, status);
3091
3092}
3093
3094/*----------------------------------------------------------------------------
3095| Returns the result of converting the double-precision floating-point value
3096| `a' to the 32-bit two's complement integer format.  The conversion is
3097| performed according to the IEC/IEEE Standard for Binary Floating-Point
3098| Arithmetic, except that the conversion is always rounded toward zero.
3099| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3100| the conversion overflows, the largest integer with the same sign as `a' is
3101| returned.
3102*----------------------------------------------------------------------------*/
3103
3104int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
3105{
3106    flag aSign;
3107    int_fast16_t aExp, shiftCount;
3108    uint64_t aSig, savedASig;
3109    int32_t z;
3110    a = float64_squash_input_denormal(a, status);
3111
3112    aSig = extractFloat64Frac( a );
3113    aExp = extractFloat64Exp( a );
3114    aSign = extractFloat64Sign( a );
3115    if ( 0x41E < aExp ) {
3116        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3117        goto invalid;
3118    }
3119    else if ( aExp < 0x3FF ) {
3120        if (aExp || aSig) {
3121            status->float_exception_flags |= float_flag_inexact;
3122        }
3123        return 0;
3124    }
3125    aSig |= LIT64( 0x0010000000000000 );
3126    shiftCount = 0x433 - aExp;
3127    savedASig = aSig;
3128    aSig >>= shiftCount;
3129    z = aSig;
3130    if ( aSign ) z = - z;
3131    if ( ( z < 0 ) ^ aSign ) {
3132 invalid:
3133        float_raise(float_flag_invalid, status);
3134        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3135    }
3136    if ( ( aSig<<shiftCount ) != savedASig ) {
3137        status->float_exception_flags |= float_flag_inexact;
3138    }
3139    return z;
3140
3141}
3142
3143/*----------------------------------------------------------------------------
3144| Returns the result of converting the double-precision floating-point value
3145| `a' to the 16-bit two's complement integer format.  The conversion is
3146| performed according to the IEC/IEEE Standard for Binary Floating-Point
3147| Arithmetic, except that the conversion is always rounded toward zero.
3148| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3149| the conversion overflows, the largest integer with the same sign as `a' is
3150| returned.
3151*----------------------------------------------------------------------------*/
3152
3153int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3154{
3155    flag aSign;
3156    int_fast16_t aExp, shiftCount;
3157    uint64_t aSig, savedASig;
3158    int32 z;
3159
3160    aSig = extractFloat64Frac( a );
3161    aExp = extractFloat64Exp( a );
3162    aSign = extractFloat64Sign( a );
3163    if ( 0x40E < aExp ) {
3164        if ( ( aExp == 0x7FF ) && aSig ) {
3165            aSign = 0;
3166        }
3167        goto invalid;
3168    }
3169    else if ( aExp < 0x3FF ) {
3170        if ( aExp || aSig ) {
3171            status->float_exception_flags |= float_flag_inexact;
3172        }
3173        return 0;
3174    }
3175    aSig |= LIT64( 0x0010000000000000 );
3176    shiftCount = 0x433 - aExp;
3177    savedASig = aSig;
3178    aSig >>= shiftCount;
3179    z = aSig;
3180    if ( aSign ) {
3181        z = - z;
3182    }
3183    if ( ( (int16_t)z < 0 ) ^ aSign ) {
3184 invalid:
3185        float_raise(float_flag_invalid, status);
3186        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3187    }
3188    if ( ( aSig<<shiftCount ) != savedASig ) {
3189        status->float_exception_flags |= float_flag_inexact;
3190    }
3191    return z;
3192}
3193
3194/*----------------------------------------------------------------------------
3195| Returns the result of converting the double-precision floating-point value
3196| `a' to the 64-bit two's complement integer format.  The conversion is
3197| performed according to the IEC/IEEE Standard for Binary Floating-Point
3198| Arithmetic---which means in particular that the conversion is rounded
3199| according to the current rounding mode.  If `a' is a NaN, the largest
3200| positive integer is returned.  Otherwise, if the conversion overflows, the
3201| largest integer with the same sign as `a' is returned.
3202*----------------------------------------------------------------------------*/
3203
3204int64 float64_to_int64(float64 a, float_status *status)
3205{
3206    flag aSign;
3207    int_fast16_t aExp, shiftCount;
3208    uint64_t aSig, aSigExtra;
3209    a = float64_squash_input_denormal(a, status);
3210
3211    aSig = extractFloat64Frac( a );
3212    aExp = extractFloat64Exp( a );
3213    aSign = extractFloat64Sign( a );
3214    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3215    shiftCount = 0x433 - aExp;
3216    if ( shiftCount <= 0 ) {
3217        if ( 0x43E < aExp ) {
3218            float_raise(float_flag_invalid, status);
3219            if (    ! aSign
3220                 || (    ( aExp == 0x7FF )
3221                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3222               ) {
3223                return LIT64( 0x7FFFFFFFFFFFFFFF );
3224            }
3225            return (int64_t) LIT64( 0x8000000000000000 );
3226        }
3227        aSigExtra = 0;
3228        aSig <<= - shiftCount;
3229    }
3230    else {
3231        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3232    }
3233    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3234
3235}
3236
3237/*----------------------------------------------------------------------------
3238| Returns the result of converting the double-precision floating-point value
3239| `a' to the 64-bit two's complement integer format.  The conversion is
3240| performed according to the IEC/IEEE Standard for Binary Floating-Point
3241| Arithmetic, except that the conversion is always rounded toward zero.
3242| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3243| the conversion overflows, the largest integer with the same sign as `a' is
3244| returned.
3245*----------------------------------------------------------------------------*/
3246
3247int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
3248{
3249    flag aSign;
3250    int_fast16_t aExp, shiftCount;
3251    uint64_t aSig;
3252    int64 z;
3253    a = float64_squash_input_denormal(a, status);
3254
3255    aSig = extractFloat64Frac( a );
3256    aExp = extractFloat64Exp( a );
3257    aSign = extractFloat64Sign( a );
3258    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3259    shiftCount = aExp - 0x433;
3260    if ( 0 <= shiftCount ) {
3261        if ( 0x43E <= aExp ) {
3262            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3263                float_raise(float_flag_invalid, status);
3264                if (    ! aSign
3265                     || (    ( aExp == 0x7FF )
3266                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3267                   ) {
3268                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3269                }
3270            }
3271            return (int64_t) LIT64( 0x8000000000000000 );
3272        }
3273        z = aSig<<shiftCount;
3274    }
3275    else {
3276        if ( aExp < 0x3FE ) {
3277            if (aExp | aSig) {
3278                status->float_exception_flags |= float_flag_inexact;
3279            }
3280            return 0;
3281        }
3282        z = aSig>>( - shiftCount );
3283        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3284            status->float_exception_flags |= float_flag_inexact;
3285        }
3286    }
3287    if ( aSign ) z = - z;
3288    return z;
3289
3290}
3291
3292/*----------------------------------------------------------------------------
3293| Returns the result of converting the double-precision floating-point value
3294| `a' to the single-precision floating-point format.  The conversion is
3295| performed according to the IEC/IEEE Standard for Binary Floating-Point
3296| Arithmetic.
3297*----------------------------------------------------------------------------*/
3298
3299float32 float64_to_float32(float64 a, float_status *status)
3300{
3301    flag aSign;
3302    int_fast16_t aExp;
3303    uint64_t aSig;
3304    uint32_t zSig;
3305    a = float64_squash_input_denormal(a, status);
3306
3307    aSig = extractFloat64Frac( a );
3308    aExp = extractFloat64Exp( a );
3309    aSign = extractFloat64Sign( a );
3310    if ( aExp == 0x7FF ) {
3311        if (aSig) {
3312            return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3313        }
3314        return packFloat32( aSign, 0xFF, 0 );
3315    }
3316    shift64RightJamming( aSig, 22, &aSig );
3317    zSig = aSig;
3318    if ( aExp || zSig ) {
3319        zSig |= 0x40000000;
3320        aExp -= 0x381;
3321    }
3322    return roundAndPackFloat32(aSign, aExp, zSig, status);
3323
3324}
3325
3326
3327/*----------------------------------------------------------------------------
3328| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3329| half-precision floating-point value, returning the result.  After being
3330| shifted into the proper positions, the three fields are simply added
3331| together to form the result.  This means that any integer portion of `zSig'
3332| will be added into the exponent.  Since a properly normalized significand
3333| will have an integer portion equal to 1, the `zExp' input should be 1 less
3334| than the desired result exponent whenever `zSig' is a complete, normalized
3335| significand.
3336*----------------------------------------------------------------------------*/
3337static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3338{
3339    return make_float16(
3340        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3341}
3342
3343/*----------------------------------------------------------------------------
3344| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3345| and significand `zSig', and returns the proper half-precision floating-
3346| point value corresponding to the abstract input.  Ordinarily, the abstract
3347| value is simply rounded and packed into the half-precision format, with
3348| the inexact exception raised if the abstract input cannot be represented
3349| exactly.  However, if the abstract value is too large, the overflow and
3350| inexact exceptions are raised and an infinity or maximal finite value is
3351| returned.  If the abstract value is too small, the input value is rounded to
3352| a subnormal number, and the underflow and inexact exceptions are raised if
3353| the abstract input cannot be represented exactly as a subnormal half-
3354| precision floating-point number.
3355| The `ieee' flag indicates whether to use IEEE standard half precision, or
3356| ARM-style "alternative representation", which omits the NaN and Inf
3357| encodings in order to raise the maximum representable exponent by one.
3358|     The input significand `zSig' has its binary point between bits 22
3359| and 23, which is 13 bits to the left of the usual location.  This shifted
3360| significand must be normalized or smaller.  If `zSig' is not normalized,
3361| `zExp' must be 0; in that case, the result returned is a subnormal number,
3362| and it must not require rounding.  In the usual case that `zSig' is
3363| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3364| Note the slightly odd position of the binary point in zSig compared with the
3365| other roundAndPackFloat functions. This should probably be fixed if we
3366| need to implement more float16 routines than just conversion.
3367| The handling of underflow and overflow follows the IEC/IEEE Standard for
3368| Binary Floating-Point Arithmetic.
3369*----------------------------------------------------------------------------*/
3370
3371static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3372                                   uint32_t zSig, flag ieee,
3373                                   float_status *status)
3374{
3375    int maxexp = ieee ? 29 : 30;
3376    uint32_t mask;
3377    uint32_t increment;
3378    bool rounding_bumps_exp;
3379    bool is_tiny = false;
3380
3381    /* Calculate the mask of bits of the mantissa which are not
3382     * representable in half-precision and will be lost.
3383     */
3384    if (zExp < 1) {
3385        /* Will be denormal in halfprec */
3386        mask = 0x00ffffff;
3387        if (zExp >= -11) {
3388            mask >>= 11 + zExp;
3389        }
3390    } else {
3391        /* Normal number in halfprec */
3392        mask = 0x00001fff;
3393    }
3394
3395    switch (status->float_rounding_mode) {
3396    case float_round_nearest_even:
3397        increment = (mask + 1) >> 1;
3398        if ((zSig & mask) == increment) {
3399            increment = zSig & (increment << 1);
3400        }
3401        break;
3402    case float_round_ties_away:
3403        increment = (mask + 1) >> 1;
3404        break;
3405    case float_round_up:
3406        increment = zSign ? 0 : mask;
3407        break;
3408    case float_round_down:
3409        increment = zSign ? mask : 0;
3410        break;
3411    default: /* round_to_zero */
3412        increment = 0;
3413        break;
3414    }
3415
3416    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3417
3418    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3419        if (ieee) {
3420            float_raise(float_flag_overflow | float_flag_inexact, status);
3421            return packFloat16(zSign, 0x1f, 0);
3422        } else {
3423            float_raise(float_flag_invalid, status);
3424            return packFloat16(zSign, 0x1f, 0x3ff);
3425        }
3426    }
3427
3428    if (zExp < 0) {
3429        /* Note that flush-to-zero does not affect half-precision results */
3430        is_tiny =
3431            (status->float_detect_tininess == float_tininess_before_rounding)
3432            || (zExp < -1)
3433            || (!rounding_bumps_exp);
3434    }
3435    if (zSig & mask) {
3436        float_raise(float_flag_inexact, status);
3437        if (is_tiny) {
3438            float_raise(float_flag_underflow, status);
3439        }
3440    }
3441
3442    zSig += increment;
3443    if (rounding_bumps_exp) {
3444        zSig >>= 1;
3445        zExp++;
3446    }
3447
3448    if (zExp < -10) {
3449        return packFloat16(zSign, 0, 0);
3450    }
3451    if (zExp < 0) {
3452        zSig >>= -zExp;
3453        zExp = 0;
3454    }
3455    return packFloat16(zSign, zExp, zSig >> 13);
3456}
3457
/*----------------------------------------------------------------------------
| Normalizes the subnormal half-precision significand `aSig'.  The
| significand is shifted left until its leading one sits in bit 10; the
| shifted value is stored at `zSigPtr' and the matching exponent, 1 minus
| the shift distance, is stored at `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3465
3466/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3467   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3468
3469float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3470{
3471    flag aSign;
3472    int_fast16_t aExp;
3473    uint32_t aSig;
3474
3475    aSign = extractFloat16Sign(a);
3476    aExp = extractFloat16Exp(a);
3477    aSig = extractFloat16Frac(a);
3478
3479    if (aExp == 0x1f && ieee) {
3480        if (aSig) {
3481            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3482        }
3483        return packFloat32(aSign, 0xff, 0);
3484    }
3485    if (aExp == 0) {
3486        if (aSig == 0) {
3487            return packFloat32(aSign, 0, 0);
3488        }
3489
3490        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3491        aExp--;
3492    }
3493    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3494}
3495
3496float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3497{
3498    flag aSign;
3499    int_fast16_t aExp;
3500    uint32_t aSig;
3501
3502    a = float32_squash_input_denormal(a, status);
3503
3504    aSig = extractFloat32Frac( a );
3505    aExp = extractFloat32Exp( a );
3506    aSign = extractFloat32Sign( a );
3507    if ( aExp == 0xFF ) {
3508        if (aSig) {
3509            /* Input is a NaN */
3510            if (!ieee) {
3511                float_raise(float_flag_invalid, status);
3512                return packFloat16(aSign, 0, 0);
3513            }
3514            return commonNaNToFloat16(
3515                float32ToCommonNaN(a, status), status);
3516        }
3517        /* Infinity */
3518        if (!ieee) {
3519            float_raise(float_flag_invalid, status);
3520            return packFloat16(aSign, 0x1f, 0x3ff);
3521        }
3522        return packFloat16(aSign, 0x1f, 0);
3523    }
3524    if (aExp == 0 && aSig == 0) {
3525        return packFloat16(aSign, 0, 0);
3526    }
3527    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3528     * even if the input is denormal; however this is harmless because
3529     * the largest possible single-precision denormal is still smaller
3530     * than the smallest representable half-precision denormal, and so we
3531     * will end up ignoring aSig and returning via the "always return zero"
3532     * codepath.
3533     */
3534    aSig |= 0x00800000;
3535    aExp -= 0x71;
3536
3537    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3538}
3539
3540float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3541{
3542    flag aSign;
3543    int_fast16_t aExp;
3544    uint32_t aSig;
3545
3546    aSign = extractFloat16Sign(a);
3547    aExp = extractFloat16Exp(a);
3548    aSig = extractFloat16Frac(a);
3549
3550    if (aExp == 0x1f && ieee) {
3551        if (aSig) {
3552            return commonNaNToFloat64(
3553                float16ToCommonNaN(a, status), status);
3554        }
3555        return packFloat64(aSign, 0x7ff, 0);
3556    }
3557    if (aExp == 0) {
3558        if (aSig == 0) {
3559            return packFloat64(aSign, 0, 0);
3560        }
3561
3562        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3563        aExp--;
3564    }
3565    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3566}
3567
3568float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3569{
3570    flag aSign;
3571    int_fast16_t aExp;
3572    uint64_t aSig;
3573    uint32_t zSig;
3574
3575    a = float64_squash_input_denormal(a, status);
3576
3577    aSig = extractFloat64Frac(a);
3578    aExp = extractFloat64Exp(a);
3579    aSign = extractFloat64Sign(a);
3580    if (aExp == 0x7FF) {
3581        if (aSig) {
3582            /* Input is a NaN */
3583            if (!ieee) {
3584                float_raise(float_flag_invalid, status);
3585                return packFloat16(aSign, 0, 0);
3586            }
3587            return commonNaNToFloat16(
3588                float64ToCommonNaN(a, status), status);
3589        }
3590        /* Infinity */
3591        if (!ieee) {
3592            float_raise(float_flag_invalid, status);
3593            return packFloat16(aSign, 0x1f, 0x3ff);
3594        }
3595        return packFloat16(aSign, 0x1f, 0);
3596    }
3597    shift64RightJamming(aSig, 29, &aSig);
3598    zSig = aSig;
3599    if (aExp == 0 && zSig == 0) {
3600        return packFloat16(aSign, 0, 0);
3601    }
3602    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3603     * even if the input is denormal; however this is harmless because
3604     * the largest possible single-precision denormal is still smaller
3605     * than the smallest representable half-precision denormal, and so we
3606     * will end up ignoring aSig and returning via the "always return zero"
3607     * codepath.
3608     */
3609    zSig |= 0x00800000;
3610    aExp -= 0x3F1;
3611
3612    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3613}
3614
3615/*----------------------------------------------------------------------------
3616| Returns the result of converting the double-precision floating-point value
3617| `a' to the extended double-precision floating-point format.  The conversion
3618| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3619| Arithmetic.
3620*----------------------------------------------------------------------------*/
3621
3622floatx80 float64_to_floatx80(float64 a, float_status *status)
3623{
3624    flag aSign;
3625    int_fast16_t aExp;
3626    uint64_t aSig;
3627
3628    a = float64_squash_input_denormal(a, status);
3629    aSig = extractFloat64Frac( a );
3630    aExp = extractFloat64Exp( a );
3631    aSign = extractFloat64Sign( a );
3632    if ( aExp == 0x7FF ) {
3633        if (aSig) {
3634            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3635        }
3636        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637    }
3638    if ( aExp == 0 ) {
3639        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3640        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3641    }
3642    return
3643        packFloatx80(
3644            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3645
3646}
3647
3648/*----------------------------------------------------------------------------
3649| Returns the result of converting the double-precision floating-point value
3650| `a' to the quadruple-precision floating-point format.  The conversion is
3651| performed according to the IEC/IEEE Standard for Binary Floating-Point
3652| Arithmetic.
3653*----------------------------------------------------------------------------*/
3654
3655float128 float64_to_float128(float64 a, float_status *status)
3656{
3657    flag aSign;
3658    int_fast16_t aExp;
3659    uint64_t aSig, zSig0, zSig1;
3660
3661    a = float64_squash_input_denormal(a, status);
3662    aSig = extractFloat64Frac( a );
3663    aExp = extractFloat64Exp( a );
3664    aSign = extractFloat64Sign( a );
3665    if ( aExp == 0x7FF ) {
3666        if (aSig) {
3667            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3668        }
3669        return packFloat128( aSign, 0x7FFF, 0, 0 );
3670    }
3671    if ( aExp == 0 ) {
3672        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3673        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674        --aExp;
3675    }
3676    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3677    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3678
3679}
3680
3681/*----------------------------------------------------------------------------
3682| Rounds the double-precision floating-point value `a' to an integer, and
3683| returns the result as a double-precision floating-point value.  The
3684| operation is performed according to the IEC/IEEE Standard for Binary
3685| Floating-Point Arithmetic.
3686*----------------------------------------------------------------------------*/
3687
3688float64 float64_round_to_int(float64 a, float_status *status)
3689{
3690    flag aSign;
3691    int_fast16_t aExp;
3692    uint64_t lastBitMask, roundBitsMask;
3693    uint64_t z;
3694    a = float64_squash_input_denormal(a, status);
3695
3696    aExp = extractFloat64Exp( a );
3697    if ( 0x433 <= aExp ) {
3698        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3699            return propagateFloat64NaN(a, a, status);
3700        }
3701        return a;
3702    }
3703    if ( aExp < 0x3FF ) {
3704        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3705        status->float_exception_flags |= float_flag_inexact;
3706        aSign = extractFloat64Sign( a );
3707        switch (status->float_rounding_mode) {
3708         case float_round_nearest_even:
3709            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3710                return packFloat64( aSign, 0x3FF, 0 );
3711            }
3712            break;
3713        case float_round_ties_away:
3714            if (aExp == 0x3FE) {
3715                return packFloat64(aSign, 0x3ff, 0);
3716            }
3717            break;
3718         case float_round_down:
3719            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3720         case float_round_up:
3721            return make_float64(
3722            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3723        }
3724        return packFloat64( aSign, 0, 0 );
3725    }
3726    lastBitMask = 1;
3727    lastBitMask <<= 0x433 - aExp;
3728    roundBitsMask = lastBitMask - 1;
3729    z = float64_val(a);
3730    switch (status->float_rounding_mode) {
3731    case float_round_nearest_even:
3732        z += lastBitMask >> 1;
3733        if ((z & roundBitsMask) == 0) {
3734            z &= ~lastBitMask;
3735        }
3736        break;
3737    case float_round_ties_away:
3738        z += lastBitMask >> 1;
3739        break;
3740    case float_round_to_zero:
3741        break;
3742    case float_round_up:
3743        if (!extractFloat64Sign(make_float64(z))) {
3744            z += roundBitsMask;
3745        }
3746        break;
3747    case float_round_down:
3748        if (extractFloat64Sign(make_float64(z))) {
3749            z += roundBitsMask;
3750        }
3751        break;
3752    default:
3753        abort();
3754    }
3755    z &= ~ roundBitsMask;
3756    if (z != float64_val(a)) {
3757        status->float_exception_flags |= float_flag_inexact;
3758    }
3759    return make_float64(z);
3760
3761}
3762
3763float64 float64_trunc_to_int(float64 a, float_status *status)
3764{
3765    int oldmode;
3766    float64 res;
3767    oldmode = status->float_rounding_mode;
3768    status->float_rounding_mode = float_round_to_zero;
3769    res = float64_round_to_int(a, status);
3770    status->float_rounding_mode = oldmode;
3771    return res;
3772}
3773
3774/*----------------------------------------------------------------------------
3775| Returns the result of adding the absolute values of the double-precision
3776| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3777| before being returned.  `zSign' is ignored if the result is a NaN.
3778| The addition is performed according to the IEC/IEEE Standard for Binary
3779| Floating-Point Arithmetic.
3780*----------------------------------------------------------------------------*/
3781
3782static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3783                              float_status *status)
3784{
3785    int_fast16_t aExp, bExp, zExp;
3786    uint64_t aSig, bSig, zSig;
3787    int_fast16_t expDiff;
3788
3789    aSig = extractFloat64Frac( a );
3790    aExp = extractFloat64Exp( a );
3791    bSig = extractFloat64Frac( b );
3792    bExp = extractFloat64Exp( b );
3793    expDiff = aExp - bExp;
3794    aSig <<= 9;
3795    bSig <<= 9;
3796    if ( 0 < expDiff ) {
3797        if ( aExp == 0x7FF ) {
3798            if (aSig) {
3799                return propagateFloat64NaN(a, b, status);
3800            }
3801            return a;
3802        }
3803        if ( bExp == 0 ) {
3804            --expDiff;
3805        }
3806        else {
3807            bSig |= LIT64( 0x2000000000000000 );
3808        }
3809        shift64RightJamming( bSig, expDiff, &bSig );
3810        zExp = aExp;
3811    }
3812    else if ( expDiff < 0 ) {
3813        if ( bExp == 0x7FF ) {
3814            if (bSig) {
3815                return propagateFloat64NaN(a, b, status);
3816            }
3817            return packFloat64( zSign, 0x7FF, 0 );
3818        }
3819        if ( aExp == 0 ) {
3820            ++expDiff;
3821        }
3822        else {
3823            aSig |= LIT64( 0x2000000000000000 );
3824        }
3825        shift64RightJamming( aSig, - expDiff, &aSig );
3826        zExp = bExp;
3827    }
3828    else {
3829        if ( aExp == 0x7FF ) {
3830            if (aSig | bSig) {
3831                return propagateFloat64NaN(a, b, status);
3832            }
3833            return a;
3834        }
3835        if ( aExp == 0 ) {
3836            if (status->flush_to_zero) {
3837                if (aSig | bSig) {
3838                    float_raise(float_flag_output_denormal, status);
3839                }
3840                return packFloat64(zSign, 0, 0);
3841            }
3842            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3843        }
3844        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3845        zExp = aExp;
3846        goto roundAndPack;
3847    }
3848    aSig |= LIT64( 0x2000000000000000 );
3849    zSig = ( aSig + bSig )<<1;
3850    --zExp;
3851    if ( (int64_t) zSig < 0 ) {
3852        zSig = aSig + bSig;
3853        ++zExp;
3854    }
3855 roundAndPack:
3856    return roundAndPackFloat64(zSign, zExp, zSig, status);
3857
3858}
3859
3860/*----------------------------------------------------------------------------
3861| Returns the result of subtracting the absolute values of the double-
3862| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3863| difference is negated before being returned.  `zSign' is ignored if the
3864| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3865| Standard for Binary Floating-Point Arithmetic.
3866*----------------------------------------------------------------------------*/
3867
3868static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3869                              float_status *status)
3870{
3871    int_fast16_t aExp, bExp, zExp;
3872    uint64_t aSig, bSig, zSig;
3873    int_fast16_t expDiff;
3874
3875    aSig = extractFloat64Frac( a );
3876    aExp = extractFloat64Exp( a );
3877    bSig = extractFloat64Frac( b );
3878    bExp = extractFloat64Exp( b );
3879    expDiff = aExp - bExp;
3880    aSig <<= 10;
3881    bSig <<= 10;
3882    if ( 0 < expDiff ) goto aExpBigger;
3883    if ( expDiff < 0 ) goto bExpBigger;
3884    if ( aExp == 0x7FF ) {
3885        if (aSig | bSig) {
3886            return propagateFloat64NaN(a, b, status);
3887        }
3888        float_raise(float_flag_invalid, status);
3889        return float64_default_nan;
3890    }
3891    if ( aExp == 0 ) {
3892        aExp = 1;
3893        bExp = 1;
3894    }
3895    if ( bSig < aSig ) goto aBigger;
3896    if ( aSig < bSig ) goto bBigger;
3897    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3898 bExpBigger:
3899    if ( bExp == 0x7FF ) {
3900        if (bSig) {
3901            return propagateFloat64NaN(a, b, status);
3902        }
3903        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3904    }
3905    if ( aExp == 0 ) {
3906        ++expDiff;
3907    }
3908    else {
3909        aSig |= LIT64( 0x4000000000000000 );
3910    }
3911    shift64RightJamming( aSig, - expDiff, &aSig );
3912    bSig |= LIT64( 0x4000000000000000 );
3913 bBigger:
3914    zSig = bSig - aSig;
3915    zExp = bExp;
3916    zSign ^= 1;
3917    goto normalizeRoundAndPack;
3918 aExpBigger:
3919    if ( aExp == 0x7FF ) {
3920        if (aSig) {
3921            return propagateFloat64NaN(a, b, status);
3922        }
3923        return a;
3924    }
3925    if ( bExp == 0 ) {
3926        --expDiff;
3927    }
3928    else {
3929        bSig |= LIT64( 0x4000000000000000 );
3930    }
3931    shift64RightJamming( bSig, expDiff, &bSig );
3932    aSig |= LIT64( 0x4000000000000000 );
3933 aBigger:
3934    zSig = aSig - bSig;
3935    zExp = aExp;
3936 normalizeRoundAndPack:
3937    --zExp;
3938    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3939
3940}
3941
3942/*----------------------------------------------------------------------------
3943| Returns the result of adding the double-precision floating-point values `a'
3944| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3945| Binary Floating-Point Arithmetic.
3946*----------------------------------------------------------------------------*/
3947
3948float64 float64_add(float64 a, float64 b, float_status *status)
3949{
3950    flag aSign, bSign;
3951    a = float64_squash_input_denormal(a, status);
3952    b = float64_squash_input_denormal(b, status);
3953
3954    aSign = extractFloat64Sign( a );
3955    bSign = extractFloat64Sign( b );
3956    if ( aSign == bSign ) {
3957        return addFloat64Sigs(a, b, aSign, status);
3958    }
3959    else {
3960        return subFloat64Sigs(a, b, aSign, status);
3961    }
3962
3963}
3964
3965/*----------------------------------------------------------------------------
3966| Returns the result of subtracting the double-precision floating-point values
3967| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3968| for Binary Floating-Point Arithmetic.
3969*----------------------------------------------------------------------------*/
3970
3971float64 float64_sub(float64 a, float64 b, float_status *status)
3972{
3973    flag aSign, bSign;
3974    a = float64_squash_input_denormal(a, status);
3975    b = float64_squash_input_denormal(b, status);
3976
3977    aSign = extractFloat64Sign( a );
3978    bSign = extractFloat64Sign( b );
3979    if ( aSign == bSign ) {
3980        return subFloat64Sigs(a, b, aSign, status);
3981    }
3982    else {
3983        return addFloat64Sigs(a, b, aSign, status);
3984    }
3985
3986}
3987
3988/*----------------------------------------------------------------------------
3989| Returns the result of multiplying the double-precision floating-point values
3990| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3991| for Binary Floating-Point Arithmetic.
3992*----------------------------------------------------------------------------*/
3993
3994float64 float64_mul(float64 a, float64 b, float_status *status)
3995{
3996    flag aSign, bSign, zSign;
3997    int_fast16_t aExp, bExp, zExp;
3998    uint64_t aSig, bSig, zSig0, zSig1;
3999
4000    a = float64_squash_input_denormal(a, status);
4001    b = float64_squash_input_denormal(b, status);
4002
4003    aSig = extractFloat64Frac( a );
4004    aExp = extractFloat64Exp( a );
4005    aSign = extractFloat64Sign( a );
4006    bSig = extractFloat64Frac( b );
4007    bExp = extractFloat64Exp( b );
4008    bSign = extractFloat64Sign( b );
4009    zSign = aSign ^ bSign;
4010    if ( aExp == 0x7FF ) {
4011        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4012            return propagateFloat64NaN(a, b, status);
4013        }
4014        if ( ( bExp | bSig ) == 0 ) {
4015            float_raise(float_flag_invalid, status);
4016            return float64_default_nan;
4017        }
4018        return packFloat64( zSign, 0x7FF, 0 );
4019    }
4020    if ( bExp == 0x7FF ) {
4021        if (bSig) {
4022            return propagateFloat64NaN(a, b, status);
4023        }
4024        if ( ( aExp | aSig ) == 0 ) {
4025            float_raise(float_flag_invalid, status);
4026            return float64_default_nan;
4027        }
4028        return packFloat64( zSign, 0x7FF, 0 );
4029    }
4030    if ( aExp == 0 ) {
4031        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4032        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4033    }
4034    if ( bExp == 0 ) {
4035        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4036        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4037    }
4038    zExp = aExp + bExp - 0x3FF;
4039    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4040    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4041    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4042    zSig0 |= ( zSig1 != 0 );
4043    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4044        zSig0 <<= 1;
4045        --zExp;
4046    }
4047    return roundAndPackFloat64(zSign, zExp, zSig0, status);
4048
4049}
4050
4051/*----------------------------------------------------------------------------
4052| Returns the result of dividing the double-precision floating-point value `a'
4053| by the corresponding value `b'.  The operation is performed according to
4054| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4055*----------------------------------------------------------------------------*/
4056
4057float64 float64_div(float64 a, float64 b, float_status *status)
4058{
4059    flag aSign, bSign, zSign;
4060    int_fast16_t aExp, bExp, zExp;
4061    uint64_t aSig, bSig, zSig;
4062    uint64_t rem0, rem1;
4063    uint64_t term0, term1;
4064    a = float64_squash_input_denormal(a, status);
4065    b = float64_squash_input_denormal(b, status);
4066
4067    aSig = extractFloat64Frac( a );
4068    aExp = extractFloat64Exp( a );
4069    aSign = extractFloat64Sign( a );
4070    bSig = extractFloat64Frac( b );
4071    bExp = extractFloat64Exp( b );
4072    bSign = extractFloat64Sign( b );
4073    zSign = aSign ^ bSign;
4074    if ( aExp == 0x7FF ) {
4075        if (aSig) {
4076            return propagateFloat64NaN(a, b, status);
4077        }
4078        if ( bExp == 0x7FF ) {
4079            if (bSig) {
4080                return propagateFloat64NaN(a, b, status);
4081            }
4082            float_raise(float_flag_invalid, status);
4083            return float64_default_nan;
4084        }
4085        return packFloat64( zSign, 0x7FF, 0 );
4086    }
4087    if ( bExp == 0x7FF ) {
4088        if (bSig) {
4089            return propagateFloat64NaN(a, b, status);
4090        }
4091        return packFloat64( zSign, 0, 0 );
4092    }
4093    if ( bExp == 0 ) {
4094        if ( bSig == 0 ) {
4095            if ( ( aExp | aSig ) == 0 ) {
4096                float_raise(float_flag_invalid, status);
4097                return float64_default_nan;
4098            }
4099            float_raise(float_flag_divbyzero, status);
4100            return packFloat64( zSign, 0x7FF, 0 );
4101        }
4102        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4103    }
4104    if ( aExp == 0 ) {
4105        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4106        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4107    }
4108    zExp = aExp - bExp + 0x3FD;
4109    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4110    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4111    if ( bSig <= ( aSig + aSig ) ) {
4112        aSig >>= 1;
4113        ++zExp;
4114    }
4115    zSig = estimateDiv128To64( aSig, 0, bSig );
4116    if ( ( zSig & 0x1FF ) <= 2 ) {
4117        mul64To128( bSig, zSig, &term0, &term1 );
4118        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4119        while ( (int64_t) rem0 < 0 ) {
4120            --zSig;
4121            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4122        }
4123        zSig |= ( rem1 != 0 );
4124    }
4125    return roundAndPackFloat64(zSign, zExp, zSig, status);
4126
4127}
4128
4129/*----------------------------------------------------------------------------
4130| Returns the remainder of the double-precision floating-point value `a'
4131| with respect to the corresponding value `b'.  The operation is performed
4132| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4133*----------------------------------------------------------------------------*/
4134
4135float64 float64_rem(float64 a, float64 b, float_status *status)
4136{
4137    flag aSign, zSign;
4138    int_fast16_t aExp, bExp, expDiff;
4139    uint64_t aSig, bSig;
4140    uint64_t q, alternateASig;
4141    int64_t sigMean;
4142
4143    a = float64_squash_input_denormal(a, status);
4144    b = float64_squash_input_denormal(b, status);
4145    aSig = extractFloat64Frac( a );
4146    aExp = extractFloat64Exp( a );
4147    aSign = extractFloat64Sign( a );
4148    bSig = extractFloat64Frac( b );
4149    bExp = extractFloat64Exp( b );
4150    if ( aExp == 0x7FF ) {
4151        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4152            return propagateFloat64NaN(a, b, status);
4153        }
4154        float_raise(float_flag_invalid, status);
4155        return float64_default_nan;
4156    }
4157    if ( bExp == 0x7FF ) {
4158        if (bSig) {
4159            return propagateFloat64NaN(a, b, status);
4160        }
4161        return a;
4162    }
4163    if ( bExp == 0 ) {
4164        if ( bSig == 0 ) {
4165            float_raise(float_flag_invalid, status);
4166            return float64_default_nan;
4167        }
4168        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4169    }
4170    if ( aExp == 0 ) {
4171        if ( aSig == 0 ) return a;
4172        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4173    }
4174    expDiff = aExp - bExp;
4175    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4176    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4177    if ( expDiff < 0 ) {
4178        if ( expDiff < -1 ) return a;
4179        aSig >>= 1;
4180    }
4181    q = ( bSig <= aSig );
4182    if ( q ) aSig -= bSig;
4183    expDiff -= 64;
4184    while ( 0 < expDiff ) {
4185        q = estimateDiv128To64( aSig, 0, bSig );
4186        q = ( 2 < q ) ? q - 2 : 0;
4187        aSig = - ( ( bSig>>2 ) * q );
4188        expDiff -= 62;
4189    }
4190    expDiff += 64;
4191    if ( 0 < expDiff ) {
4192        q = estimateDiv128To64( aSig, 0, bSig );
4193        q = ( 2 < q ) ? q - 2 : 0;
4194        q >>= 64 - expDiff;
4195        bSig >>= 2;
4196        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4197    }
4198    else {
4199        aSig >>= 2;
4200        bSig >>= 2;
4201    }
4202    do {
4203        alternateASig = aSig;
4204        ++q;
4205        aSig -= bSig;
4206    } while ( 0 <= (int64_t) aSig );
4207    sigMean = aSig + alternateASig;
4208    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4209        aSig = alternateASig;
4210    }
4211    zSign = ( (int64_t) aSig < 0 );
4212    if ( zSign ) aSig = - aSig;
4213    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4214
4215}
4216
4217/*----------------------------------------------------------------------------
4218| Returns the result of multiplying the double-precision floating-point values
4219| `a' and `b' then adding 'c', with no intermediate rounding step after the
4220| multiplication.  The operation is performed according to the IEC/IEEE
4221| Standard for Binary Floating-Point Arithmetic 754-2008.
4222| The flags argument allows the caller to select negation of the
4223| addend, the intermediate product, or the final result. (The difference
4224| between this and having the caller do a separate negation is that negating
4225| externally will flip the sign bit on NaNs.)
4226*----------------------------------------------------------------------------*/
4227
4228float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4229                       float_status *status)
4230{
4231    flag aSign, bSign, cSign, zSign;
4232    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4233    uint64_t aSig, bSig, cSig;
4234    flag pInf, pZero, pSign;
4235    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4236    int shiftcount;
4237    flag signflip, infzero;
4238
4239    a = float64_squash_input_denormal(a, status);
4240    b = float64_squash_input_denormal(b, status);
4241    c = float64_squash_input_denormal(c, status);
4242    aSig = extractFloat64Frac(a);
4243    aExp = extractFloat64Exp(a);
4244    aSign = extractFloat64Sign(a);
4245    bSig = extractFloat64Frac(b);
4246    bExp = extractFloat64Exp(b);
4247    bSign = extractFloat64Sign(b);
4248    cSig = extractFloat64Frac(c);
4249    cExp = extractFloat64Exp(c);
4250    cSign = extractFloat64Sign(c);
4251
4252    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4253               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4254
4255    /* It is implementation-defined whether the cases of (0,inf,qnan)
4256     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4257     * they return if they do), so we have to hand this information
4258     * off to the target-specific pick-a-NaN routine.
4259     */
4260    if (((aExp == 0x7ff) && aSig) ||
4261        ((bExp == 0x7ff) && bSig) ||
4262        ((cExp == 0x7ff) && cSig)) {
4263        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4264    }
4265
4266    if (infzero) {
4267        float_raise(float_flag_invalid, status);
4268        return float64_default_nan;
4269    }
4270
4271    if (flags & float_muladd_negate_c) {
4272        cSign ^= 1;
4273    }
4274
4275    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4276
4277    /* Work out the sign and type of the product */
4278    pSign = aSign ^ bSign;
4279    if (flags & float_muladd_negate_product) {
4280        pSign ^= 1;
4281    }
4282    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4283    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4284
4285    if (cExp == 0x7ff) {
4286        if (pInf && (pSign ^ cSign)) {
4287            /* addition of opposite-signed infinities => InvalidOperation */
4288            float_raise(float_flag_invalid, status);
4289            return float64_default_nan;
4290        }
4291        /* Otherwise generate an infinity of the same sign */
4292        return packFloat64(cSign ^ signflip, 0x7ff, 0);
4293    }
4294
4295    if (pInf) {
4296        return packFloat64(pSign ^ signflip, 0x7ff, 0);
4297    }
4298
4299    if (pZero) {
4300        if (cExp == 0) {
4301            if (cSig == 0) {
4302                /* Adding two exact zeroes */
4303                if (pSign == cSign) {
4304                    zSign = pSign;
4305                } else if (status->float_rounding_mode == float_round_down) {
4306                    zSign = 1;
4307                } else {
4308                    zSign = 0;
4309                }
4310                return packFloat64(zSign ^ signflip, 0, 0);
4311            }
4312            /* Exact zero plus a denorm */
4313            if (status->flush_to_zero) {
4314                float_raise(float_flag_output_denormal, status);
4315                return packFloat64(cSign ^ signflip, 0, 0);
4316            }
4317        }
4318        /* Zero plus something non-zero : just return the something */
4319        if (flags & float_muladd_halve_result) {
4320            if (cExp == 0) {
4321                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4322            }
4323            /* Subtract one to halve, and one again because roundAndPackFloat64
4324             * wants one less than the true exponent.
4325             */
4326            cExp -= 2;
4327            cSig = (cSig | 0x0010000000000000ULL) << 10;
4328            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4329        }
4330        return packFloat64(cSign ^ signflip, cExp, cSig);
4331    }
4332
4333    if (aExp == 0) {
4334        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4335    }
4336    if (bExp == 0) {
4337        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4338    }
4339
4340    /* Calculate the actual result a * b + c */
4341
4342    /* Multiply first; this is easy. */
4343    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4344     * because we want the true exponent, not the "one-less-than"
4345     * flavour that roundAndPackFloat64() takes.
4346     */
4347    pExp = aExp + bExp - 0x3fe;
4348    aSig = (aSig | LIT64(0x0010000000000000))<<10;
4349    bSig = (bSig | LIT64(0x0010000000000000))<<11;
4350    mul64To128(aSig, bSig, &pSig0, &pSig1);
4351    if ((int64_t)(pSig0 << 1) >= 0) {
4352        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4353        pExp--;
4354    }
4355
4356    zSign = pSign ^ signflip;
4357
4358    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4359     * bit in position 126.
4360     */
4361    if (cExp == 0) {
4362        if (!cSig) {
4363            /* Throw out the special case of c being an exact zero now */
4364            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4365            if (flags & float_muladd_halve_result) {
4366                pExp--;
4367            }
4368            return roundAndPackFloat64(zSign, pExp - 1,
4369                                       pSig1, status);
4370        }
4371        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4372    }
4373
4374    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4375     * significand of the addend, with the explicit bit in position 126.
4376     */
4377    cSig0 = cSig << (126 - 64 - 52);
4378    cSig1 = 0;
4379    cSig0 |= LIT64(0x4000000000000000);
4380    expDiff = pExp - cExp;
4381
4382    if (pSign == cSign) {
4383        /* Addition */
4384        if (expDiff > 0) {
4385            /* scale c to match p */
4386            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4387            zExp = pExp;
4388        } else if (expDiff < 0) {
4389            /* scale p to match c */
4390            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4391            zExp = cExp;
4392        } else {
4393            /* no scaling needed */
4394            zExp = cExp;
4395        }
4396        /* Add significands and make sure explicit bit ends up in posn 126 */
4397        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4398        if ((int64_t)zSig0 < 0) {
4399            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4400        } else {
4401            zExp--;
4402        }
4403        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4404        if (flags & float_muladd_halve_result) {
4405            zExp--;
4406        }
4407        return roundAndPackFloat64(zSign, zExp, zSig1, status);
4408    } else {
4409        /* Subtraction */
4410        if (expDiff > 0) {
4411            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4412            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4413            zExp = pExp;
4414        } else if (expDiff < 0) {
4415            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4416            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4417            zExp = cExp;
4418            zSign ^= 1;
4419        } else {
4420            zExp = pExp;
4421            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4422                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4424                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4425                zSign ^= 1;
4426            } else {
4427                /* Exact zero */
4428                zSign = signflip;
4429                if (status->float_rounding_mode == float_round_down) {
4430                    zSign ^= 1;
4431                }
4432                return packFloat64(zSign, 0, 0);
4433            }
4434        }
4435        --zExp;
4436        /* Do the equivalent of normalizeRoundAndPackFloat64() but
4437         * starting with the significand in a pair of uint64_t.
4438         */
4439        if (zSig0) {
4440            shiftcount = countLeadingZeros64(zSig0) - 1;
4441            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4442            if (zSig1) {
4443                zSig0 |= 1;
4444            }
4445            zExp -= shiftcount;
4446        } else {
4447            shiftcount = countLeadingZeros64(zSig1);
4448            if (shiftcount == 0) {
4449                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4450                zExp -= 63;
4451            } else {
4452                shiftcount--;
4453                zSig0 = zSig1 << shiftcount;
4454                zExp -= (shiftcount + 64);
4455            }
4456        }
4457        if (flags & float_muladd_halve_result) {
4458            zExp--;
4459        }
4460        return roundAndPackFloat64(zSign, zExp, zSig0, status);
4461    }
4462}
4463
4464/*----------------------------------------------------------------------------
4465| Returns the square root of the double-precision floating-point value `a'.
4466| The operation is performed according to the IEC/IEEE Standard for Binary
4467| Floating-Point Arithmetic.
4468*----------------------------------------------------------------------------*/
4469
4470float64 float64_sqrt(float64 a, float_status *status)
4471{
4472    flag aSign;
4473    int_fast16_t aExp, zExp;
4474    uint64_t aSig, zSig, doubleZSig;
4475    uint64_t rem0, rem1, term0, term1;
4476    a = float64_squash_input_denormal(a, status);
4477
4478    aSig = extractFloat64Frac( a );
4479    aExp = extractFloat64Exp( a );
4480    aSign = extractFloat64Sign( a );
4481    if ( aExp == 0x7FF ) {
4482        if (aSig) {
4483            return propagateFloat64NaN(a, a, status);
4484        }
4485        if ( ! aSign ) return a;
4486        float_raise(float_flag_invalid, status);
4487        return float64_default_nan;
4488    }
4489    if ( aSign ) {
4490        if ( ( aExp | aSig ) == 0 ) return a;
4491        float_raise(float_flag_invalid, status);
4492        return float64_default_nan;
4493    }
4494    if ( aExp == 0 ) {
4495        if ( aSig == 0 ) return float64_zero;
4496        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4497    }
4498    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4499    aSig |= LIT64( 0x0010000000000000 );
4500    zSig = estimateSqrt32( aExp, aSig>>21 );
4501    aSig <<= 9 - ( aExp & 1 );
4502    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4503    if ( ( zSig & 0x1FF ) <= 5 ) {
4504        doubleZSig = zSig<<1;
4505        mul64To128( zSig, zSig, &term0, &term1 );
4506        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4507        while ( (int64_t) rem0 < 0 ) {
4508            --zSig;
4509            doubleZSig -= 2;
4510            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4511        }
4512        zSig |= ( ( rem0 | rem1 ) != 0 );
4513    }
4514    return roundAndPackFloat64(0, zExp, zSig, status);
4515
4516}
4517
4518/*----------------------------------------------------------------------------
4519| Returns the binary log of the double-precision floating-point value `a'.
4520| The operation is performed according to the IEC/IEEE Standard for Binary
4521| Floating-Point Arithmetic.
4522*----------------------------------------------------------------------------*/
4523float64 float64_log2(float64 a, float_status *status)
4524{
4525    flag aSign, zSign;
4526    int_fast16_t aExp;
4527    uint64_t aSig, aSig0, aSig1, zSig, i;
4528    a = float64_squash_input_denormal(a, status);
4529
4530    aSig = extractFloat64Frac( a );
4531    aExp = extractFloat64Exp( a );
4532    aSign = extractFloat64Sign( a );
4533
4534    if ( aExp == 0 ) {
4535        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4536        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4537    }
4538    if ( aSign ) {
4539        float_raise(float_flag_invalid, status);
4540        return float64_default_nan;
4541    }
4542    if ( aExp == 0x7FF ) {
4543        if (aSig) {
4544            return propagateFloat64NaN(a, float64_zero, status);
4545        }
4546        return a;
4547    }
4548
4549    aExp -= 0x3FF;
4550    aSig |= LIT64( 0x0010000000000000 );
4551    zSign = aExp < 0;
4552    zSig = (uint64_t)aExp << 52;
4553    for (i = 1LL << 51; i > 0; i >>= 1) {
4554        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4555        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4556        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4557            aSig >>= 1;
4558            zSig |= i;
4559        }
4560    }
4561
4562    if ( zSign )
4563        zSig = -zSig;
4564    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4565}
4566
4567/*----------------------------------------------------------------------------
4568| Returns 1 if the double-precision floating-point value `a' is equal to the
4569| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4570| if either operand is a NaN.  Otherwise, the comparison is performed
4571| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572*----------------------------------------------------------------------------*/
4573
4574int float64_eq(float64 a, float64 b, float_status *status)
4575{
4576    uint64_t av, bv;
4577    a = float64_squash_input_denormal(a, status);
4578    b = float64_squash_input_denormal(b, status);
4579
4580    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4581         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4582       ) {
4583        float_raise(float_flag_invalid, status);
4584        return 0;
4585    }
4586    av = float64_val(a);
4587    bv = float64_val(b);
4588    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4589
4590}
4591
4592/*----------------------------------------------------------------------------
4593| Returns 1 if the double-precision floating-point value `a' is less than or
4594| equal to the corresponding value `b', and 0 otherwise.  The invalid
4595| exception is raised if either operand is a NaN.  The comparison is performed
4596| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4597*----------------------------------------------------------------------------*/
4598
4599int float64_le(float64 a, float64 b, float_status *status)
4600{
4601    flag aSign, bSign;
4602    uint64_t av, bv;
4603    a = float64_squash_input_denormal(a, status);
4604    b = float64_squash_input_denormal(b, status);
4605
4606    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4608       ) {
4609        float_raise(float_flag_invalid, status);
4610        return 0;
4611    }
4612    aSign = extractFloat64Sign( a );
4613    bSign = extractFloat64Sign( b );
4614    av = float64_val(a);
4615    bv = float64_val(b);
4616    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4617    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4618
4619}
4620
4621/*----------------------------------------------------------------------------
4622| Returns 1 if the double-precision floating-point value `a' is less than
4623| the corresponding value `b', and 0 otherwise.  The invalid exception is
4624| raised if either operand is a NaN.  The comparison is performed according
4625| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626*----------------------------------------------------------------------------*/
4627
4628int float64_lt(float64 a, float64 b, float_status *status)
4629{
4630    flag aSign, bSign;
4631    uint64_t av, bv;
4632
4633    a = float64_squash_input_denormal(a, status);
4634    b = float64_squash_input_denormal(b, status);
4635    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4636         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4637       ) {
4638        float_raise(float_flag_invalid, status);
4639        return 0;
4640    }
4641    aSign = extractFloat64Sign( a );
4642    bSign = extractFloat64Sign( b );
4643    av = float64_val(a);
4644    bv = float64_val(b);
4645    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4646    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4647
4648}
4649
4650/*----------------------------------------------------------------------------
4651| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4652| be compared, and 0 otherwise.  The invalid exception is raised if either
4653| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4654| Standard for Binary Floating-Point Arithmetic.
4655*----------------------------------------------------------------------------*/
4656
4657int float64_unordered(float64 a, float64 b, float_status *status)
4658{
4659    a = float64_squash_input_denormal(a, status);
4660    b = float64_squash_input_denormal(b, status);
4661
4662    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4663         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4664       ) {
4665        float_raise(float_flag_invalid, status);
4666        return 1;
4667    }
4668    return 0;
4669}
4670
4671/*----------------------------------------------------------------------------
4672| Returns 1 if the double-precision floating-point value `a' is equal to the
4673| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4674| exception.The comparison is performed according to the IEC/IEEE Standard
4675| for Binary Floating-Point Arithmetic.
4676*----------------------------------------------------------------------------*/
4677
4678int float64_eq_quiet(float64 a, float64 b, float_status *status)
4679{
4680    uint64_t av, bv;
4681    a = float64_squash_input_denormal(a, status);
4682    b = float64_squash_input_denormal(b, status);
4683
4684    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4685         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4686       ) {
4687        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4688            float_raise(float_flag_invalid, status);
4689        }
4690        return 0;
4691    }
4692    av = float64_val(a);
4693    bv = float64_val(b);
4694    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4695
4696}
4697
4698/*----------------------------------------------------------------------------
4699| Returns 1 if the double-precision floating-point value `a' is less than or
4700| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4701| cause an exception.  Otherwise, the comparison is performed according to the
4702| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4703*----------------------------------------------------------------------------*/
4704
4705int float64_le_quiet(float64 a, float64 b, float_status *status)
4706{
4707    flag aSign, bSign;
4708    uint64_t av, bv;
4709    a = float64_squash_input_denormal(a, status);
4710    b = float64_squash_input_denormal(b, status);
4711
4712    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4713         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4714       ) {
4715        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4716            float_raise(float_flag_invalid, status);
4717        }
4718        return 0;
4719    }
4720    aSign = extractFloat64Sign( a );
4721    bSign = extractFloat64Sign( b );
4722    av = float64_val(a);
4723    bv = float64_val(b);
4724    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4725    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4726
4727}
4728
4729/*----------------------------------------------------------------------------
4730| Returns 1 if the double-precision floating-point value `a' is less than
4731| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4732| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4733| Standard for Binary Floating-Point Arithmetic.
4734*----------------------------------------------------------------------------*/
4735
4736int float64_lt_quiet(float64 a, float64 b, float_status *status)
4737{
4738    flag aSign, bSign;
4739    uint64_t av, bv;
4740    a = float64_squash_input_denormal(a, status);
4741    b = float64_squash_input_denormal(b, status);
4742
4743    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4744         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4745       ) {
4746        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4747            float_raise(float_flag_invalid, status);
4748        }
4749        return 0;
4750    }
4751    aSign = extractFloat64Sign( a );
4752    bSign = extractFloat64Sign( b );
4753    av = float64_val(a);
4754    bv = float64_val(b);
4755    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4756    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4757
4758}
4759
4760/*----------------------------------------------------------------------------
4761| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4762| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4763| comparison is performed according to the IEC/IEEE Standard for Binary
4764| Floating-Point Arithmetic.
4765*----------------------------------------------------------------------------*/
4766
4767int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4768{
4769    a = float64_squash_input_denormal(a, status);
4770    b = float64_squash_input_denormal(b, status);
4771
4772    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4773         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4774       ) {
4775        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4776            float_raise(float_flag_invalid, status);
4777        }
4778        return 1;
4779    }
4780    return 0;
4781}
4782
4783/*----------------------------------------------------------------------------
4784| Returns the result of converting the extended double-precision floating-
4785| point value `a' to the 32-bit two's complement integer format.  The
4786| conversion is performed according to the IEC/IEEE Standard for Binary
4787| Floating-Point Arithmetic---which means in particular that the conversion
4788| is rounded according to the current rounding mode.  If `a' is a NaN, the
4789| largest positive integer is returned.  Otherwise, if the conversion
4790| overflows, the largest integer with the same sign as `a' is returned.
4791*----------------------------------------------------------------------------*/
4792
4793int32 floatx80_to_int32(floatx80 a, float_status *status)
4794{
4795    flag aSign;
4796    int32 aExp, shiftCount;
4797    uint64_t aSig;
4798
4799    aSig = extractFloatx80Frac( a );
4800    aExp = extractFloatx80Exp( a );
4801    aSign = extractFloatx80Sign( a );
4802    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4803    shiftCount = 0x4037 - aExp;
4804    if ( shiftCount <= 0 ) shiftCount = 1;
4805    shift64RightJamming( aSig, shiftCount, &aSig );
4806    return roundAndPackInt32(aSign, aSig, status);
4807
4808}
4809
4810/*----------------------------------------------------------------------------
4811| Returns the result of converting the extended double-precision floating-
4812| point value `a' to the 32-bit two's complement integer format.  The
4813| conversion is performed according to the IEC/IEEE Standard for Binary
4814| Floating-Point Arithmetic, except that the conversion is always rounded
4815| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4816| Otherwise, if the conversion overflows, the largest integer with the same
4817| sign as `a' is returned.
4818*----------------------------------------------------------------------------*/
4819
4820int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4821{
4822    flag aSign;
4823    int32 aExp, shiftCount;
4824    uint64_t aSig, savedASig;
4825    int32_t z;
4826
4827    aSig = extractFloatx80Frac( a );
4828    aExp = extractFloatx80Exp( a );
4829    aSign = extractFloatx80Sign( a );
4830    if ( 0x401E < aExp ) {
4831        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4832        goto invalid;
4833    }
4834    else if ( aExp < 0x3FFF ) {
4835        if (aExp || aSig) {
4836            status->float_exception_flags |= float_flag_inexact;
4837        }
4838        return 0;
4839    }
4840    shiftCount = 0x403E - aExp;
4841    savedASig = aSig;
4842    aSig >>= shiftCount;
4843    z = aSig;
4844    if ( aSign ) z = - z;
4845    if ( ( z < 0 ) ^ aSign ) {
4846 invalid:
4847        float_raise(float_flag_invalid, status);
4848        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4849    }
4850    if ( ( aSig<<shiftCount ) != savedASig ) {
4851        status->float_exception_flags |= float_flag_inexact;
4852    }
4853    return z;
4854
4855}
4856
4857/*----------------------------------------------------------------------------
4858| Returns the result of converting the extended double-precision floating-
4859| point value `a' to the 64-bit two's complement integer format.  The
4860| conversion is performed according to the IEC/IEEE Standard for Binary
4861| Floating-Point Arithmetic---which means in particular that the conversion
4862| is rounded according to the current rounding mode.  If `a' is a NaN,
4863| the largest positive integer is returned.  Otherwise, if the conversion
4864| overflows, the largest integer with the same sign as `a' is returned.
4865*----------------------------------------------------------------------------*/
4866
4867int64 floatx80_to_int64(floatx80 a, float_status *status)
4868{
4869    flag aSign;
4870    int32 aExp, shiftCount;
4871    uint64_t aSig, aSigExtra;
4872
4873    aSig = extractFloatx80Frac( a );
4874    aExp = extractFloatx80Exp( a );
4875    aSign = extractFloatx80Sign( a );
4876    shiftCount = 0x403E - aExp;
4877    if ( shiftCount <= 0 ) {
4878        if ( shiftCount ) {
4879            float_raise(float_flag_invalid, status);
4880            if (    ! aSign
4881                 || (    ( aExp == 0x7FFF )
4882                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4883               ) {
4884                return LIT64( 0x7FFFFFFFFFFFFFFF );
4885            }
4886            return (int64_t) LIT64( 0x8000000000000000 );
4887        }
4888        aSigExtra = 0;
4889    }
4890    else {
4891        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4892    }
4893    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4894
4895}
4896
4897/*----------------------------------------------------------------------------
4898| Returns the result of converting the extended double-precision floating-
4899| point value `a' to the 64-bit two's complement integer format.  The
4900| conversion is performed according to the IEC/IEEE Standard for Binary
4901| Floating-Point Arithmetic, except that the conversion is always rounded
4902| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4903| Otherwise, if the conversion overflows, the largest integer with the same
4904| sign as `a' is returned.
4905*----------------------------------------------------------------------------*/
4906
4907int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4908{
4909    flag aSign;
4910    int32 aExp, shiftCount;
4911    uint64_t aSig;
4912    int64 z;
4913
4914    aSig = extractFloatx80Frac( a );
4915    aExp = extractFloatx80Exp( a );
4916    aSign = extractFloatx80Sign( a );
4917    shiftCount = aExp - 0x403E;
4918    if ( 0 <= shiftCount ) {
4919        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4920        if ( ( a.high != 0xC03E ) || aSig ) {
4921            float_raise(float_flag_invalid, status);
4922            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4923                return LIT64( 0x7FFFFFFFFFFFFFFF );
4924            }
4925        }
4926        return (int64_t) LIT64( 0x8000000000000000 );
4927    }
4928    else if ( aExp < 0x3FFF ) {
4929        if (aExp | aSig) {
4930            status->float_exception_flags |= float_flag_inexact;
4931        }
4932        return 0;
4933    }
4934    z = aSig>>( - shiftCount );
4935    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4936        status->float_exception_flags |= float_flag_inexact;
4937    }
4938    if ( aSign ) z = - z;
4939    return z;
4940
4941}
4942
4943/*----------------------------------------------------------------------------
4944| Returns the result of converting the extended double-precision floating-
4945| point value `a' to the single-precision floating-point format.  The
4946| conversion is performed according to the IEC/IEEE Standard for Binary
4947| Floating-Point Arithmetic.
4948*----------------------------------------------------------------------------*/
4949
4950float32 floatx80_to_float32(floatx80 a, float_status *status)
4951{
4952    flag aSign;
4953    int32 aExp;
4954    uint64_t aSig;
4955
4956    aSig = extractFloatx80Frac( a );
4957    aExp = extractFloatx80Exp( a );
4958    aSign = extractFloatx80Sign( a );
4959    if ( aExp == 0x7FFF ) {
4960        if ( (uint64_t) ( aSig<<1 ) ) {
4961            return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4962        }
4963        return packFloat32( aSign, 0xFF, 0 );
4964    }
4965    shift64RightJamming( aSig, 33, &aSig );
4966    if ( aExp || aSig ) aExp -= 0x3F81;
4967    return roundAndPackFloat32(aSign, aExp, aSig, status);
4968
4969}
4970
4971/*----------------------------------------------------------------------------
4972| Returns the result of converting the extended double-precision floating-
4973| point value `a' to the double-precision floating-point format.  The
4974| conversion is performed according to the IEC/IEEE Standard for Binary
4975| Floating-Point Arithmetic.
4976*----------------------------------------------------------------------------*/
4977
4978float64 floatx80_to_float64(floatx80 a, float_status *status)
4979{
4980    flag aSign;
4981    int32 aExp;
4982    uint64_t aSig, zSig;
4983
4984    aSig = extractFloatx80Frac( a );
4985    aExp = extractFloatx80Exp( a );
4986    aSign = extractFloatx80Sign( a );
4987    if ( aExp == 0x7FFF ) {
4988        if ( (uint64_t) ( aSig<<1 ) ) {
4989            return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4990        }
4991        return packFloat64( aSign, 0x7FF, 0 );
4992    }
4993    shift64RightJamming( aSig, 1, &zSig );
4994    if ( aExp || aSig ) aExp -= 0x3C01;
4995    return roundAndPackFloat64(aSign, aExp, zSig, status);
4996
4997}
4998
4999/*----------------------------------------------------------------------------
5000| Returns the result of converting the extended double-precision floating-
5001| point value `a' to the quadruple-precision floating-point format.  The
5002| conversion is performed according to the IEC/IEEE Standard for Binary
5003| Floating-Point Arithmetic.
5004*----------------------------------------------------------------------------*/
5005
5006float128 floatx80_to_float128(floatx80 a, float_status *status)
5007{
5008    flag aSign;
5009    int_fast16_t aExp;
5010    uint64_t aSig, zSig0, zSig1;
5011
5012    aSig = extractFloatx80Frac( a );
5013    aExp = extractFloatx80Exp( a );
5014    aSign = extractFloatx80Sign( a );
5015    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5016        return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5017    }
5018    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5019    return packFloat128( aSign, aExp, zSig0, zSig1 );
5020
5021}
5022
5023/*----------------------------------------------------------------------------
5024| Rounds the extended double-precision floating-point value `a' to an integer,
5025| and returns the result as an extended quadruple-precision floating-point
5026| value.  The operation is performed according to the IEC/IEEE Standard for
5027| Binary Floating-Point Arithmetic.
5028*----------------------------------------------------------------------------*/
5029
5030floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5031{
5032    flag aSign;
5033    int32 aExp;
5034    uint64_t lastBitMask, roundBitsMask;
5035    floatx80 z;
5036
5037    aExp = extractFloatx80Exp( a );
5038    if ( 0x403E <= aExp ) {
5039        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5040            return propagateFloatx80NaN(a, a, status);
5041        }
5042        return a;
5043    }
5044    if ( aExp < 0x3FFF ) {
5045        if (    ( aExp == 0 )
5046             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5047            return a;
5048        }
5049        status->float_exception_flags |= float_flag_inexact;
5050        aSign = extractFloatx80Sign( a );
5051        switch (status->float_rounding_mode) {
5052         case float_round_nearest_even:
5053            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5054               ) {
5055                return
5056                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5057            }
5058            break;
5059        case float_round_ties_away:
5060            if (aExp == 0x3FFE) {
5061                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5062            }
5063            break;
5064         case float_round_down:
5065            return
5066                  aSign ?
5067                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5068                : packFloatx80( 0, 0, 0 );
5069         case float_round_up:
5070            return
5071                  aSign ? packFloatx80( 1, 0, 0 )
5072                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5073        }
5074        return packFloatx80( aSign, 0, 0 );
5075    }
5076    lastBitMask = 1;
5077    lastBitMask <<= 0x403E - aExp;
5078    roundBitsMask = lastBitMask - 1;
5079    z = a;
5080    switch (status->float_rounding_mode) {
5081    case float_round_nearest_even:
5082        z.low += lastBitMask>>1;
5083        if ((z.low & roundBitsMask) == 0) {
5084            z.low &= ~lastBitMask;
5085        }
5086        break;
5087    case float_round_ties_away:
5088        z.low += lastBitMask >> 1;
5089        break;
5090    case float_round_to_zero:
5091        break;
5092    case float_round_up:
5093        if (!extractFloatx80Sign(z)) {
5094            z.low += roundBitsMask;
5095        }
5096        break;
5097    case float_round_down:
5098        if (extractFloatx80Sign(z)) {
5099            z.low += roundBitsMask;
5100        }
5101        break;
5102    default:
5103        abort();
5104    }
5105    z.low &= ~ roundBitsMask;
5106    if ( z.low == 0 ) {
5107        ++z.high;
5108        z.low = LIT64( 0x8000000000000000 );
5109    }
5110    if (z.low != a.low) {
5111        status->float_exception_flags |= float_flag_inexact;
5112    }
5113    return z;
5114
5115}
5116
5117/*----------------------------------------------------------------------------
5118| Returns the result of adding the absolute values of the extended double-
5119| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5120| negated before being returned.  `zSign' is ignored if the result is a NaN.
5121| The addition is performed according to the IEC/IEEE Standard for Binary
5122| Floating-Point Arithmetic.
5123*----------------------------------------------------------------------------*/
5124
5125static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5126                                float_status *status)
5127{
5128    int32 aExp, bExp, zExp;
5129    uint64_t aSig, bSig, zSig0, zSig1;
5130    int32 expDiff;
5131
5132    aSig = extractFloatx80Frac( a );
5133    aExp = extractFloatx80Exp( a );
5134    bSig = extractFloatx80Frac( b );
5135    bExp = extractFloatx80Exp( b );
5136    expDiff = aExp - bExp;
5137    if ( 0 < expDiff ) {
5138        if ( aExp == 0x7FFF ) {
5139            if ((uint64_t)(aSig << 1)) {
5140                return propagateFloatx80NaN(a, b, status);
5141            }
5142            return a;
5143        }
5144        if ( bExp == 0 ) --expDiff;
5145        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5146        zExp = aExp;
5147    }
5148    else if ( expDiff < 0 ) {
5149        if ( bExp == 0x7FFF ) {
5150            if ((uint64_t)(bSig << 1)) {
5151                return propagateFloatx80NaN(a, b, status);
5152            }
5153            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5154        }
5155        if ( aExp == 0 ) ++expDiff;
5156        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5157        zExp = bExp;
5158    }
5159    else {
5160        if ( aExp == 0x7FFF ) {
5161            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5162                return propagateFloatx80NaN(a, b, status);
5163            }
5164            return a;
5165        }
5166        zSig1 = 0;
5167        zSig0 = aSig + bSig;
5168        if ( aExp == 0 ) {
5169            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5170            goto roundAndPack;
5171        }
5172        zExp = aExp;
5173        goto shiftRight1;
5174    }
5175    zSig0 = aSig + bSig;
5176    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5177 shiftRight1:
5178    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5179    zSig0 |= LIT64( 0x8000000000000000 );
5180    ++zExp;
5181 roundAndPack:
5182    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183                                zSign, zExp, zSig0, zSig1, status);
5184}
5185
5186/*----------------------------------------------------------------------------
5187| Returns the result of subtracting the absolute values of the extended
5188| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5189| difference is negated before being returned.  `zSign' is ignored if the
5190| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5191| Standard for Binary Floating-Point Arithmetic.
5192*----------------------------------------------------------------------------*/
5193
5194static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195                                float_status *status)
5196{
5197    int32 aExp, bExp, zExp;
5198    uint64_t aSig, bSig, zSig0, zSig1;
5199    int32 expDiff;
5200    floatx80 z;
5201
5202    aSig = extractFloatx80Frac( a );
5203    aExp = extractFloatx80Exp( a );
5204    bSig = extractFloatx80Frac( b );
5205    bExp = extractFloatx80Exp( b );
5206    expDiff = aExp - bExp;
5207    if ( 0 < expDiff ) goto aExpBigger;
5208    if ( expDiff < 0 ) goto bExpBigger;
5209    if ( aExp == 0x7FFF ) {
5210        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5211            return propagateFloatx80NaN(a, b, status);
5212        }
5213        float_raise(float_flag_invalid, status);
5214        z.low = floatx80_default_nan_low;
5215        z.high = floatx80_default_nan_high;
5216        return z;
5217    }
5218    if ( aExp == 0 ) {
5219        aExp = 1;
5220        bExp = 1;
5221    }
5222    zSig1 = 0;
5223    if ( bSig < aSig ) goto aBigger;
5224    if ( aSig < bSig ) goto bBigger;
5225    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5226 bExpBigger:
5227    if ( bExp == 0x7FFF ) {
5228        if ((uint64_t)(bSig << 1)) {
5229            return propagateFloatx80NaN(a, b, status);
5230        }
5231        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5232    }
5233    if ( aExp == 0 ) ++expDiff;
5234    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5235 bBigger:
5236    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5237    zExp = bExp;
5238    zSign ^= 1;
5239    goto normalizeRoundAndPack;
5240 aExpBigger:
5241    if ( aExp == 0x7FFF ) {
5242        if ((uint64_t)(aSig << 1)) {
5243            return propagateFloatx80NaN(a, b, status);
5244        }
5245        return a;
5246    }
5247    if ( bExp == 0 ) --expDiff;
5248    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249 aBigger:
5250    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5251    zExp = aExp;
5252 normalizeRoundAndPack:
5253    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5254                                         zSign, zExp, zSig0, zSig1, status);
5255}
5256
5257/*----------------------------------------------------------------------------
5258| Returns the result of adding the extended double-precision floating-point
5259| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5260| Standard for Binary Floating-Point Arithmetic.
5261*----------------------------------------------------------------------------*/
5262
5263floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5264{
5265    flag aSign, bSign;
5266
5267    aSign = extractFloatx80Sign( a );
5268    bSign = extractFloatx80Sign( b );
5269    if ( aSign == bSign ) {
5270        return addFloatx80Sigs(a, b, aSign, status);
5271    }
5272    else {
5273        return subFloatx80Sigs(a, b, aSign, status);
5274    }
5275
5276}
5277
5278/*----------------------------------------------------------------------------
5279| Returns the result of subtracting the extended double-precision floating-
5280| point values `a' and `b'.  The operation is performed according to the
5281| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282*----------------------------------------------------------------------------*/
5283
5284floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5285{
5286    flag aSign, bSign;
5287
5288    aSign = extractFloatx80Sign( a );
5289    bSign = extractFloatx80Sign( b );
5290    if ( aSign == bSign ) {
5291        return subFloatx80Sigs(a, b, aSign, status);
5292    }
5293    else {
5294        return addFloatx80Sigs(a, b, aSign, status);
5295    }
5296
5297}
5298
5299/*----------------------------------------------------------------------------
5300| Returns the result of multiplying the extended double-precision floating-
5301| point values `a' and `b'.  The operation is performed according to the
5302| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303*----------------------------------------------------------------------------*/
5304
5305floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5306{
5307    flag aSign, bSign, zSign;
5308    int32 aExp, bExp, zExp;
5309    uint64_t aSig, bSig, zSig0, zSig1;
5310    floatx80 z;
5311
5312    aSig = extractFloatx80Frac( a );
5313    aExp = extractFloatx80Exp( a );
5314    aSign = extractFloatx80Sign( a );
5315    bSig = extractFloatx80Frac( b );
5316    bExp = extractFloatx80Exp( b );
5317    bSign = extractFloatx80Sign( b );
5318    zSign = aSign ^ bSign;
5319    if ( aExp == 0x7FFF ) {
5320        if (    (uint64_t) ( aSig<<1 )
5321             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5322            return propagateFloatx80NaN(a, b, status);
5323        }
5324        if ( ( bExp | bSig ) == 0 ) goto invalid;
5325        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326    }
5327    if ( bExp == 0x7FFF ) {
5328        if ((uint64_t)(bSig << 1)) {
5329            return propagateFloatx80NaN(a, b, status);
5330        }
5331        if ( ( aExp | aSig ) == 0 ) {
5332 invalid:
5333            float_raise(float_flag_invalid, status);
5334            z.low = floatx80_default_nan_low;
5335            z.high = floatx80_default_nan_high;
5336            return z;
5337        }
5338        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5339    }
5340    if ( aExp == 0 ) {
5341        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5342        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5343    }
5344    if ( bExp == 0 ) {
5345        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5346        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5347    }
5348    zExp = aExp + bExp - 0x3FFE;
5349    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5350    if ( 0 < (int64_t) zSig0 ) {
5351        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5352        --zExp;
5353    }
5354    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5355                                zSign, zExp, zSig0, zSig1, status);
5356}
5357
5358/*----------------------------------------------------------------------------
5359| Returns the result of dividing the extended double-precision floating-point
5360| value `a' by the corresponding value `b'.  The operation is performed
5361| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362*----------------------------------------------------------------------------*/
5363
5364floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5365{
5366    flag aSign, bSign, zSign;
5367    int32 aExp, bExp, zExp;
5368    uint64_t aSig, bSig, zSig0, zSig1;
5369    uint64_t rem0, rem1, rem2, term0, term1, term2;
5370    floatx80 z;
5371
5372    aSig = extractFloatx80Frac( a );
5373    aExp = extractFloatx80Exp( a );
5374    aSign = extractFloatx80Sign( a );
5375    bSig = extractFloatx80Frac( b );
5376    bExp = extractFloatx80Exp( b );
5377    bSign = extractFloatx80Sign( b );
5378    zSign = aSign ^ bSign;
5379    if ( aExp == 0x7FFF ) {
5380        if ((uint64_t)(aSig << 1)) {
5381            return propagateFloatx80NaN(a, b, status);
5382        }
5383        if ( bExp == 0x7FFF ) {
5384            if ((uint64_t)(bSig << 1)) {
5385                return propagateFloatx80NaN(a, b, status);
5386            }
5387            goto invalid;
5388        }
5389        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5390    }
5391    if ( bExp == 0x7FFF ) {
5392        if ((uint64_t)(bSig << 1)) {
5393            return propagateFloatx80NaN(a, b, status);
5394        }
5395        return packFloatx80( zSign, 0, 0 );
5396    }
5397    if ( bExp == 0 ) {
5398        if ( bSig == 0 ) {
5399            if ( ( aExp | aSig ) == 0 ) {
5400 invalid:
5401                float_raise(float_flag_invalid, status);
5402                z.low = floatx80_default_nan_low;
5403                z.high = floatx80_default_nan_high;
5404                return z;
5405            }
5406            float_raise(float_flag_divbyzero, status);
5407            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5408        }
5409        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5410    }
5411    if ( aExp == 0 ) {
5412        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5413        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5414    }
5415    zExp = aExp - bExp + 0x3FFE;
5416    rem1 = 0;
5417    if ( bSig <= aSig ) {
5418        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5419        ++zExp;
5420    }
5421    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5422    mul64To128( bSig, zSig0, &term0, &term1 );
5423    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5424    while ( (int64_t) rem0 < 0 ) {
5425        --zSig0;
5426        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5427    }
5428    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5429    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5430        mul64To128( bSig, zSig1, &term1, &term2 );
5431        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5432        while ( (int64_t) rem1 < 0 ) {
5433            --zSig1;
5434            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5435        }
5436        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5437    }
5438    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5439                                zSign, zExp, zSig0, zSig1, status);
5440}
5441
5442/*----------------------------------------------------------------------------
5443| Returns the remainder of the extended double-precision floating-point value
5444| `a' with respect to the corresponding value `b'.  The operation is performed
5445| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5446*----------------------------------------------------------------------------*/
5447
5448floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5449{
5450    flag aSign, zSign;
5451    int32 aExp, bExp, expDiff;
5452    uint64_t aSig0, aSig1, bSig;
5453    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5454    floatx80 z;
5455
5456    aSig0 = extractFloatx80Frac( a );
5457    aExp = extractFloatx80Exp( a );
5458    aSign = extractFloatx80Sign( a );
5459    bSig = extractFloatx80Frac( b );
5460    bExp = extractFloatx80Exp( b );
5461    if ( aExp == 0x7FFF ) {
5462        if (    (uint64_t) ( aSig0<<1 )
5463             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5464            return propagateFloatx80NaN(a, b, status);
5465        }
5466        goto invalid;
5467    }
5468    if ( bExp == 0x7FFF ) {
5469        if ((uint64_t)(bSig << 1)) {
5470            return propagateFloatx80NaN(a, b, status);
5471        }
5472        return a;
5473    }
5474    if ( bExp == 0 ) {
5475        if ( bSig == 0 ) {
5476 invalid:
5477            float_raise(float_flag_invalid, status);
5478            z.low = floatx80_default_nan_low;
5479            z.high = floatx80_default_nan_high;
5480            return z;
5481        }
5482        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5483    }
5484    if ( aExp == 0 ) {
5485        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5486        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5487    }
5488    bSig |= LIT64( 0x8000000000000000 );
5489    zSign = aSign;
5490    expDiff = aExp - bExp;
5491    aSig1 = 0;
5492    if ( expDiff < 0 ) {
5493        if ( expDiff < -1 ) return a;
5494        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5495        expDiff = 0;
5496    }
5497    q = ( bSig <= aSig0 );
5498    if ( q ) aSig0 -= bSig;
5499    expDiff -= 64;
5500    while ( 0 < expDiff ) {
5501        q = estimateDiv128To64( aSig0, aSig1, bSig );
5502        q = ( 2 < q ) ? q - 2 : 0;
5503        mul64To128( bSig, q, &term0, &term1 );
5504        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5505        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5506        expDiff -= 62;
5507    }
5508    expDiff += 64;
5509    if ( 0 < expDiff ) {
5510        q = estimateDiv128To64( aSig0, aSig1, bSig );
5511        q = ( 2 < q ) ? q - 2 : 0;
5512        q >>= 64 - expDiff;
5513        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5514        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5516        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5517            ++q;
5518            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5519        }
5520    }
5521    else {
5522        term1 = 0;
5523        term0 = bSig;
5524    }
5525    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5526    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5528              && ( q & 1 ) )
5529       ) {
5530        aSig0 = alternateASig0;
5531        aSig1 = alternateASig1;
5532        zSign = ! zSign;
5533    }
5534    return
5535        normalizeRoundAndPackFloatx80(
5536            80, zSign, bExp + expDiff, aSig0, aSig1, status);
5537
5538}
5539
5540/*----------------------------------------------------------------------------
5541| Returns the square root of the extended double-precision floating-point
5542| value `a'.  The operation is performed according to the IEC/IEEE Standard
5543| for Binary Floating-Point Arithmetic.
5544*----------------------------------------------------------------------------*/
5545
5546floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5547{
5548    flag aSign;
5549    int32 aExp, zExp;
5550    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5551    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5552    floatx80 z;
5553
5554    aSig0 = extractFloatx80Frac( a );
5555    aExp = extractFloatx80Exp( a );
5556    aSign = extractFloatx80Sign( a );
5557    if ( aExp == 0x7FFF ) {
5558        if ((uint64_t)(aSig0 << 1)) {
5559            return propagateFloatx80NaN(a, a, status);
5560        }
5561        if ( ! aSign ) return a;
5562        goto invalid;
5563    }
5564    if ( aSign ) {
5565        if ( ( aExp | aSig0 ) == 0 ) return a;
5566 invalid:
5567        float_raise(float_flag_invalid, status);
5568        z.low = floatx80_default_nan_low;
5569        z.high = floatx80_default_nan_high;
5570        return z;
5571    }
5572    if ( aExp == 0 ) {
5573        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5574        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5575    }
5576    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5577    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5578    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5579    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5580    doubleZSig0 = zSig0<<1;
5581    mul64To128( zSig0, zSig0, &term0, &term1 );
5582    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5583    while ( (int64_t) rem0 < 0 ) {
5584        --zSig0;
5585        doubleZSig0 -= 2;
5586        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5587    }
5588    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5589    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5590        if ( zSig1 == 0 ) zSig1 = 1;
5591        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5592        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5593        mul64To128( zSig1, zSig1, &term2, &term3 );
5594        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5595        while ( (int64_t) rem1 < 0 ) {
5596            --zSig1;
5597            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5598            term3 |= 1;
5599            term2 |= doubleZSig0;
5600            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5601        }
5602        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5603    }
5604    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5605    zSig0 |= doubleZSig0;
5606    return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607                                0, zExp, zSig0, zSig1, status);
5608}
5609
5610/*----------------------------------------------------------------------------
5611| Returns 1 if the extended double-precision floating-point value `a' is equal
5612| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5613| raised if either operand is a NaN.  Otherwise, the comparison is performed
5614| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5615*----------------------------------------------------------------------------*/
5616
5617int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5618{
5619
5620    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5621              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5622         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5623              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5624       ) {
5625        float_raise(float_flag_invalid, status);
5626        return 0;
5627    }
5628    return
5629           ( a.low == b.low )
5630        && (    ( a.high == b.high )
5631             || (    ( a.low == 0 )
5632                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5633           );
5634
5635}
5636
5637/*----------------------------------------------------------------------------
5638| Returns 1 if the extended double-precision floating-point value `a' is
5639| less than or equal to the corresponding value `b', and 0 otherwise.  The
5640| invalid exception is raised if either operand is a NaN.  The comparison is
5641| performed according to the IEC/IEEE Standard for Binary Floating-Point
5642| Arithmetic.
5643*----------------------------------------------------------------------------*/
5644
5645int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5646{
5647    flag aSign, bSign;
5648
5649    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5650              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5651         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5652              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5653       ) {
5654        float_raise(float_flag_invalid, status);
5655        return 0;
5656    }
5657    aSign = extractFloatx80Sign( a );
5658    bSign = extractFloatx80Sign( b );
5659    if ( aSign != bSign ) {
5660        return
5661               aSign
5662            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5663                 == 0 );
5664    }
5665    return
5666          aSign ? le128( b.high, b.low, a.high, a.low )
5667        : le128( a.high, a.low, b.high, b.low );
5668
5669}
5670
5671/*----------------------------------------------------------------------------
5672| Returns 1 if the extended double-precision floating-point value `a' is
5673| less than the corresponding value `b', and 0 otherwise.  The invalid
5674| exception is raised if either operand is a NaN.  The comparison is performed
5675| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5676*----------------------------------------------------------------------------*/
5677
5678int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5679{
5680    flag aSign, bSign;
5681
5682    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5683              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5684         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5685              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5686       ) {
5687        float_raise(float_flag_invalid, status);
5688        return 0;
5689    }
5690    aSign = extractFloatx80Sign( a );
5691    bSign = extractFloatx80Sign( b );
5692    if ( aSign != bSign ) {
5693        return
5694               aSign
5695            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5696                 != 0 );
5697    }
5698    return
5699          aSign ? lt128( b.high, b.low, a.high, a.low )
5700        : lt128( a.high, a.low, b.high, b.low );
5701
5702}
5703
5704/*----------------------------------------------------------------------------
5705| Returns 1 if the extended double-precision floating-point values `a' and `b'
5706| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5707| either operand is a NaN.   The comparison is performed according to the
5708| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709*----------------------------------------------------------------------------*/
5710int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5711{
5712    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5713              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5714         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5715              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5716       ) {
5717        float_raise(float_flag_invalid, status);
5718        return 1;
5719    }
5720    return 0;
5721}
5722
5723/*----------------------------------------------------------------------------
5724| Returns 1 if the extended double-precision floating-point value `a' is
5725| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5726| cause an exception.  The comparison is performed according to the IEC/IEEE
5727| Standard for Binary Floating-Point Arithmetic.
5728*----------------------------------------------------------------------------*/
5729
5730int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5731{
5732
5733    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5734              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5735         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5736              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5737       ) {
5738        if (    floatx80_is_signaling_nan( a )
5739             || floatx80_is_signaling_nan( b ) ) {
5740            float_raise(float_flag_invalid, status);
5741        }
5742        return 0;
5743    }
5744    return
5745           ( a.low == b.low )
5746        && (    ( a.high == b.high )
5747             || (    ( a.low == 0 )
5748                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5749           );
5750
5751}
5752
5753/*----------------------------------------------------------------------------
5754| Returns 1 if the extended double-precision floating-point value `a' is less
5755| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5756| do not cause an exception.  Otherwise, the comparison is performed according
5757| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5758*----------------------------------------------------------------------------*/
5759
5760int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5761{
5762    flag aSign, bSign;
5763
5764    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5765              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5766         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5767              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5768       ) {
5769        if (    floatx80_is_signaling_nan( a )
5770             || floatx80_is_signaling_nan( b ) ) {
5771            float_raise(float_flag_invalid, status);
5772        }
5773        return 0;
5774    }
5775    aSign = extractFloatx80Sign( a );
5776    bSign = extractFloatx80Sign( b );
5777    if ( aSign != bSign ) {
5778        return
5779               aSign
5780            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781                 == 0 );
5782    }
5783    return
5784          aSign ? le128( b.high, b.low, a.high, a.low )
5785        : le128( a.high, a.low, b.high, b.low );
5786
5787}
5788
5789/*----------------------------------------------------------------------------
5790| Returns 1 if the extended double-precision floating-point value `a' is less
5791| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5792| an exception.  Otherwise, the comparison is performed according to the
5793| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794*----------------------------------------------------------------------------*/
5795
5796int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5797{
5798    flag aSign, bSign;
5799
5800    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5801              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5803              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5804       ) {
5805        if (    floatx80_is_signaling_nan( a )
5806             || floatx80_is_signaling_nan( b ) ) {
5807            float_raise(float_flag_invalid, status);
5808        }
5809        return 0;
5810    }
5811    aSign = extractFloatx80Sign( a );
5812    bSign = extractFloatx80Sign( b );
5813    if ( aSign != bSign ) {
5814        return
5815               aSign
5816            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5817                 != 0 );
5818    }
5819    return
5820          aSign ? lt128( b.high, b.low, a.high, a.low )
5821        : lt128( a.high, a.low, b.high, b.low );
5822
5823}
5824
5825/*----------------------------------------------------------------------------
5826| Returns 1 if the extended double-precision floating-point values `a' and `b'
5827| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5828| The comparison is performed according to the IEC/IEEE Standard for Binary
5829| Floating-Point Arithmetic.
5830*----------------------------------------------------------------------------*/
5831int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5832{
5833    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5834              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5835         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5836              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5837       ) {
5838        if (    floatx80_is_signaling_nan( a )
5839             || floatx80_is_signaling_nan( b ) ) {
5840            float_raise(float_flag_invalid, status);
5841        }
5842        return 1;
5843    }
5844    return 0;
5845}
5846
5847/*----------------------------------------------------------------------------
5848| Returns the result of converting the quadruple-precision floating-point
5849| value `a' to the 32-bit two's complement integer format.  The conversion
5850| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5851| Arithmetic---which means in particular that the conversion is rounded
5852| according to the current rounding mode.  If `a' is a NaN, the largest
5853| positive integer is returned.  Otherwise, if the conversion overflows, the
5854| largest integer with the same sign as `a' is returned.
5855*----------------------------------------------------------------------------*/
5856
5857int32 float128_to_int32(float128 a, float_status *status)
5858{
5859    flag aSign;
5860    int32 aExp, shiftCount;
5861    uint64_t aSig0, aSig1;
5862
5863    aSig1 = extractFloat128Frac1( a );
5864    aSig0 = extractFloat128Frac0( a );
5865    aExp = extractFloat128Exp( a );
5866    aSign = extractFloat128Sign( a );
5867    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5868    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5869    aSig0 |= ( aSig1 != 0 );
5870    shiftCount = 0x4028 - aExp;
5871    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5872    return roundAndPackInt32(aSign, aSig0, status);
5873
5874}
5875
5876/*----------------------------------------------------------------------------
5877| Returns the result of converting the quadruple-precision floating-point
5878| value `a' to the 32-bit two's complement integer format.  The conversion
5879| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5880| Arithmetic, except that the conversion is always rounded toward zero.  If
5881| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5882| conversion overflows, the largest integer with the same sign as `a' is
5883| returned.
5884*----------------------------------------------------------------------------*/
5885
5886int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
5887{
5888    flag aSign;
5889    int32 aExp, shiftCount;
5890    uint64_t aSig0, aSig1, savedASig;
5891    int32_t z;
5892
5893    aSig1 = extractFloat128Frac1( a );
5894    aSig0 = extractFloat128Frac0( a );
5895    aExp = extractFloat128Exp( a );
5896    aSign = extractFloat128Sign( a );
5897    aSig0 |= ( aSig1 != 0 );
5898    if ( 0x401E < aExp ) {
5899        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5900        goto invalid;
5901    }
5902    else if ( aExp < 0x3FFF ) {
5903        if (aExp || aSig0) {
5904            status->float_exception_flags |= float_flag_inexact;
5905        }
5906        return 0;
5907    }
5908    aSig0 |= LIT64( 0x0001000000000000 );
5909    shiftCount = 0x402F - aExp;
5910    savedASig = aSig0;
5911    aSig0 >>= shiftCount;
5912    z = aSig0;
5913    if ( aSign ) z = - z;
5914    if ( ( z < 0 ) ^ aSign ) {
5915 invalid:
5916        float_raise(float_flag_invalid, status);
5917        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5918    }
5919    if ( ( aSig0<<shiftCount ) != savedASig ) {
5920        status->float_exception_flags |= float_flag_inexact;
5921    }
5922    return z;
5923
5924}
5925
5926/*----------------------------------------------------------------------------
5927| Returns the result of converting the quadruple-precision floating-point
5928| value `a' to the 64-bit two's complement integer format.  The conversion
5929| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930| Arithmetic---which means in particular that the conversion is rounded
5931| according to the current rounding mode.  If `a' is a NaN, the largest
5932| positive integer is returned.  Otherwise, if the conversion overflows, the
5933| largest integer with the same sign as `a' is returned.
5934*----------------------------------------------------------------------------*/
5935
5936int64 float128_to_int64(float128 a, float_status *status)
5937{
5938    flag aSign;
5939    int32 aExp, shiftCount;
5940    uint64_t aSig0, aSig1;
5941
5942    aSig1 = extractFloat128Frac1( a );
5943    aSig0 = extractFloat128Frac0( a );
5944    aExp = extractFloat128Exp( a );
5945    aSign = extractFloat128Sign( a );
5946    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5947    shiftCount = 0x402F - aExp;
5948    if ( shiftCount <= 0 ) {
5949        if ( 0x403E < aExp ) {
5950            float_raise(float_flag_invalid, status);
5951            if (    ! aSign
5952                 || (    ( aExp == 0x7FFF )
5953                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5954                    )
5955               ) {
5956                return LIT64( 0x7FFFFFFFFFFFFFFF );
5957            }
5958            return (int64_t) LIT64( 0x8000000000000000 );
5959        }
5960        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5961    }
5962    else {
5963        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5964    }
5965    return roundAndPackInt64(aSign, aSig0, aSig1, status);
5966
5967}
5968
5969/*----------------------------------------------------------------------------
5970| Returns the result of converting the quadruple-precision floating-point
5971| value `a' to the 64-bit two's complement integer format.  The conversion
5972| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5973| Arithmetic, except that the conversion is always rounded toward zero.
5974| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5975| the conversion overflows, the largest integer with the same sign as `a' is
5976| returned.
5977*----------------------------------------------------------------------------*/
5978
5979int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
5980{
5981    flag aSign;
5982    int32 aExp, shiftCount;
5983    uint64_t aSig0, aSig1;
5984    int64 z;
5985
5986    aSig1 = extractFloat128Frac1( a );
5987    aSig0 = extractFloat128Frac0( a );
5988    aExp = extractFloat128Exp( a );
5989    aSign = extractFloat128Sign( a );
5990    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5991    shiftCount = aExp - 0x402F;
5992    if ( 0 < shiftCount ) {
5993        if ( 0x403E <= aExp ) {
5994            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5995            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5996                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5997                if (aSig1) {
5998                    status->float_exception_flags |= float_flag_inexact;
5999                }
6000            }
6001            else {
6002                float_raise(float_flag_invalid, status);
6003                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6004                    return LIT64( 0x7FFFFFFFFFFFFFFF );
6005                }
6006            }
6007            return (int64_t) LIT64( 0x8000000000000000 );
6008        }
6009        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6010        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6011            status->float_exception_flags |= float_flag_inexact;
6012        }
6013    }
6014    else {
6015        if ( aExp < 0x3FFF ) {
6016            if ( aExp | aSig0 | aSig1 ) {
6017                status->float_exception_flags |= float_flag_inexact;
6018            }
6019            return 0;
6020        }
6021        z = aSig0>>( - shiftCount );
6022        if (    aSig1
6023             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6024            status->float_exception_flags |= float_flag_inexact;
6025        }
6026    }
6027    if ( aSign ) z = - z;
6028    return z;
6029
6030}
6031
6032/*----------------------------------------------------------------------------
6033| Returns the result of converting the quadruple-precision floating-point
6034| value `a' to the single-precision floating-point format.  The conversion
6035| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6036| Arithmetic.
6037*----------------------------------------------------------------------------*/
6038
6039float32 float128_to_float32(float128 a, float_status *status)
6040{
6041    flag aSign;
6042    int32 aExp;
6043    uint64_t aSig0, aSig1;
6044    uint32_t zSig;
6045
6046    aSig1 = extractFloat128Frac1( a );
6047    aSig0 = extractFloat128Frac0( a );
6048    aExp = extractFloat128Exp( a );
6049    aSign = extractFloat128Sign( a );
6050    if ( aExp == 0x7FFF ) {
6051        if ( aSig0 | aSig1 ) {
6052            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6053        }
6054        return packFloat32( aSign, 0xFF, 0 );
6055    }
6056    aSig0 |= ( aSig1 != 0 );
6057    shift64RightJamming( aSig0, 18, &aSig0 );
6058    zSig = aSig0;
6059    if ( aExp || zSig ) {
6060        zSig |= 0x40000000;
6061        aExp -= 0x3F81;
6062    }
6063    return roundAndPackFloat32(aSign, aExp, zSig, status);
6064
6065}
6066
6067/*----------------------------------------------------------------------------
6068| Returns the result of converting the quadruple-precision floating-point
6069| value `a' to the double-precision floating-point format.  The conversion
6070| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071| Arithmetic.
6072*----------------------------------------------------------------------------*/
6073
6074float64 float128_to_float64(float128 a, float_status *status)
6075{
6076    flag aSign;
6077    int32 aExp;
6078    uint64_t aSig0, aSig1;
6079
6080    aSig1 = extractFloat128Frac1( a );
6081    aSig0 = extractFloat128Frac0( a );
6082    aExp = extractFloat128Exp( a );
6083    aSign = extractFloat128Sign( a );
6084    if ( aExp == 0x7FFF ) {
6085        if ( aSig0 | aSig1 ) {
6086            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6087        }
6088        return packFloat64( aSign, 0x7FF, 0 );
6089    }
6090    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6091    aSig0 |= ( aSig1 != 0 );
6092    if ( aExp || aSig0 ) {
6093        aSig0 |= LIT64( 0x4000000000000000 );
6094        aExp -= 0x3C01;
6095    }
6096    return roundAndPackFloat64(aSign, aExp, aSig0, status);
6097
6098}
6099
6100/*----------------------------------------------------------------------------
6101| Returns the result of converting the quadruple-precision floating-point
6102| value `a' to the extended double-precision floating-point format.  The
6103| conversion is performed according to the IEC/IEEE Standard for Binary
6104| Floating-Point Arithmetic.
6105*----------------------------------------------------------------------------*/
6106
6107floatx80 float128_to_floatx80(float128 a, float_status *status)
6108{
6109    flag aSign;
6110    int32 aExp;
6111    uint64_t aSig0, aSig1;
6112
6113    aSig1 = extractFloat128Frac1( a );
6114    aSig0 = extractFloat128Frac0( a );
6115    aExp = extractFloat128Exp( a );
6116    aSign = extractFloat128Sign( a );
6117    if ( aExp == 0x7FFF ) {
6118        if ( aSig0 | aSig1 ) {
6119            return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6120        }
6121        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6122    }
6123    if ( aExp == 0 ) {
6124        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6125        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126    }
6127    else {
6128        aSig0 |= LIT64( 0x0001000000000000 );
6129    }
6130    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6131    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6132
6133}
6134
6135/*----------------------------------------------------------------------------
6136| Rounds the quadruple-precision floating-point value `a' to an integer, and
6137| returns the result as a quadruple-precision floating-point value.  The
6138| operation is performed according to the IEC/IEEE Standard for Binary
6139| Floating-Point Arithmetic.
6140*----------------------------------------------------------------------------*/
6141
6142float128 float128_round_to_int(float128 a, float_status *status)
6143{
6144    flag aSign;
6145    int32 aExp;
6146    uint64_t lastBitMask, roundBitsMask;
6147    float128 z;
6148
6149    aExp = extractFloat128Exp( a );
6150    if ( 0x402F <= aExp ) {
6151        if ( 0x406F <= aExp ) {
6152            if (    ( aExp == 0x7FFF )
6153                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6154               ) {
6155                return propagateFloat128NaN(a, a, status);
6156            }
6157            return a;
6158        }
6159        lastBitMask = 1;
6160        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6161        roundBitsMask = lastBitMask - 1;
6162        z = a;
6163        switch (status->float_rounding_mode) {
6164        case float_round_nearest_even:
6165            if ( lastBitMask ) {
6166                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6167                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6168            }
6169            else {
6170                if ( (int64_t) z.low < 0 ) {
6171                    ++z.high;
6172                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6173                }
6174            }
6175            break;
6176        case float_round_ties_away:
6177            if (lastBitMask) {
6178                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6179            } else {
6180                if ((int64_t) z.low < 0) {
6181                    ++z.high;
6182                }
6183            }
6184            break;
6185        case float_round_to_zero:
6186            break;
6187        case float_round_up:
6188            if (!extractFloat128Sign(z)) {
6189                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6190            }
6191            break;
6192        case float_round_down:
6193            if (extractFloat128Sign(z)) {
6194                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6195            }
6196            break;
6197        default:
6198            abort();
6199        }
6200        z.low &= ~ roundBitsMask;
6201    }
6202    else {
6203        if ( aExp < 0x3FFF ) {
6204            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6205            status->float_exception_flags |= float_flag_inexact;
6206            aSign = extractFloat128Sign( a );
6207            switch (status->float_rounding_mode) {
6208             case float_round_nearest_even:
6209                if (    ( aExp == 0x3FFE )
6210                     && (   extractFloat128Frac0( a )
6211                          | extractFloat128Frac1( a ) )
6212                   ) {
6213                    return packFloat128( aSign, 0x3FFF, 0, 0 );
6214                }
6215                break;
6216            case float_round_ties_away:
6217                if (aExp == 0x3FFE) {
6218                    return packFloat128(aSign, 0x3FFF, 0, 0);
6219                }
6220                break;
6221             case float_round_down:
6222                return
6223                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6224                    : packFloat128( 0, 0, 0, 0 );
6225             case float_round_up:
6226                return
6227                      aSign ? packFloat128( 1, 0, 0, 0 )
6228                    : packFloat128( 0, 0x3FFF, 0, 0 );
6229            }
6230            return packFloat128( aSign, 0, 0, 0 );
6231        }
6232        lastBitMask = 1;
6233        lastBitMask <<= 0x402F - aExp;
6234        roundBitsMask = lastBitMask - 1;
6235        z.low = 0;
6236        z.high = a.high;
6237        switch (status->float_rounding_mode) {
6238        case float_round_nearest_even:
6239            z.high += lastBitMask>>1;
6240            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6241                z.high &= ~ lastBitMask;
6242            }
6243            break;
6244        case float_round_ties_away:
6245            z.high += lastBitMask>>1;
6246            break;
6247        case float_round_to_zero:
6248            break;
6249        case float_round_up:
6250            if (!extractFloat128Sign(z)) {
6251                z.high |= ( a.low != 0 );
6252                z.high += roundBitsMask;
6253            }
6254            break;
6255        case float_round_down:
6256            if (extractFloat128Sign(z)) {
6257                z.high |= (a.low != 0);
6258                z.high += roundBitsMask;
6259            }
6260            break;
6261        default:
6262            abort();
6263        }
6264        z.high &= ~ roundBitsMask;
6265    }
6266    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6267        status->float_exception_flags |= float_flag_inexact;
6268    }
6269    return z;
6270
6271}
6272
6273/*----------------------------------------------------------------------------
6274| Returns the result of adding the absolute values of the quadruple-precision
6275| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6276| before being returned.  `zSign' is ignored if the result is a NaN.
6277| The addition is performed according to the IEC/IEEE Standard for Binary
6278| Floating-Point Arithmetic.
6279*----------------------------------------------------------------------------*/
6280
6281static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6282                                float_status *status)
6283{
6284    int32 aExp, bExp, zExp;
6285    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6286    int32 expDiff;
6287
6288    aSig1 = extractFloat128Frac1( a );
6289    aSig0 = extractFloat128Frac0( a );
6290    aExp = extractFloat128Exp( a );
6291    bSig1 = extractFloat128Frac1( b );
6292    bSig0 = extractFloat128Frac0( b );
6293    bExp = extractFloat128Exp( b );
6294    expDiff = aExp - bExp;
6295    if ( 0 < expDiff ) {
6296        if ( aExp == 0x7FFF ) {
6297            if (aSig0 | aSig1) {
6298                return propagateFloat128NaN(a, b, status);
6299            }
6300            return a;
6301        }
6302        if ( bExp == 0 ) {
6303            --expDiff;
6304        }
6305        else {
6306            bSig0 |= LIT64( 0x0001000000000000 );
6307        }
6308        shift128ExtraRightJamming(
6309            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6310        zExp = aExp;
6311    }
6312    else if ( expDiff < 0 ) {
6313        if ( bExp == 0x7FFF ) {
6314            if (bSig0 | bSig1) {
6315                return propagateFloat128NaN(a, b, status);
6316            }
6317            return packFloat128( zSign, 0x7FFF, 0, 0 );
6318        }
6319        if ( aExp == 0 ) {
6320            ++expDiff;
6321        }
6322        else {
6323            aSig0 |= LIT64( 0x0001000000000000 );
6324        }
6325        shift128ExtraRightJamming(
6326            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6327        zExp = bExp;
6328    }
6329    else {
6330        if ( aExp == 0x7FFF ) {
6331            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6332                return propagateFloat128NaN(a, b, status);
6333            }
6334            return a;
6335        }
6336        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6337        if ( aExp == 0 ) {
6338            if (status->flush_to_zero) {
6339                if (zSig0 | zSig1) {
6340                    float_raise(float_flag_output_denormal, status);
6341                }
6342                return packFloat128(zSign, 0, 0, 0);
6343            }
6344            return packFloat128( zSign, 0, zSig0, zSig1 );
6345        }
6346        zSig2 = 0;
6347        zSig0 |= LIT64( 0x0002000000000000 );
6348        zExp = aExp;
6349        goto shiftRight1;
6350    }
6351    aSig0 |= LIT64( 0x0001000000000000 );
6352    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6353    --zExp;
6354    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6355    ++zExp;
6356 shiftRight1:
6357    shift128ExtraRightJamming(
6358        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6359 roundAndPack:
6360    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6361
6362}
6363
6364/*----------------------------------------------------------------------------
6365| Returns the result of subtracting the absolute values of the quadruple-
6366| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6367| difference is negated before being returned.  `zSign' is ignored if the
6368| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6369| Standard for Binary Floating-Point Arithmetic.
6370*----------------------------------------------------------------------------*/
6371
6372static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6373                                float_status *status)
6374{
6375    int32 aExp, bExp, zExp;
6376    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6377    int32 expDiff;
6378    float128 z;
6379
6380    aSig1 = extractFloat128Frac1( a );
6381    aSig0 = extractFloat128Frac0( a );
6382    aExp = extractFloat128Exp( a );
6383    bSig1 = extractFloat128Frac1( b );
6384    bSig0 = extractFloat128Frac0( b );
6385    bExp = extractFloat128Exp( b );
6386    expDiff = aExp - bExp;
6387    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6388    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6389    if ( 0 < expDiff ) goto aExpBigger;
6390    if ( expDiff < 0 ) goto bExpBigger;
6391    if ( aExp == 0x7FFF ) {
6392        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6393            return propagateFloat128NaN(a, b, status);
6394        }
6395        float_raise(float_flag_invalid, status);
6396        z.low = float128_default_nan_low;
6397        z.high = float128_default_nan_high;
6398        return z;
6399    }
6400    if ( aExp == 0 ) {
6401        aExp = 1;
6402        bExp = 1;
6403    }
6404    if ( bSig0 < aSig0 ) goto aBigger;
6405    if ( aSig0 < bSig0 ) goto bBigger;
6406    if ( bSig1 < aSig1 ) goto aBigger;
6407    if ( aSig1 < bSig1 ) goto bBigger;
6408    return packFloat128(status->float_rounding_mode == float_round_down,
6409                        0, 0, 0);
6410 bExpBigger:
6411    if ( bExp == 0x7FFF ) {
6412        if (bSig0 | bSig1) {
6413            return propagateFloat128NaN(a, b, status);
6414        }
6415        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6416    }
6417    if ( aExp == 0 ) {
6418        ++expDiff;
6419    }
6420    else {
6421        aSig0 |= LIT64( 0x4000000000000000 );
6422    }
6423    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424    bSig0 |= LIT64( 0x4000000000000000 );
6425 bBigger:
6426    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6427    zExp = bExp;
6428    zSign ^= 1;
6429    goto normalizeRoundAndPack;
6430 aExpBigger:
6431    if ( aExp == 0x7FFF ) {
6432        if (aSig0 | aSig1) {
6433            return propagateFloat128NaN(a, b, status);
6434        }
6435        return a;
6436    }
6437    if ( bExp == 0 ) {
6438        --expDiff;
6439    }
6440    else {
6441        bSig0 |= LIT64( 0x4000000000000000 );
6442    }
6443    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444    aSig0 |= LIT64( 0x4000000000000000 );
6445 aBigger:
6446    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6447    zExp = aExp;
6448 normalizeRoundAndPack:
6449    --zExp;
6450    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6451                                         status);
6452
6453}
6454
6455/*----------------------------------------------------------------------------
6456| Returns the result of adding the quadruple-precision floating-point values
6457| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6458| for Binary Floating-Point Arithmetic.
6459*----------------------------------------------------------------------------*/
6460
6461float128 float128_add(float128 a, float128 b, float_status *status)
6462{
6463    flag aSign, bSign;
6464
6465    aSign = extractFloat128Sign( a );
6466    bSign = extractFloat128Sign( b );
6467    if ( aSign == bSign ) {
6468        return addFloat128Sigs(a, b, aSign, status);
6469    }
6470    else {
6471        return subFloat128Sigs(a, b, aSign, status);
6472    }
6473
6474}
6475
6476/*----------------------------------------------------------------------------
6477| Returns the result of subtracting the quadruple-precision floating-point
6478| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6479| Standard for Binary Floating-Point Arithmetic.
6480*----------------------------------------------------------------------------*/
6481
6482float128 float128_sub(float128 a, float128 b, float_status *status)
6483{
6484    flag aSign, bSign;
6485
6486    aSign = extractFloat128Sign( a );
6487    bSign = extractFloat128Sign( b );
6488    if ( aSign == bSign ) {
6489        return subFloat128Sigs(a, b, aSign, status);
6490    }
6491    else {
6492        return addFloat128Sigs(a, b, aSign, status);
6493    }
6494
6495}
6496
6497/*----------------------------------------------------------------------------
6498| Returns the result of multiplying the quadruple-precision floating-point
6499| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6500| Standard for Binary Floating-Point Arithmetic.
6501*----------------------------------------------------------------------------*/
6502
6503float128 float128_mul(float128 a, float128 b, float_status *status)
6504{
6505    flag aSign, bSign, zSign;
6506    int32 aExp, bExp, zExp;
6507    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6508    float128 z;
6509
6510    aSig1 = extractFloat128Frac1( a );
6511    aSig0 = extractFloat128Frac0( a );
6512    aExp = extractFloat128Exp( a );
6513    aSign = extractFloat128Sign( a );
6514    bSig1 = extractFloat128Frac1( b );
6515    bSig0 = extractFloat128Frac0( b );
6516    bExp = extractFloat128Exp( b );
6517    bSign = extractFloat128Sign( b );
6518    zSign = aSign ^ bSign;
6519    if ( aExp == 0x7FFF ) {
6520        if (    ( aSig0 | aSig1 )
6521             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522            return propagateFloat128NaN(a, b, status);
6523        }
6524        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6525        return packFloat128( zSign, 0x7FFF, 0, 0 );
6526    }
6527    if ( bExp == 0x7FFF ) {
6528        if (bSig0 | bSig1) {
6529            return propagateFloat128NaN(a, b, status);
6530        }
6531        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6532 invalid:
6533            float_raise(float_flag_invalid, status);
6534            z.low = float128_default_nan_low;
6535            z.high = float128_default_nan_high;
6536            return z;
6537        }
6538        return packFloat128( zSign, 0x7FFF, 0, 0 );
6539    }
6540    if ( aExp == 0 ) {
6541        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6542        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543    }
6544    if ( bExp == 0 ) {
6545        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6546        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6547    }
6548    zExp = aExp + bExp - 0x4000;
6549    aSig0 |= LIT64( 0x0001000000000000 );
6550    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6551    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6552    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6553    zSig2 |= ( zSig3 != 0 );
6554    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6555        shift128ExtraRightJamming(
6556            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6557        ++zExp;
6558    }
6559    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6560
6561}
6562
6563/*----------------------------------------------------------------------------
6564| Returns the result of dividing the quadruple-precision floating-point value
6565| `a' by the corresponding value `b'.  The operation is performed according to
6566| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6567*----------------------------------------------------------------------------*/
6568
6569float128 float128_div(float128 a, float128 b, float_status *status)
6570{
6571    flag aSign, bSign, zSign;
6572    int32 aExp, bExp, zExp;
6573    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6575    float128 z;
6576
6577    aSig1 = extractFloat128Frac1( a );
6578    aSig0 = extractFloat128Frac0( a );
6579    aExp = extractFloat128Exp( a );
6580    aSign = extractFloat128Sign( a );
6581    bSig1 = extractFloat128Frac1( b );
6582    bSig0 = extractFloat128Frac0( b );
6583    bExp = extractFloat128Exp( b );
6584    bSign = extractFloat128Sign( b );
6585    zSign = aSign ^ bSign;
6586    if ( aExp == 0x7FFF ) {
6587        if (aSig0 | aSig1) {
6588            return propagateFloat128NaN(a, b, status);
6589        }
6590        if ( bExp == 0x7FFF ) {
6591            if (bSig0 | bSig1) {
6592                return propagateFloat128NaN(a, b, status);
6593            }
6594            goto invalid;
6595        }
6596        return packFloat128( zSign, 0x7FFF, 0, 0 );
6597    }
6598    if ( bExp == 0x7FFF ) {
6599        if (bSig0 | bSig1) {
6600            return propagateFloat128NaN(a, b, status);
6601        }
6602        return packFloat128( zSign, 0, 0, 0 );
6603    }
6604    if ( bExp == 0 ) {
6605        if ( ( bSig0 | bSig1 ) == 0 ) {
6606            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607 invalid:
6608                float_raise(float_flag_invalid, status);
6609                z.low = float128_default_nan_low;
6610                z.high = float128_default_nan_high;
6611                return z;
6612            }
6613            float_raise(float_flag_divbyzero, status);
6614            return packFloat128( zSign, 0x7FFF, 0, 0 );
6615        }
6616        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6617    }
6618    if ( aExp == 0 ) {
6619        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6620        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6621    }
6622    zExp = aExp - bExp + 0x3FFD;
6623    shortShift128Left(
6624        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6625    shortShift128Left(
6626        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6627    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6628        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6629        ++zExp;
6630    }
6631    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6632    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6633    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6634    while ( (int64_t) rem0 < 0 ) {
6635        --zSig0;
6636        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6637    }
6638    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6639    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6640        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6641        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6642        while ( (int64_t) rem1 < 0 ) {
6643            --zSig1;
6644            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6645        }
6646        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6647    }
6648    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6649    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6650
6651}
6652
6653/*----------------------------------------------------------------------------
6654| Returns the remainder of the quadruple-precision floating-point value `a'
6655| with respect to the corresponding value `b'.  The operation is performed
6656| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657*----------------------------------------------------------------------------*/
6658
6659float128 float128_rem(float128 a, float128 b, float_status *status)
6660{
6661    flag aSign, zSign;
6662    int32 aExp, bExp, expDiff;
6663    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6664    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6665    int64_t sigMean0;
6666    float128 z;
6667
6668    aSig1 = extractFloat128Frac1( a );
6669    aSig0 = extractFloat128Frac0( a );
6670    aExp = extractFloat128Exp( a );
6671    aSign = extractFloat128Sign( a );
6672    bSig1 = extractFloat128Frac1( b );
6673    bSig0 = extractFloat128Frac0( b );
6674    bExp = extractFloat128Exp( b );
6675    if ( aExp == 0x7FFF ) {
6676        if (    ( aSig0 | aSig1 )
6677             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6678            return propagateFloat128NaN(a, b, status);
6679        }
6680        goto invalid;
6681    }
6682    if ( bExp == 0x7FFF ) {
6683        if (bSig0 | bSig1) {
6684            return propagateFloat128NaN(a, b, status);
6685        }
6686        return a;
6687    }
6688    if ( bExp == 0 ) {
6689        if ( ( bSig0 | bSig1 ) == 0 ) {
6690 invalid:
6691            float_raise(float_flag_invalid, status);
6692            z.low = float128_default_nan_low;
6693            z.high = float128_default_nan_high;
6694            return z;
6695        }
6696        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6697    }
6698    if ( aExp == 0 ) {
6699        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6700        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6701    }
6702    expDiff = aExp - bExp;
6703    if ( expDiff < -1 ) return a;
6704    shortShift128Left(
6705        aSig0 | LIT64( 0x0001000000000000 ),
6706        aSig1,
6707        15 - ( expDiff < 0 ),
6708        &aSig0,
6709        &aSig1
6710    );
6711    shortShift128Left(
6712        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6713    q = le128( bSig0, bSig1, aSig0, aSig1 );
6714    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6715    expDiff -= 64;
6716    while ( 0 < expDiff ) {
6717        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718        q = ( 4 < q ) ? q - 4 : 0;
6719        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6720        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6721        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6722        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6723        expDiff -= 61;
6724    }
6725    if ( -64 < expDiff ) {
6726        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6727        q = ( 4 < q ) ? q - 4 : 0;
6728        q >>= - expDiff;
6729        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6730        expDiff += 52;
6731        if ( expDiff < 0 ) {
6732            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6733        }
6734        else {
6735            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6736        }
6737        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6738        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6739    }
6740    else {
6741        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6742        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6743    }
6744    do {
6745        alternateASig0 = aSig0;
6746        alternateASig1 = aSig1;
6747        ++q;
6748        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6749    } while ( 0 <= (int64_t) aSig0 );
6750    add128(
6751        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6752    if (    ( sigMean0 < 0 )
6753         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6754        aSig0 = alternateASig0;
6755        aSig1 = alternateASig1;
6756    }
6757    zSign = ( (int64_t) aSig0 < 0 );
6758    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6759    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6760                                         status);
6761}
6762
6763/*----------------------------------------------------------------------------
6764| Returns the square root of the quadruple-precision floating-point value `a'.
6765| The operation is performed according to the IEC/IEEE Standard for Binary
6766| Floating-Point Arithmetic.
6767*----------------------------------------------------------------------------*/
6768
6769float128 float128_sqrt(float128 a, float_status *status)
6770{
6771    flag aSign;
6772    int32 aExp, zExp;
6773    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6774    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6775    float128 z;
6776
6777    aSig1 = extractFloat128Frac1( a );
6778    aSig0 = extractFloat128Frac0( a );
6779    aExp = extractFloat128Exp( a );
6780    aSign = extractFloat128Sign( a );
6781    if ( aExp == 0x7FFF ) {
6782        if (aSig0 | aSig1) {
6783            return propagateFloat128NaN(a, a, status);
6784        }
6785        if ( ! aSign ) return a;
6786        goto invalid;
6787    }
6788    if ( aSign ) {
6789        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6790 invalid:
6791        float_raise(float_flag_invalid, status);
6792        z.low = float128_default_nan_low;
6793        z.high = float128_default_nan_high;
6794        return z;
6795    }
6796    if ( aExp == 0 ) {
6797        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6798        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6799    }
6800    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6801    aSig0 |= LIT64( 0x0001000000000000 );
6802    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6803    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6804    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6805    doubleZSig0 = zSig0<<1;
6806    mul64To128( zSig0, zSig0, &term0, &term1 );
6807    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6808    while ( (int64_t) rem0 < 0 ) {
6809        --zSig0;
6810        doubleZSig0 -= 2;
6811        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6812    }
6813    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6814    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6815        if ( zSig1 == 0 ) zSig1 = 1;
6816        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6817        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6818        mul64To128( zSig1, zSig1, &term2, &term3 );
6819        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6820        while ( (int64_t) rem1 < 0 ) {
6821            --zSig1;
6822            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6823            term3 |= 1;
6824            term2 |= doubleZSig0;
6825            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6826        }
6827        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6828    }
6829    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6830    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6831
6832}
6833
6834/*----------------------------------------------------------------------------
6835| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6836| the corresponding value `b', and 0 otherwise.  The invalid exception is
6837| raised if either operand is a NaN.  Otherwise, the comparison is performed
6838| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6839*----------------------------------------------------------------------------*/
6840
6841int float128_eq(float128 a, float128 b, float_status *status)
6842{
6843
6844    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6845              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6847              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6848       ) {
6849        float_raise(float_flag_invalid, status);
6850        return 0;
6851    }
6852    return
6853           ( a.low == b.low )
6854        && (    ( a.high == b.high )
6855             || (    ( a.low == 0 )
6856                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6857           );
6858
6859}
6860
6861/*----------------------------------------------------------------------------
6862| Returns 1 if the quadruple-precision floating-point value `a' is less than
6863| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6864| exception is raised if either operand is a NaN.  The comparison is performed
6865| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6866*----------------------------------------------------------------------------*/
6867
6868int float128_le(float128 a, float128 b, float_status *status)
6869{
6870    flag aSign, bSign;
6871
6872    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6873              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6874         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6875              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6876       ) {
6877        float_raise(float_flag_invalid, status);
6878        return 0;
6879    }
6880    aSign = extractFloat128Sign( a );
6881    bSign = extractFloat128Sign( b );
6882    if ( aSign != bSign ) {
6883        return
6884               aSign
6885            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6886                 == 0 );
6887    }
6888    return
6889          aSign ? le128( b.high, b.low, a.high, a.low )
6890        : le128( a.high, a.low, b.high, b.low );
6891
6892}
6893
6894/*----------------------------------------------------------------------------
6895| Returns 1 if the quadruple-precision floating-point value `a' is less than
6896| the corresponding value `b', and 0 otherwise.  The invalid exception is
6897| raised if either operand is a NaN.  The comparison is performed according
6898| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6899*----------------------------------------------------------------------------*/
6900
6901int float128_lt(float128 a, float128 b, float_status *status)
6902{
6903    flag aSign, bSign;
6904
6905    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6906              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6907         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6908              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6909       ) {
6910        float_raise(float_flag_invalid, status);
6911        return 0;
6912    }
6913    aSign = extractFloat128Sign( a );
6914    bSign = extractFloat128Sign( b );
6915    if ( aSign != bSign ) {
6916        return
6917               aSign
6918            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6919                 != 0 );
6920    }
6921    return
6922          aSign ? lt128( b.high, b.low, a.high, a.low )
6923        : lt128( a.high, a.low, b.high, b.low );
6924
6925}
6926
6927/*----------------------------------------------------------------------------
6928| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6929| be compared, and 0 otherwise.  The invalid exception is raised if either
6930| operand is a NaN. The comparison is performed according to the IEC/IEEE
6931| Standard for Binary Floating-Point Arithmetic.
6932*----------------------------------------------------------------------------*/
6933
6934int float128_unordered(float128 a, float128 b, float_status *status)
6935{
6936    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6937              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6939              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940       ) {
6941        float_raise(float_flag_invalid, status);
6942        return 1;
6943    }
6944    return 0;
6945}
6946
6947/*----------------------------------------------------------------------------
6948| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6949| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6950| exception.  The comparison is performed according to the IEC/IEEE Standard
6951| for Binary Floating-Point Arithmetic.
6952*----------------------------------------------------------------------------*/
6953
6954int float128_eq_quiet(float128 a, float128 b, float_status *status)
6955{
6956
6957    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6958              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6959         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6960              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6961       ) {
6962        if (    float128_is_signaling_nan( a )
6963             || float128_is_signaling_nan( b ) ) {
6964            float_raise(float_flag_invalid, status);
6965        }
6966        return 0;
6967    }
6968    return
6969           ( a.low == b.low )
6970        && (    ( a.high == b.high )
6971             || (    ( a.low == 0 )
6972                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6973           );
6974
6975}
6976
6977/*----------------------------------------------------------------------------
6978| Returns 1 if the quadruple-precision floating-point value `a' is less than
6979| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6980| cause an exception.  Otherwise, the comparison is performed according to the
6981| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6982*----------------------------------------------------------------------------*/
6983
6984int float128_le_quiet(float128 a, float128 b, float_status *status)
6985{
6986    flag aSign, bSign;
6987
6988    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6989              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6990         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6991              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6992       ) {
6993        if (    float128_is_signaling_nan( a )
6994             || float128_is_signaling_nan( b ) ) {
6995            float_raise(float_flag_invalid, status);
6996        }
6997        return 0;
6998    }
6999    aSign = extractFloat128Sign( a );
7000    bSign = extractFloat128Sign( b );
7001    if ( aSign != bSign ) {
7002        return
7003               aSign
7004            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7005                 == 0 );
7006    }
7007    return
7008          aSign ? le128( b.high, b.low, a.high, a.low )
7009        : le128( a.high, a.low, b.high, b.low );
7010
7011}
7012
7013/*----------------------------------------------------------------------------
7014| Returns 1 if the quadruple-precision floating-point value `a' is less than
7015| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7016| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7017| Standard for Binary Floating-Point Arithmetic.
7018*----------------------------------------------------------------------------*/
7019
7020int float128_lt_quiet(float128 a, float128 b, float_status *status)
7021{
7022    flag aSign, bSign;
7023
7024    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7025              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7026         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7027              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7028       ) {
7029        if (    float128_is_signaling_nan( a )
7030             || float128_is_signaling_nan( b ) ) {
7031            float_raise(float_flag_invalid, status);
7032        }
7033        return 0;
7034    }
7035    aSign = extractFloat128Sign( a );
7036    bSign = extractFloat128Sign( b );
7037    if ( aSign != bSign ) {
7038        return
7039               aSign
7040            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7041                 != 0 );
7042    }
7043    return
7044          aSign ? lt128( b.high, b.low, a.high, a.low )
7045        : lt128( a.high, a.low, b.high, b.low );
7046
7047}
7048
7049/*----------------------------------------------------------------------------
7050| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7051| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7052| comparison is performed according to the IEC/IEEE Standard for Binary
7053| Floating-Point Arithmetic.
7054*----------------------------------------------------------------------------*/
7055
7056int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7057{
7058    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7059              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060         || (    ( extractFloat128Exp( b ) == 0x7FFF )
7061              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062       ) {
7063        if (    float128_is_signaling_nan( a )
7064             || float128_is_signaling_nan( b ) ) {
7065            float_raise(float_flag_invalid, status);
7066        }
7067        return 1;
7068    }
7069    return 0;
7070}
7071
7072/* misc functions */
7073float32 uint32_to_float32(uint32_t a, float_status *status)
7074{
7075    return int64_to_float32(a, status);
7076}
7077
7078float64 uint32_to_float64(uint32_t a, float_status *status)
7079{
7080    return int64_to_float64(a, status);
7081}
7082
7083uint32 float32_to_uint32(float32 a, float_status *status)
7084{
7085    int64_t v;
7086    uint32 res;
7087    int old_exc_flags = get_float_exception_flags(status);
7088
7089    v = float32_to_int64(a, status);
7090    if (v < 0) {
7091        res = 0;
7092    } else if (v > 0xffffffff) {
7093        res = 0xffffffff;
7094    } else {
7095        return v;
7096    }
7097    set_float_exception_flags(old_exc_flags, status);
7098    float_raise(float_flag_invalid, status);
7099    return res;
7100}
7101
7102uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
7103{
7104    int64_t v;
7105    uint32 res;
7106    int old_exc_flags = get_float_exception_flags(status);
7107
7108    v = float32_to_int64_round_to_zero(a, status);
7109    if (v < 0) {
7110        res = 0;
7111    } else if (v > 0xffffffff) {
7112        res = 0xffffffff;
7113    } else {
7114        return v;
7115    }
7116    set_float_exception_flags(old_exc_flags, status);
7117    float_raise(float_flag_invalid, status);
7118    return res;
7119}
7120
7121int_fast16_t float32_to_int16(float32 a, float_status *status)
7122{
7123    int32_t v;
7124    int_fast16_t res;
7125    int old_exc_flags = get_float_exception_flags(status);
7126
7127    v = float32_to_int32(a, status);
7128    if (v < -0x8000) {
7129        res = -0x8000;
7130    } else if (v > 0x7fff) {
7131        res = 0x7fff;
7132    } else {
7133        return v;
7134    }
7135
7136    set_float_exception_flags(old_exc_flags, status);
7137    float_raise(float_flag_invalid, status);
7138    return res;
7139}
7140
7141uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7142{
7143    int32_t v;
7144    uint_fast16_t res;
7145    int old_exc_flags = get_float_exception_flags(status);
7146
7147    v = float32_to_int32(a, status);
7148    if (v < 0) {
7149        res = 0;
7150    } else if (v > 0xffff) {
7151        res = 0xffff;
7152    } else {
7153        return v;
7154    }
7155
7156    set_float_exception_flags(old_exc_flags, status);
7157    float_raise(float_flag_invalid, status);
7158    return res;
7159}
7160
7161uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7162{
7163    int64_t v;
7164    uint_fast16_t res;
7165    int old_exc_flags = get_float_exception_flags(status);
7166
7167    v = float32_to_int64_round_to_zero(a, status);
7168    if (v < 0) {
7169        res = 0;
7170    } else if (v > 0xffff) {
7171        res = 0xffff;
7172    } else {
7173        return v;
7174    }
7175    set_float_exception_flags(old_exc_flags, status);
7176    float_raise(float_flag_invalid, status);
7177    return res;
7178}
7179
7180uint32 float64_to_uint32(float64 a, float_status *status)
7181{
7182    uint64_t v;
7183    uint32 res;
7184    int old_exc_flags = get_float_exception_flags(status);
7185
7186    v = float64_to_uint64(a, status);
7187    if (v > 0xffffffff) {
7188        res = 0xffffffff;
7189    } else {
7190        return v;
7191    }
7192    set_float_exception_flags(old_exc_flags, status);
7193    float_raise(float_flag_invalid, status);
7194    return res;
7195}
7196
7197uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
7198{
7199    uint64_t v;
7200    uint32 res;
7201    int old_exc_flags = get_float_exception_flags(status);
7202
7203    v = float64_to_uint64_round_to_zero(a, status);
7204    if (v > 0xffffffff) {
7205        res = 0xffffffff;
7206    } else {
7207        return v;
7208    }
7209    set_float_exception_flags(old_exc_flags, status);
7210    float_raise(float_flag_invalid, status);
7211    return res;
7212}
7213
7214int_fast16_t float64_to_int16(float64 a, float_status *status)
7215{
7216    int64_t v;
7217    int_fast16_t res;
7218    int old_exc_flags = get_float_exception_flags(status);
7219
7220    v = float64_to_int32(a, status);
7221    if (v < -0x8000) {
7222        res = -0x8000;
7223    } else if (v > 0x7fff) {
7224        res = 0x7fff;
7225    } else {
7226        return v;
7227    }
7228
7229    set_float_exception_flags(old_exc_flags, status);
7230    float_raise(float_flag_invalid, status);
7231    return res;
7232}
7233
7234uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7235{
7236    int64_t v;
7237    uint_fast16_t res;
7238    int old_exc_flags = get_float_exception_flags(status);
7239
7240    v = float64_to_int32(a, status);
7241    if (v < 0) {
7242        res = 0;
7243    } else if (v > 0xffff) {
7244        res = 0xffff;
7245    } else {
7246        return v;
7247    }
7248
7249    set_float_exception_flags(old_exc_flags, status);
7250    float_raise(float_flag_invalid, status);
7251    return res;
7252}
7253
7254uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7255{
7256    int64_t v;
7257    uint_fast16_t res;
7258    int old_exc_flags = get_float_exception_flags(status);
7259
7260    v = float64_to_int64_round_to_zero(a, status);
7261    if (v < 0) {
7262        res = 0;
7263    } else if (v > 0xffff) {
7264        res = 0xffff;
7265    } else {
7266        return v;
7267    }
7268    set_float_exception_flags(old_exc_flags, status);
7269    float_raise(float_flag_invalid, status);
7270    return res;
7271}
7272
7273/*----------------------------------------------------------------------------
7274| Returns the result of converting the double-precision floating-point value
7275| `a' to the 64-bit unsigned integer format.  The conversion is
7276| performed according to the IEC/IEEE Standard for Binary Floating-Point
7277| Arithmetic---which means in particular that the conversion is rounded
7278| according to the current rounding mode.  If `a' is a NaN, the largest
7279| positive integer is returned.  If the conversion overflows, the
7280| largest unsigned integer is returned.  If 'a' is negative, the value is
7281| rounded and zero is returned; negative values that do not round to zero
7282| will raise the inexact exception.
7283*----------------------------------------------------------------------------*/
7284
7285uint64_t float64_to_uint64(float64 a, float_status *status)
7286{
7287    flag aSign;
7288    int_fast16_t aExp, shiftCount;
7289    uint64_t aSig, aSigExtra;
7290    a = float64_squash_input_denormal(a, status);
7291
7292    aSig = extractFloat64Frac(a);
7293    aExp = extractFloat64Exp(a);
7294    aSign = extractFloat64Sign(a);
7295    if (aSign && (aExp > 1022)) {
7296        float_raise(float_flag_invalid, status);
7297        if (float64_is_any_nan(a)) {
7298            return LIT64(0xFFFFFFFFFFFFFFFF);
7299        } else {
7300            return 0;
7301        }
7302    }
7303    if (aExp) {
7304        aSig |= LIT64(0x0010000000000000);
7305    }
7306    shiftCount = 0x433 - aExp;
7307    if (shiftCount <= 0) {
7308        if (0x43E < aExp) {
7309            float_raise(float_flag_invalid, status);
7310            return LIT64(0xFFFFFFFFFFFFFFFF);
7311        }
7312        aSigExtra = 0;
7313        aSig <<= -shiftCount;
7314    } else {
7315        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7316    }
7317    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7318}
7319
7320uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7321{
7322    signed char current_rounding_mode = status->float_rounding_mode;
7323    set_float_rounding_mode(float_round_to_zero, status);
7324    int64_t v = float64_to_uint64(a, status);
7325    set_float_rounding_mode(current_rounding_mode, status);
7326    return v;
7327}
7328
7329#define COMPARE(s, nan_exp)                                                  \
7330static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7331                                      int is_quiet, float_status *status)    \
7332{                                                                            \
7333    flag aSign, bSign;                                                       \
7334    uint ## s ## _t av, bv;                                                  \
7335    a = float ## s ## _squash_input_denormal(a, status);                     \
7336    b = float ## s ## _squash_input_denormal(b, status);                     \
7337                                                                             \
7338    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7339         extractFloat ## s ## Frac( a ) ) ||                                 \
7340        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7341          extractFloat ## s ## Frac( b ) )) {                                \
7342        if (!is_quiet ||                                                     \
7343            float ## s ## _is_signaling_nan( a ) ||                          \
7344            float ## s ## _is_signaling_nan( b ) ) {                         \
7345            float_raise(float_flag_invalid, status);                         \
7346        }                                                                    \
7347        return float_relation_unordered;                                     \
7348    }                                                                        \
7349    aSign = extractFloat ## s ## Sign( a );                                  \
7350    bSign = extractFloat ## s ## Sign( b );                                  \
7351    av = float ## s ## _val(a);                                              \
7352    bv = float ## s ## _val(b);                                              \
7353    if ( aSign != bSign ) {                                                  \
7354        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7355            /* zero case */                                                  \
7356            return float_relation_equal;                                     \
7357        } else {                                                             \
7358            return 1 - (2 * aSign);                                          \
7359        }                                                                    \
7360    } else {                                                                 \
7361        if (av == bv) {                                                      \
7362            return float_relation_equal;                                     \
7363        } else {                                                             \
7364            return 1 - 2 * (aSign ^ ( av < bv ));                            \
7365        }                                                                    \
7366    }                                                                        \
7367}                                                                            \
7368                                                                             \
7369int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7370{                                                                            \
7371    return float ## s ## _compare_internal(a, b, 0, status);                 \
7372}                                                                            \
7373                                                                             \
7374int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7375                                 float_status *status)                       \
7376{                                                                            \
7377    return float ## s ## _compare_internal(a, b, 1, status);                 \
7378}
7379
7380COMPARE(32, 0xff)
7381COMPARE(64, 0x7ff)
7382
7383static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7384                                            int is_quiet, float_status *status)
7385{
7386    flag aSign, bSign;
7387
7388    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7389          ( extractFloatx80Frac( a )<<1 ) ) ||
7390        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7391          ( extractFloatx80Frac( b )<<1 ) )) {
7392        if (!is_quiet ||
7393            floatx80_is_signaling_nan( a ) ||
7394            floatx80_is_signaling_nan( b ) ) {
7395            float_raise(float_flag_invalid, status);
7396        }
7397        return float_relation_unordered;
7398    }
7399    aSign = extractFloatx80Sign( a );
7400    bSign = extractFloatx80Sign( b );
7401    if ( aSign != bSign ) {
7402
7403        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7404             ( ( a.low | b.low ) == 0 ) ) {
7405            /* zero case */
7406            return float_relation_equal;
7407        } else {
7408            return 1 - (2 * aSign);
7409        }
7410    } else {
7411        if (a.low == b.low && a.high == b.high) {
7412            return float_relation_equal;
7413        } else {
7414            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7415        }
7416    }
7417}
7418
7419int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7420{
7421    return floatx80_compare_internal(a, b, 0, status);
7422}
7423
7424int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7425{
7426    return floatx80_compare_internal(a, b, 1, status);
7427}
7428
7429static inline int float128_compare_internal(float128 a, float128 b,
7430                                            int is_quiet, float_status *status)
7431{
7432    flag aSign, bSign;
7433
7434    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7435          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7436        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7437          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7438        if (!is_quiet ||
7439            float128_is_signaling_nan( a ) ||
7440            float128_is_signaling_nan( b ) ) {
7441            float_raise(float_flag_invalid, status);
7442        }
7443        return float_relation_unordered;
7444    }
7445    aSign = extractFloat128Sign( a );
7446    bSign = extractFloat128Sign( b );
7447    if ( aSign != bSign ) {
7448        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7449            /* zero case */
7450            return float_relation_equal;
7451        } else {
7452            return 1 - (2 * aSign);
7453        }
7454    } else {
7455        if (a.low == b.low && a.high == b.high) {
7456            return float_relation_equal;
7457        } else {
7458            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7459        }
7460    }
7461}
7462
7463int float128_compare(float128 a, float128 b, float_status *status)
7464{
7465    return float128_compare_internal(a, b, 0, status);
7466}
7467
7468int float128_compare_quiet(float128 a, float128 b, float_status *status)
7469{
7470    return float128_compare_internal(a, b, 1, status);
7471}
7472
7473/* min() and max() functions. These can't be implemented as
7474 * 'compare and pick one input' because that would mishandle
7475 * NaNs and +0 vs -0.
7476 *
7477 * minnum() and maxnum() functions. These are similar to the min()
7478 * and max() functions but if one of the arguments is a QNaN and
7479 * the other is numerical then the numerical argument is returned.
7480 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7481 * and maxNum() operations. min() and max() are the typical min/max
7482 * semantics provided by many CPUs which predate that specification.
7483 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
7486 */
7487#define MINMAX(s)                                                       \
7488static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7489                                               int ismin, int isieee,   \
7490                                               int ismag,               \
7491                                               float_status *status)    \
7492{                                                                       \
7493    flag aSign, bSign;                                                  \
7494    uint ## s ## _t av, bv, aav, abv;                                   \
7495    a = float ## s ## _squash_input_denormal(a, status);                \
7496    b = float ## s ## _squash_input_denormal(b, status);                \
7497    if (float ## s ## _is_any_nan(a) ||                                 \
7498        float ## s ## _is_any_nan(b)) {                                 \
7499        if (isieee) {                                                   \
7500            if (float ## s ## _is_quiet_nan(a) &&                       \
7501                !float ## s ##_is_any_nan(b)) {                         \
7502                return b;                                               \
7503            } else if (float ## s ## _is_quiet_nan(b) &&                \
7504                       !float ## s ## _is_any_nan(a)) {                 \
7505                return a;                                               \
7506            }                                                           \
7507        }                                                               \
7508        return propagateFloat ## s ## NaN(a, b, status);                \
7509    }                                                                   \
7510    aSign = extractFloat ## s ## Sign(a);                               \
7511    bSign = extractFloat ## s ## Sign(b);                               \
7512    av = float ## s ## _val(a);                                         \
7513    bv = float ## s ## _val(b);                                         \
7514    if (ismag) {                                                        \
7515        aav = float ## s ## _abs(av);                                   \
7516        abv = float ## s ## _abs(bv);                                   \
7517        if (aav != abv) {                                               \
7518            if (ismin) {                                                \
7519                return (aav < abv) ? a : b;                             \
7520            } else {                                                    \
7521                return (aav < abv) ? b : a;                             \
7522            }                                                           \
7523        }                                                               \
7524    }                                                                   \
7525    if (aSign != bSign) {                                               \
7526        if (ismin) {                                                    \
7527            return aSign ? a : b;                                       \
7528        } else {                                                        \
7529            return aSign ? b : a;                                       \
7530        }                                                               \
7531    } else {                                                            \
7532        if (ismin) {                                                    \
7533            return (aSign ^ (av < bv)) ? a : b;                         \
7534        } else {                                                        \
7535            return (aSign ^ (av < bv)) ? b : a;                         \
7536        }                                                               \
7537    }                                                                   \
7538}                                                                       \
7539                                                                        \
7540float ## s float ## s ## _min(float ## s a, float ## s b,               \
7541                              float_status *status)                     \
7542{                                                                       \
7543    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7544}                                                                       \
7545                                                                        \
7546float ## s float ## s ## _max(float ## s a, float ## s b,               \
7547                              float_status *status)                     \
7548{                                                                       \
7549    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7550}                                                                       \
7551                                                                        \
7552float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7553                                 float_status *status)                  \
7554{                                                                       \
7555    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7556}                                                                       \
7557                                                                        \
7558float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7559                                 float_status *status)                  \
7560{                                                                       \
7561    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7562}                                                                       \
7563                                                                        \
7564float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7565                                    float_status *status)               \
7566{                                                                       \
7567    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7568}                                                                       \
7569                                                                        \
7570float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7571                                    float_status *status)               \
7572{                                                                       \
7573    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7574}
7575
7576MINMAX(32)
7577MINMAX(64)
7578
7579
7580/* Multiply A by 2 raised to the power N.  */
7581float32 float32_scalbn(float32 a, int n, float_status *status)
7582{
7583    flag aSign;
7584    int16_t aExp;
7585    uint32_t aSig;
7586
7587    a = float32_squash_input_denormal(a, status);
7588    aSig = extractFloat32Frac( a );
7589    aExp = extractFloat32Exp( a );
7590    aSign = extractFloat32Sign( a );
7591
7592    if ( aExp == 0xFF ) {
7593        if ( aSig ) {
7594            return propagateFloat32NaN(a, a, status);
7595        }
7596        return a;
7597    }
7598    if (aExp != 0) {
7599        aSig |= 0x00800000;
7600    } else if (aSig == 0) {
7601        return a;
7602    } else {
7603        aExp++;
7604    }
7605
7606    if (n > 0x200) {
7607        n = 0x200;
7608    } else if (n < -0x200) {
7609        n = -0x200;
7610    }
7611
7612    aExp += n - 1;
7613    aSig <<= 7;
7614    return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7615}
7616
7617float64 float64_scalbn(float64 a, int n, float_status *status)
7618{
7619    flag aSign;
7620    int16_t aExp;
7621    uint64_t aSig;
7622
7623    a = float64_squash_input_denormal(a, status);
7624    aSig = extractFloat64Frac( a );
7625    aExp = extractFloat64Exp( a );
7626    aSign = extractFloat64Sign( a );
7627
7628    if ( aExp == 0x7FF ) {
7629        if ( aSig ) {
7630            return propagateFloat64NaN(a, a, status);
7631        }
7632        return a;
7633    }
7634    if (aExp != 0) {
7635        aSig |= LIT64( 0x0010000000000000 );
7636    } else if (aSig == 0) {
7637        return a;
7638    } else {
7639        aExp++;
7640    }
7641
7642    if (n > 0x1000) {
7643        n = 0x1000;
7644    } else if (n < -0x1000) {
7645        n = -0x1000;
7646    }
7647
7648    aExp += n - 1;
7649    aSig <<= 10;
7650    return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7651}
7652
7653floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7654{
7655    flag aSign;
7656    int32_t aExp;
7657    uint64_t aSig;
7658
7659    aSig = extractFloatx80Frac( a );
7660    aExp = extractFloatx80Exp( a );
7661    aSign = extractFloatx80Sign( a );
7662
7663    if ( aExp == 0x7FFF ) {
7664        if ( aSig<<1 ) {
7665            return propagateFloatx80NaN(a, a, status);
7666        }
7667        return a;
7668    }
7669
7670    if (aExp == 0) {
7671        if (aSig == 0) {
7672            return a;
7673        }
7674        aExp++;
7675    }
7676
7677    if (n > 0x10000) {
7678        n = 0x10000;
7679    } else if (n < -0x10000) {
7680        n = -0x10000;
7681    }
7682
7683    aExp += n;
7684    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7685                                         aSign, aExp, aSig, 0, status);
7686}
7687
7688float128 float128_scalbn(float128 a, int n, float_status *status)
7689{
7690    flag aSign;
7691    int32_t aExp;
7692    uint64_t aSig0, aSig1;
7693
7694    aSig1 = extractFloat128Frac1( a );
7695    aSig0 = extractFloat128Frac0( a );
7696    aExp = extractFloat128Exp( a );
7697    aSign = extractFloat128Sign( a );
7698    if ( aExp == 0x7FFF ) {
7699        if ( aSig0 | aSig1 ) {
7700            return propagateFloat128NaN(a, a, status);
7701        }
7702        return a;
7703    }
7704    if (aExp != 0) {
7705        aSig0 |= LIT64( 0x0001000000000000 );
7706    } else if (aSig0 == 0 && aSig1 == 0) {
7707        return a;
7708    } else {
7709        aExp++;
7710    }
7711
7712    if (n > 0x10000) {
7713        n = 0x10000;
7714    } else if (n < -0x10000) {
7715        n = -0x10000;
7716    }
7717
7718    aExp += n - 1;
7719    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7720                                         , status);
7721
7722}
7723